@@ -6,9 +6,10 @@
 * See the file version.doxy for copying conditions.
 */

-/*! \page advancedExamples Advanced Examples
+/*! \page AdvancedExamples Advanced Examples
+
+\section UsingMultipleImplementationsOfACodelet Using Multiple Implementations Of A Codelet

-\section Using_multiple_implementations_of_a_codelet Using multiple implementations of a codelet
One may want to write multiple implementations of a codelet for a single type of
device and let StarPU choose which one to run. As an example, we will show how
to use SSE to scale a vector. The codelet can be written as follows:
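(The codelet body itself is elided from this hunk.) For reference, the core of such an SSE scaling loop might look like the following standalone sketch — a hypothetical illustration using unaligned loads, not the actual codelet from the StarPU sources, which receives its arguments through <c>buffers[]</c>:

```c
#include <xmmintrin.h> /* SSE intrinsics (x86) */

/* Scale n floats by `factor`, four at a time with SSE.
 * Hypothetical sketch: assumes n is a multiple of 4; a real
 * kernel would also handle the remainder elements. */
void vector_scal_sse(float *vector, unsigned n, float factor)
{
    __m128 f = _mm_set1_ps(factor);
    unsigned i;
    for (i = 0; i < n; i += 4) {
        __m128 v = _mm_loadu_ps(vector + i);
        _mm_storeu_ps(vector + i, _mm_mul_ps(v, f));
    }
}
```

A CPU fallback with a plain scalar loop would typically be provided alongside it as a second implementation of the same codelet.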
@@ -47,7 +48,7 @@ Schedulers which are multi-implementation aware (only <c>dmda</c> and
<c>pheft</c> for now) will use the performance models of all the
implementations it was given, and pick the one that seems to be the fastest.

-\section Enabling_implementation_according_to_capabilities Enabling implementation according to capabilities
+\section EnablingImplementationAccordingToCapabilities Enabling Implementation According To Capabilities

Some implementations may not run on some devices. For instance, some CUDA
devices do not support double floating point precision, and thus the kernel
@@ -128,7 +129,7 @@ struct starpu_codelet cl = {
Note: the most generic variant should be provided first, as some schedulers are
not able to try the different variants.

-\section Task_and_Worker_Profiling Task and Worker Profiling
+\section TaskAndWorkerProfiling Task And Worker Profiling

A full example showing how to use the profiling API is available in
the StarPU sources in the directory <c>examples/profiling/</c>.
@@ -188,7 +189,7 @@ for (worker = 0; worker < starpu_worker_get_count(); worker++)
}
\endcode

-\section Partitioning_Data Partitioning Data
+\section PartitioningData Partitioning Data

An existing piece of data can be partitioned into sub-parts to be used by different tasks, for instance:
@@ -265,7 +266,7 @@ StarPU provides various interfaces and filters for matrices, vectors, etc.,
but applications can also write their own data interfaces and filters, see
<c>examples/interface</c> and <c>examples/filters/custom_mf</c> for an example.
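A vector block filter essentially computes which index range each sub-part covers; as a standalone illustration of that chunking arithmetic (a hypothetical helper, not the actual starpu_data_filter API):

```c
/* Compute the [begin, end) range of chunk `id` when splitting
 * `n` elements into `nparts` nearly equal parts, as a vector
 * block filter conceptually does. Illustrative sketch only. */
void chunk_range(unsigned n, unsigned nparts, unsigned id,
                 unsigned *begin, unsigned *end)
{
    unsigned base = n / nparts, rem = n % nparts;
    *begin = id * base + (id < rem ? id : rem);
    *end = *begin + base + (id < rem ? 1 : 0);
}
```

For example, 10 elements split into 3 parts yields ranges of 4, 3, and 3 elements, which is how uneven sizes get spread across the first chunks.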

-\section Performance_model_example Performance model example
+\section PerformanceModelExample Performance Model Example

To achieve good scheduling, StarPU scheduling policies need to be able to
estimate in advance the duration of a task. This is done by giving to codelets
@@ -291,7 +292,7 @@ and output sizes as an index.
It will also save it in <c>$STARPU_HOME/.starpu/sampling/codelets</c>
for further executions, and can be observed by using the tool
<c>starpu_perfmodel_display</c>, or drawn by using
-the tool <c>starpu_perfmodel_plot</c> (\ref Performance_model_calibration). The
+the tool <c>starpu_perfmodel_plot</c> (\ref PerformanceModelCalibration). The
models are indexed by machine name. To
share the models between machines (e.g. for a homogeneous cluster), use
<c>export STARPU_HOSTNAME=some_global_name</c>. Measurements are only done
@@ -326,8 +327,8 @@ struct starpu_codelet cl = {
</li>
<li>
Measured at runtime and refined by regression (model types
-::STARPU_REGRESSION_BASED and ::STARPU_NL_REGRESSION_BASED)
-model type). This still assumes performance regularity, but works
+::STARPU_REGRESSION_BASED and ::STARPU_NL_REGRESSION_BASED). This
+still assumes performance regularity, but works
with various data input sizes, by applying regression over observed
execution times. ::STARPU_REGRESSION_BASED uses an a*n^b regression
form, ::STARPU_NL_REGRESSION_BASED uses an a*n^b+c (more precise than
@@ -341,19 +342,19 @@ Of course, the application has to issue
tasks with varying size so that the regression can be computed. StarPU will not
trust the regression unless there is at least 10% difference between the minimum
and maximum observed input size. It can be useful to set the
-<c>STARPU_CALIBRATE</c> environment variable to <c>1</c> and run the application
-on varying input sizes with <c>STARPU_SCHED</c> set to <c>eager</c> scheduler,
+environment variable \ref STARPU_CALIBRATE to <c>1</c> and run the application
+on varying input sizes with \ref STARPU_SCHED set to the <c>eager</c> scheduler,
so as to feed the performance model for a variety of
inputs. The application can also provide the measurements explicitly by
using the function starpu_perfmodel_update_history(). The tools
<c>starpu_perfmodel_display</c> and <c>starpu_perfmodel_plot</c> can
be used to observe how well the performance model is calibrated (\ref
-Performance_model_calibration); when their output look good,
-<c>STARPU_CALIBRATE</c> can be reset to <c>0</c> to let
+PerformanceModelCalibration); when their output looks good,
+\ref STARPU_CALIBRATE can be reset to <c>0</c> to let
StarPU use the resulting performance model without recording new measures, and
-<c>STARPU_SCHED</c> can be set to <c>dmda</c> to benefit from the performance models. If
+\ref STARPU_SCHED can be set to <c>dmda</c> to benefit from the performance models. If
the data input sizes vary a lot, it is really important to set
-<c>STARPU_CALIBRATE</c> to <c>0</c>, otherwise StarPU will continue adding the
+\ref STARPU_CALIBRATE to <c>0</c>, otherwise StarPU will continue adding the
measures, and result in a very big performance model, which will take a
lot of time to load and save.
@@ -390,7 +391,7 @@ there is some hidden parameter such as the number of iterations, etc. The
base.

How to use schedulers which can benefit from such performance model is explained
-in \ref Task_scheduling_policy.
+in \ref TaskSchedulingPolicy.

The same can be done for task power consumption estimation, by setting
the field starpu_codelet::power_model the same way as the field
@@ -410,7 +411,7 @@ used to get the footprint used for indexing history-based performance
models. starpu_task_destroy() needs to be called to destroy the dummy
task afterwards. See <c>tests/perfmodels/regression_based.c</c> for an example.

-\section Theoretical_lower_bound_on_execution_time_example Theoretical lower bound on execution time
+\section TheoreticalLowerBoundOnExecutionTimeExample Theoretical Lower Bound On Execution Time Example

For kernels with history-based performance models (and provided that
they are completely calibrated), StarPU can very easily provide a
@@ -459,7 +460,7 @@ the priorities as the StarPU scheduler would, i.e. schedule prioritized
tasks before less prioritized tasks, to check to what extent this results
in a less optimal solution. This increases even more computation time.
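For intuition only (this is not the linear-programming bound StarPU computes), the crudest lower bound simply divides the total measured work by the number of workers, ignoring dependencies and heterogeneity:

```c
/* Weakest possible lower bound on makespan: total task time
 * divided by the worker count. Ignores dependencies, so the
 * real optimum (and StarPU's LP bound) can only be higher.
 * Hypothetical helper, for intuition only. */
double trivial_lower_bound(const double *durations, int ntasks,
                           int nworkers)
{
    double total = 0.0;
    int i;
    for (i = 0; i < ntasks; i++)
        total += durations[i];
    return total / nworkers;
}
```

The value of the LP formulation is precisely that it tightens this naive bound by accounting for dependencies and, optionally, priorities.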
|
|
|
|
|
|
-\section Insert_Task_Utility Insert Task Utility
|
|
|
+\section InsertTaskUtility Insert Task Utility
|
|
|
|
|
|
StarPU provides the wrapper function starpu_insert_task() to ease
|
|
|
the creation and submission of tasks.
|
|
@@ -529,7 +530,7 @@ starpu_insert_task(&mycodelet,
|
|
|
If some part of the task insertion depends on the value of some computation,
|
|
|
the macro ::STARPU_DATA_ACQUIRE_CB can be very convenient. For
|
|
|
instance, assuming that the index variable <c>i</c> was registered as handle
|
|
|
-<c>i_handle</c>:
|
|
|
+<c>A_handle[i]</c>:
|
|
|
|
|
|
\code{.c}
|
|
|
/* Compute which portion we will work on, e.g. pivot */
|
|
@@ -549,7 +550,7 @@ be executed, and is allowed to read from <c>i</c> to use it e.g. as an
|
|
|
index. Note that this macro is only avaible when compiling StarPU with
|
|
|
the compiler <c>gcc</c>.
|
|
|
|
|
|
-\section Data_reduction Data reduction
|
|
|
+\section DataReduction Data Reduction
|
|
|
|
|
|
In various cases, some piece of data is used to accumulate intermediate
|
|
|
results. For instances, the dot product of a vector, maximum/minimum finding,
|
|
@@ -655,13 +656,13 @@ for (i = 0; i < 100; i++) {
|
|
|
}
|
|
|
\endcode
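The reduction pattern behind this pairs a neutral-element initializer with an associative combination step; a standalone sketch of that idea for a dot product (plain sequential C, not the StarPU codelet interface):

```c
/* Neutral-element initializer for a partial accumulator. */
static void dot_init(double *acc) { *acc = 0.0; }

/* Combine (reduce) a partial accumulator into the result. */
static void dot_redux(double *dst, const double *src) { *dst += *src; }

/* Each chunk contributes into its own freshly initialized
 * accumulator; the partials are then reduced into one result.
 * With StarPU, the chunk loops would be independent tasks. */
double dot(const double *x, const double *y, int n, int nchunks)
{
    double result, partial;
    int c, i;
    dot_init(&result);
    for (c = 0; c < nchunks; c++) {
        dot_init(&partial);
        for (i = c * n / nchunks; i < (c + 1) * n / nchunks; i++)
            partial += x[i] * y[i];
        dot_redux(&result, &partial);
    }
    return result;
}
```

Because the combination step is associative, the partial contributions can be reduced in any order, which is what lets StarPU run them concurrently.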

-\section Temporary_buffers Temporary buffers
+\section TemporaryBuffers Temporary Buffers

There are two kinds of temporary buffers: temporary data which just pass results
from one task to another, and scratch data which are needed only internally by
tasks.

-\subsection Temporary_data Temporary data
+\subsection TemporaryData Temporary Data

Data can sometimes be entirely produced by a task, and entirely consumed by
another task, without the need for other parts of the application to access
@@ -688,15 +689,15 @@ starpu_insert_task(&summarize_data, STARPU_R, handle, STARPU_W, result_handle, 0
starpu_data_unregister_submit(handle);
\endcode

-\subsection Scratch_data Scratch data
+\subsection ScratchData Scratch Data

Some kernels sometimes need temporary data to achieve the computations, i.e. a
workspace. The application could allocate it at the start of the codelet
function, and free it at the end, but that would be costly. It could also
allocate one buffer per worker (similarly to \ref
-Per-worker_library_initialization), but that would make them
-systematic and permanent. A more optimized way is to use the
-::STARPU_SCRATCH data access mode, as examplified below,
+HowToInitializeAComputationLibraryOnceForEachWorker), but that would
+make them systematic and permanent. A more optimized way is to use
+the ::STARPU_SCRATCH data access mode, as exemplified below,

which provides per-worker buffers without content consistency.
@@ -717,7 +718,7 @@ not matter.

The <c>examples/pi</c> example uses scratches for some temporary buffer.
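Conceptually, ::STARPU_SCRATCH gives each worker its own lazily allocated workspace, with no consistency maintained between workers; a hypothetical helper sketching that behaviour (not a StarPU API):

```c
#include <stdlib.h>

/* Sketch of what STARPU_SCRATCH provides conceptually: one
 * workspace per worker, allocated on first use and reused by
 * that worker's later tasks; contents are never synchronized
 * between workers. Hypothetical helper, illustration only. */
#define MAX_WORKERS 64
static void *scratch[MAX_WORKERS];

void *get_scratch(int workerid, size_t size)
{
    if (!scratch[workerid])
        scratch[workerid] = malloc(size);
    return scratch[workerid];
}
```

The point of letting StarPU manage this instead is that the buffer is only materialized on devices that actually run the tasks, and can be evicted when memory gets tight.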

-\section Parallel_Tasks Parallel Tasks
+\section ParallelTasks Parallel Tasks

StarPU can leverage existing parallel computation libraries by means of
parallel tasks. A parallel task is a task which gets worked on by a set of CPUs
@@ -731,7 +732,7 @@ otherwise StarPU will not know how to better group cores.
Two modes of execution exist to accommodate existing usages.

-\subsection Fork-mode_parallel_tasks Fork-mode Parallel Tasks
+\subsection Fork-modeParallelTasks Fork-mode Parallel Tasks

In the Fork mode, StarPU will call the codelet function on one
of the CPUs of the combined worker. The codelet function can use
@@ -751,7 +752,7 @@ For instance, using OpenMP (full source is available in
Other examples include for instance calling a BLAS parallel CPU implementation
(see <c>examples/mult/xgemm.c</c>).

-\subsection SPMD-mode_parallel_tasks SPMD-mode parallel tasks
+\subsection SPMD-modeParallelTasks SPMD-mode Parallel Tasks

In the SPMD mode, StarPU will call the codelet function on
each CPU of the combined worker. The codelet function can use
@@ -795,32 +796,34 @@ when the computation to be done is so that threads have to e.g. exchange
intermediate results, or write to the data in a complex but safe way in the same
buffer.

-\subsection Parallel_tasks_performance Parallel tasks performance
+\subsection ParallelTasksPerformance Parallel Tasks Performance

To benefit from parallel tasks, a parallel-task-aware StarPU scheduler has to
-be used. When exposed to codelets with a Fork or SPMD flag, the <c>pheft</c>
-(parallel-heft) and <c>peager</c> (parallel eager) schedulers will indeed also
-try to execute tasks with several CPUs. It will automatically try the various
-available combined worker sizes (making several measurements for each
-worker size) and thus be able to avoid choosing a large combined
-worker if the codelet does not actually scale so much.
+be used. When exposed to codelets with a flag ::STARPU_FORKJOIN or
+::STARPU_SPMD, the <c>pheft</c> (parallel-heft) and <c>peager</c>
+(parallel eager) schedulers will indeed also try to execute tasks with
+several CPUs. They will automatically try the various available combined
+worker sizes (making several measurements for each worker size) and
+thus be able to avoid choosing a large combined worker if the codelet
+does not actually scale so much.

-\subsection Combined_workers Combined workers
+\subsection CombinedWorkers Combined Workers

By default, StarPU creates combined workers according to the architecture
-structure as detected by hwloc. It means that for each object of the hwloc
+structure as detected by <c>hwloc</c>. It means that for each object of the <c>hwloc</c>
topology (NUMA node, socket, cache, ...) a combined worker will be created. If
some nodes of the hierarchy have a big arity (e.g. many cores in a socket
without a hierarchy of shared caches), StarPU will create combined workers of
-intermediate sizes. The <c>STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER</c> variable
-permits to tune the maximum arity between levels of combined workers.
+intermediate sizes. The variable \ref
+STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER permits tuning the maximum
+arity between levels of combined workers.

The combined workers actually produced can be seen in the output of the
-tool <c>starpu_machine_display</c> (the <c>STARPU_SCHED</c> environment variable
-has to be set to a combined worker-aware scheduler such as <c>pheft</c> or
-<c>peager</c>).
+tool <c>starpu_machine_display</c> (the environment variable \ref
+STARPU_SCHED has to be set to a combined worker-aware scheduler such
+as <c>pheft</c> or <c>peager</c>).

-\subsection Concurrent_parallel_tasks Concurrent parallel tasks
+\subsection ConcurrentParallelTasks Concurrent Parallel Tasks

Unfortunately, many environments and libraries do not support concurrent
calls.
@@ -836,8 +839,8 @@ sections for instance.

The solution is then to use only one combined worker at a time. This can be
done by setting the field starpu_conf::single_combined_worker to 1, or
-setting the <c>STARPU_SINGLE_COMBINED_WORKER</c> environment variable
-to 1. StarPU will then run only one parallel task at a time (but other
+setting the environment variable \ref STARPU_SINGLE_COMBINED_WORKER
+to 1. StarPU will then run only one parallel task at a time (but other
CPU and GPU tasks are not affected and can be run concurrently). The parallel
task scheduler will however still try varying combined worker
sizes to look for the most efficient ones.
@@ -845,23 +848,25 @@ sizes to look for the most efficient ones.
\section Debugging Debugging

StarPU provides several tools to help debugging applications. Execution traces
-can be generated and displayed graphically, see \ref Generating_traces_with_FxT. Some
-gdb helpers are also provided to show the whole StarPU state:
+can be generated and displayed graphically, see \ref
+GeneratingTracesWithFxT. Some gdb helpers are also provided to show
+the whole StarPU state:

\verbatim
(gdb) source tools/gdbinit
(gdb) help starpu
\endverbatim

-The Temanejo task debugger can also be used, see \ref Using_the_Temanejo_task_debugger.
+The Temanejo task debugger can also be used, see \ref UsingTheTemanejoTaskDebugger.
+
+\section TheMultiformatInterface The Multiformat Interface

-\section The_multiformat_interface The multiformat interface
It may be interesting to represent the same piece of data using two different
data structures: one that would only be used on CPUs, and one that would only
be used on GPUs. This can be done by using the multiformat interface. StarPU
will be able to convert data from one data structure to the other when needed.
-Note that the dmda scheduler is the only one optimized for this interface. The
-user must provide StarPU with conversion codelets:
+Note that the scheduler <c>dmda</c> is the only one optimized for this
+interface. The user must provide StarPU with conversion codelets:

\snippet multiformat.c To be included
@@ -897,9 +902,9 @@ extern "C" void multiformat_scal_cuda_func(void *buffers[], void *_args)
A full example may be found in <c>examples/basic_examples/multiformat.c</c>.

-\section Using_the_Driver_API Using the Driver API
+\section UsingTheDriverAPI Using The Driver API

-\ref Running_drivers
+\ref API_Running_Drivers

\code{.c}
int ret;
@@ -935,12 +940,12 @@ corresponding driver.
</li>
</ol>

-\section Defining_a_New_Scheduling_Policy Defining a New Scheduling Policy
+\section DefiningANewSchedulingPolicy Defining A New Scheduling Policy

A full example showing how to define a new scheduling policy is available in
the StarPU sources in the directory <c>examples/scheduler/</c>.

-\ref Scheduling_Policy
+See \ref API_Scheduling_Policy

\code{.c}
static struct starpu_sched_policy dummy_sched_policy = {
@@ -958,7 +963,7 @@ static struct starpu_sched_policy dummy_sched_policy = {
};
\endcode

-\section On-GPU_rendering On-GPU rendering
+\section On-GPURendering On-GPU Rendering

Graphical-oriented applications need to draw the result of their computations,
typically on the very GPU where these happened. Technologies such as OpenGL/CUDA
@@ -968,7 +973,7 @@ renderbuffer objects into CUDA. CUDA however imposes some technical
constraints: peer memcpy has to be disabled, and the thread that runs OpenGL has
to be the one that runs CUDA computations for that GPU.

-To achieve this with StarPU, pass the <c>--disable-cuda-memcpy-peer</c> option
+To achieve this with StarPU, pass the option \ref disable-cuda-memcpy-peer
to <c>./configure</c> (TODO: make it dynamic), OpenGL/GLUT has to be initialized
first, and the interoperability mode has to
be enabled by using the field
@@ -1009,7 +1014,7 @@ starpu_data_unregister(handle);
and display it e.g. in the callback function.

-\section Defining_a_New_Data_Interface Defining a New Data Interface
+\section DefiningANewDataInterface Defining A New Data Interface

Let's define a new data interface to manage complex numbers.
@@ -1117,16 +1122,15 @@ void display_complex_codelet(void *descr[], __attribute__ ((unused)) void *_args
The whole code for this complex data interface is available in the
directory <c>examples/interface/</c>.

-\section Setting_the_Data_Handles_for_a_Task Setting the Data Handles for a Task
+\section SettingTheDataHandlesForATask Setting The Data Handles For A Task

The number of data a task can manage is fixed by the macro
<c>STARPU_NMAXBUFS</c>, which has a default value that can be changed
-through the configure option <c>--enable-maxbuffers</c> (see
-@ref{--enable-maxbuffers}).
+through the configure option \ref enable-maxbuffers.

However, it is possible to define tasks managing more data by using
-the field <c>dyn_handles</c> when defining a task and the field
-<c>dyn_modes</c> when defining the corresponding codelet.
+the field starpu_task::dyn_handles when defining a task and the field
+starpu_codelet::dyn_modes when defining the corresponding codelet.

\code{.c}
enum starpu_data_access_mode modes[STARPU_NMAXBUFS+1] = {
@@ -1167,8 +1171,7 @@ starpu_insert_task(&dummy_big_cl,
The whole code for this example is available in the
directory <c>examples/basic_examples/dynamic_handles.c</c>.

-\section More_examples More examples
-
+\section MoreExamples More Examples

More examples are available in the StarPU sources in the <c>examples/</c>
directory. Simple examples include:
@@ -1179,8 +1182,8 @@ directory. Simple examples include:
<dt> <c>basic_examples/</c> </dt>
<dd>
Simple documented Hello world and vector/scalar product (as
- shown in \ref basicExamples), matrix
- product examples (as shown in \ref Performance_model_example), an example using the blocked matrix data
+ shown in \ref BasicExamples), matrix
+ product examples (as shown in \ref PerformanceModelExample), an example using the blocked matrix data
interface, an example using the variable data interface, and an example
using different formats on CPUs and GPUs.
</dd>