8 年之前 · c3b3e92719
--- a/doc/doxygen/Makefile.am
+++ b/doc/doxygen/Makefile.am
@@ -1,7 +1,7 @@
 
				 # StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				 #
			
 
				 # Copyright (C) 2009, 2011, 2013-2014  Université de Bordeaux
			
 
				-# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  CNRS
			
 
				+# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
			
 
				 # Copyright (C) 2014  INRIA
			
 
				 #
			
 
				 # Permission is granted to copy, distribute and/or modify this document
			
@@ -208,39 +208,45 @@ dox_inputs = $(DOX_CONFIG) 				\
 
				 	chapters/version.sty				\
			
 
				 	chapters/version.html				\
			
 
				 	$(top_srcdir)/include/starpu.h			\
			
 
				+	$(top_srcdir)/include/starpu_bitmap.h		\
			
 
				+	$(top_srcdir)/include/starpu_bound.h		\
			
 
				+	$(top_srcdir)/include/starpu_clusters_util.h	\
			
 
				+	$(top_srcdir)/include/starpu_cublas.h		\
			
 
				+	$(top_srcdir)/include/starpu_cuda.h		\
			
 
				 	$(top_srcdir)/include/starpu_data_filters.h	\
			
 
				+	$(top_srcdir)/include/starpu_data.h		\
			
 
				 	$(top_srcdir)/include/starpu_data_interfaces.h	\
			
 
				+	$(top_srcdir)/include/starpu_deprecated_api.h	\
			
 
				 	$(top_srcdir)/include/starpu_disk.h		\
			
 
				-	$(top_srcdir)/include/starpu_worker.h		\
			
 
				-	$(top_srcdir)/include/starpu_task.h		\
			
 
				-	$(top_srcdir)/include/starpu_task_bundle.h	\
			
 
				-	$(top_srcdir)/include/starpu_task_list.h	\
			
 
				-	$(top_srcdir)/include/starpu_task_util.h	\
			
 
				-	$(top_srcdir)/include/starpu_data.h		\
			
 
				-	$(top_srcdir)/include/starpu_perfmodel.h	\
			
 
				-	$(top_srcdir)/include/starpu_util.h		\
			
 
				+	$(top_srcdir)/include/starpu_driver.h		\
			
 
				+	$(top_srcdir)/include/starpu_expert.h		\
			
 
				 	$(top_srcdir)/include/starpu_fxt.h		\
			
 
				-	$(top_srcdir)/include/starpu_cuda.h		\
			
 
				+	$(top_srcdir)/include/starpu_hash.h		\
			
 
				+	$(top_srcdir)/include/starpu_mic.h		\
			
 
				+	$(top_srcdir)/include/starpu_mod.f90		\
			
 
				 	$(top_srcdir)/include/starpu_opencl.h		\
			
 
				 	$(top_srcdir)/include/starpu_openmp.h		\
			
 
				-	$(top_srcdir)/include/starpu_sink.h		\
			
 
				-	$(top_srcdir)/include/starpu_mic.h		\
			
 
				-	$(top_srcdir)/include/starpu_scc.h		\
			
 
				-	$(top_srcdir)/include/starpu_expert.h		\
			
 
				+	$(top_srcdir)/include/starpu_perfmodel.h	\
			
 
				 	$(top_srcdir)/include/starpu_profiling.h	\
			
 
				-	$(top_srcdir)/include/starpu_bound.h		\
			
 
				-	$(top_srcdir)/include/starpu_scheduler.h	\
			
 
				-	$(top_srcdir)/include/starpu_sched_ctx.h	\
			
 
				-	$(top_srcdir)/include/starpu_clusters_util.h			\
			
 
				-	$(top_srcdir)/include/starpu_sched_ctx_hypervisor.h		\
			
 
				-	$(top_srcdir)/include/starpu_top.h		\
			
 
				-	$(top_srcdir)/include/starpu_hash.h		\
			
 
				 	$(top_srcdir)/include/starpu_rand.h		\
			
 
				-	$(top_srcdir)/include/starpu_cublas.h		\
			
 
				-	$(top_srcdir)/include/starpu_driver.h		\
			
 
				+	$(top_srcdir)/include/starpu_scc.h		\
			
 
				+	$(top_srcdir)/include/starpu_sched_component.h	\
			
 
				+	$(top_srcdir)/include/starpu_sched_ctx.h	\
			
 
				+	$(top_srcdir)/include/starpu_sched_ctx_hypervisor.h	\
			
 
				+	$(top_srcdir)/include/starpu_scheduler.h	\
			
 
				+	$(top_srcdir)/include/starpu_simgrid_wrap.h	\
			
 
				+	$(top_srcdir)/include/starpu_sink.h		\
			
 
				 	$(top_srcdir)/include/starpu_stdlib.h		\
			
 
				+	$(top_srcdir)/include/starpu_task_bundle.h	\
			
 
				+	$(top_srcdir)/include/starpu_task.h		\
			
 
				+	$(top_srcdir)/include/starpu_task_list.h	\
			
 
				+	$(top_srcdir)/include/starpu_task_util.h	\
			
 
				 	$(top_srcdir)/include/starpu_thread.h		\
			
 
				 	$(top_srcdir)/include/starpu_thread_util.h	\
			
 
				+	$(top_srcdir)/include/starpu_top.h		\
			
 
				+	$(top_srcdir)/include/starpu_tree.h		\
			
 
				+	$(top_srcdir)/include/starpu_util.h		\
			
 
				+	$(top_srcdir)/include/starpu_worker.h		\
			
 
				 	$(top_srcdir)/include/fstarpu_mod.f90		\
			
 
				 	$(top_srcdir)/mpi/include/starpu_mpi.h 		\
			
 
				 	$(top_srcdir)/mpi/include/fstarpu_mpi_mod.f90		\
			
--- a/doc/doxygen/chapters/01building.doxy
+++ b/doc/doxygen/chapters/01building.doxy
@@ -1,7 +1,7 @@
 
				 /*
			
 
				  * This file is part of the StarPU Handbook.
			
 
				  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
			
 
				  * Copyright (C) 2011, 2012 INRIA
			
 
				  * See the file version.doxy for copying conditions.
			
 
				  */
			
@@ -47,14 +47,14 @@ no option is required, it will be detected automatically, otherwise
 
				 \subsection GettingSources Getting Sources
			
 
				 
			
 
				 StarPU's sources can be obtained from the download page of
			
 
				-the StarPU website (http://runtime.bordeaux.inria.fr/StarPU/files/).
			
 
				+the StarPU website (http://starpu.gforge.inria.fr/files/).
			
 
				 
			
 
				 All releases and the development tree of StarPU are freely available
			
 
				 on INRIA's gforge under the LGPL license. Some releases are available
			
 
				 under the BSD license.
			
 
				 
			
 
				 The latest release can be downloaded from the INRIA's gforge (http://gforge.inria.fr/frs/?group_id=1570) or
			
 
				-directly from the StarPU download page (http://runtime.bordeaux.inria.fr/StarPU/files/).
			
 
				+directly from the StarPU download page (http://starpu.gforge.inria.fr/files/).
			
 
				 
			
 
				 The latest nightly snapshot can be downloaded from the StarPU gforge website (http://starpu.gforge.inria.fr/testing/).
			
 
				 
			
@@ -227,7 +227,7 @@ Batch files are provided to run StarPU applications under Microsoft
 
				 Visual C. They are installed in <c>$STARPU_PATH/bin/msvc</c>.
			
 
				 
			
 
				 To execute a StarPU application, you first need to set the environment
			
 
				-variable <c>STARPU_PATH</c>.
			
 
				+variable \ref STARPU_PATH.
			
 
				 
			
 
				 \verbatim
			
 
				 c:\....> cd c:\cygwin\home\ci\starpu\
			
--- a/doc/doxygen/chapters/02basic_examples.doxy
+++ b/doc/doxygen/chapters/02basic_examples.doxy
@@ -1,7 +1,7 @@
 
				 /*
			
 
				  * This file is part of the StarPU Handbook.
			
 
				  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013  CNRS
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
			
 
				  * Copyright (C) 2011, 2012 INRIA
			
 
				  * See the file version.doxy for copying conditions.
			
 
				  */
			
@@ -323,8 +323,7 @@ additional examples, is available in the directory <c>gcc-plugin/examples</c>
 
				 of the StarPU distribution. These extensions map directly
			
 
				 to StarPU's main concepts: tasks, task implementations for CPU,
			
 
				 OpenCL, or CUDA, and registered data buffers. The standard C version
			
 
				-that uses StarPU's standard C programming interface is given in \ref
			
 
				-VectorScalingUsingStarPUAPI.
			
 
				+that uses StarPU's standard C programming interface is given in \ref VectorScalingUsingStarPUAPI.
			
 
				 
			
 
				 First of all, the vector-scaling task and its simple CPU implementation
			
 
				 has to be defined:
			
@@ -532,7 +531,7 @@ starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector,
 
				 The first argument, called the <b>data handle</b>, is an opaque pointer which
			
 
				 designates the array in StarPU. This is also the structure which is used to
			
 
				 describe which data is used by a task. The second argument is the node number
			
 
				-where the data originally resides. Here it is STARPU_MAIN_RAM since the array <c>vector</c> is in
			
 
				+where the data originally resides. Here it is ::STARPU_MAIN_RAM since the array <c>vector</c> is in
			
 
				 the main memory. Then comes the pointer <c>vector</c> where the data can be found in main memory,
			
 
				 the number of elements in the vector and the size of each element.
			
 
				 The following shows how to construct a StarPU task that will manipulate the
			
--- a/doc/doxygen/chapters/05check_list_performance.doxy
+++ b/doc/doxygen/chapters/05check_list_performance.doxy
@@ -1,7 +1,7 @@
 
				 /*
			
 
				  * This file is part of the StarPU Handbook.
			
 
				  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
			
 
				  * Copyright (C) 2011, 2012 INRIA
			
 
				  * See the file version.doxy for copying conditions.
			
 
				  */
			
@@ -124,9 +124,9 @@ memory gets tight. This also means that by default StarPU will not cache buffer
 
				 allocations in main memory, since it does not know how much of the system memory
			
 
				 it can afford.
			
 
				 
			
 
				-In the case of GPUs, the \ref STARPU_LIMIT_CUDA_MEM, \ref
			
 
				-STARPU_LIMIT_CUDA_devid_MEM, \ref STARPU_LIMIT_OPENCL_MEM, and \ref
			
 
				-STARPU_LIMIT_OPENCL_devid_MEM environment variables can be used to control how
			
 
				+In the case of GPUs, the \ref STARPU_LIMIT_CUDA_MEM, \ref STARPU_LIMIT_CUDA_devid_MEM,
			
 
				+\ref STARPU_LIMIT_OPENCL_MEM, and \ref STARPU_LIMIT_OPENCL_devid_MEM environment variables
			
 
				+can be used to control how
			
 
				 much (in MiB) of the GPU device memory should be used at most by StarPU (their
			
 
				 default values are 90% of the available memory).
			
 
				 
			
@@ -139,27 +139,28 @@ involved, or if allocation fragmentation can become a problem), and when using
 
				 
			
 
				 It should be noted that by default only buffer allocations automatically
			
 
				 done by StarPU are accounted here, i.e. allocations performed through
			
 
				-<c>starpu_malloc_on_node()</c> which are used by the data interfaces
			
 
				+starpu_malloc_on_node() which are used by the data interfaces
			
 
				 (matrix, vector, etc.).  This does not include allocations performed by
			
 
				 the application through e.g. malloc(). It does not include allocations
			
 
				-performed through <c>starpu_malloc()</c> either, only allocations
			
 
				-performed explicitly with the \ref STARPU_MALLOC_COUNT flag (i.e. through
			
 
				-<c>starpu_malloc_flags(STARPU_MALLOC_COUNT)</c>) are taken into account.  If the
			
 
				+performed through starpu_malloc() either, only allocations
			
 
				+performed explicitly with the \ref STARPU_MALLOC_COUNT flag (i.e. by passing
			
 
				+the parameter \ref STARPU_MALLOC_COUNT when calling starpu_malloc_flags())
			
 
				+are taken into account.  If the
			
 
				 application wants to make StarPU aware of its own allocations, so that StarPU
			
 
				 knows precisely how much data is allocated, and thus when to evict allocation
			
 
				-caches or data out to the disk, \ref starpu_memory_allocate can be used to
			
 
				-specify an amount of memory to be accounted for. \ref starpu_memory_deallocate
			
 
				+caches or data out to the disk, starpu_memory_allocate() can be used to
			
 
				+specify an amount of memory to be accounted for. starpu_memory_deallocate()
			
 
				 can be used to account freed memory back. Those can for instance be used by data
			
 
				-interfaces with dynamic data buffers: instead of using starpu_malloc_on_node,
			
 
				+interfaces with dynamic data buffers: instead of using starpu_malloc_on_node(),
			
 
				 they would dynamically allocate data with malloc/realloc, and notify starpu of
			
 
				-the delta thanks to starpu_memory_allocate and starpu_memory_deallocate calls.
			
 
				+the delta thanks to starpu_memory_allocate() and starpu_memory_deallocate() calls.
			
 
				 
			
 
				-\ref starpu_memory_get_total and \ref starpu_memory_get_available
			
 
				+starpu_memory_get_total() and starpu_memory_get_available()
			
 
				 can be used to get an estimation of how much memory is available.
			
 
				-\ref starpu_memory_wait_available can also be used to block until an
			
 
				-amount of memory becomes available (but it may be preferrable to use
			
 
				-<c>starpu_memory_allocate(STARPU_MEMORY_WAIT)</c> to reserve that amount
			
 
				-immediately).
			
 
				+starpu_memory_wait_available() can also be used to block until an
			
 
				+amount of memory becomes available (but it may be preferable to call
			
 
				+starpu_memory_allocate() with the parameter \ref STARPU_MEMORY_WAIT
			
 
				+to reserve that amount immediately).
			
 
				 
			
 
				 \section HowToReduceTheMemoryFootprintOfInternalDataStructures How To Reduce The Memory Footprint Of Internal Data Structures
			
 
				 
			
@@ -185,12 +186,13 @@ The size of the various structures of StarPU can be printed by the
 
				 tests/microbenchs/display_structures_size.
			
 
				 
			
 
				 It is also often useless to submit *all* the tasks at the same time. One can
			
 
				-make the starpu_task_submit function block when a reasonable given number of
			
 
				-tasks have been submitted, by setting the STARPU_LIMIT_MIN_SUBMITTED_TASKS and
			
 
				-STARPU_LIMIT_MAX_SUBMITTED_TASKS environment variables, for instance:
			
 
				+make the starpu_task_submit() function block when a reasonable given number of
			
 
				+tasks have been submitted, by setting the \ref STARPU_LIMIT_MIN_SUBMITTED_TASKS and
			
 
				+\ref STARPU_LIMIT_MAX_SUBMITTED_TASKS environment variables, for instance:
			
 
				 
			
 
				 <c>
			
 
				 export STARPU_LIMIT_MAX_SUBMITTED_TASKS=10000
			
 
				+
			
 
				 export STARPU_LIMIT_MIN_SUBMITTED_TASKS=9000
			
 
				 </c>
			
 
				 
			
@@ -201,12 +203,12 @@ course this may reduce parallelism if the threshold is set too low. The precise
 
				 balance depends on the application task graph.
			
 
				 
			
 
				 An idea of how much memory is used for tasks and data handles can be obtained by
			
 
				-setting the STARPU_MAX_MEMORY_USE environment variable to 1.
			
 
				+setting the \ref STARPU_MAX_MEMORY_USE environment variable to 1.
			
 
				 
			
 
				 \section HowtoReuseMemory How to reuse memory
			
 
				 
			
 
				 When your application needs to allocate more data than the available amount of
			
 
				-memory usable by StarPU (given by \ref starpu_memory_get_available() ), the
			
 
				+memory usable by StarPU (given by starpu_memory_get_available()), the
			
 
				 allocation cache system can reuse data buffers used by previously executed
			
 
				 tasks. For that system to work with MPI tasks, you need to submit tasks progressively instead
			
 
				 of as soon as possible, because in the case of MPI receives, the allocation cache check for reusing data
			
@@ -214,16 +216,16 @@ buffers will be done at submission time, not at execution time.
 
				 
			
 
				 You have two options to control the task submission flow. The first one is by
			
 
				 controlling the number of submitted tasks during the whole execution. This can
			
 
				-be done whether by setting the environment variables \ref
			
 
				-STARPU_LIMIT_MAX_SUBMITTED_TASKS and \ref STARPU_LIMIT_MIN_SUBMITTED_TASKS to
			
 
				+be done either by setting the environment variables
			
 
				+\ref STARPU_LIMIT_MAX_SUBMITTED_TASKS and \ref STARPU_LIMIT_MIN_SUBMITTED_TASKS to
			
 
				 tell StarPU when to stop submitting tasks and when to wake up and submit tasks
			
 
				-again, or by explicitely calling \ref starpu_task_wait_for_n_submitted() in
			
 
				+again, or by explicitly calling starpu_task_wait_for_n_submitted() in
			
 
				 your application code for finest grain control (for example, between two
			
 
				 iterations of a submission loop).
			
 
				 
			
 
				 The second option is to control the memory size of the allocation cache. This
			
 
				-can be done in the application by using jointly \ref
			
 
				-starpu_memory_get_available() and \ref starpu_memory_wait_available() to submit
			
 
				+can be done in the application by using jointly
			
 
				+starpu_memory_get_available() and starpu_memory_wait_available() to submit
			
 
				 tasks only when there is enough memory space to allocate the data needed by the
			
 
				 task, i.e. when enough data are available for reuse in the allocation cache.
			
 
				 
			
--- a/doc/doxygen/chapters/06tasks.doxy
+++ b/doc/doxygen/chapters/06tasks.doxy
@@ -99,8 +99,8 @@ directory <c>examples/basic_examples/dynamic_handles.c</c>.
 
				 
			
 
				 Normally, the number of data handles given to a task is fixed in the
			
 
				 starpu_codelet::nbuffers codelet field. This field can however be set to
			
 
				-STARPU_VARIABLE_NBUFFERS, in which case the starpu_task::nbuffers task field
			
 
				-must be set, and the starpu_task::modes field (or starpu_task_dyn_modes field,
			
 
				+\ref STARPU_VARIABLE_NBUFFERS, in which case the starpu_task::nbuffers task field
			
 
				+must be set, and the starpu_task::modes field (or starpu_task::dyn_modes field,
			
 
				 see \ref SettingManyDataHandlesForATask) should be used to specify the modes for
			
 
				 the handles.
			
 
				 
			
@@ -493,13 +493,12 @@ structure as detected by <c>hwloc</c>. It means that for each object of the <c>h
 
				 topology (NUMA node, socket, cache, ...) a combined worker will be created. If
			
 
				 some nodes of the hierarchy have a big arity (e.g. many cores in a socket
			
 
				 without a hierarchy of shared caches), StarPU will create combined workers of
			
 
				-intermediate sizes. The variable \ref
			
 
				-STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER permits to tune the maximum
			
 
				-arity between levels of combined workers.
			
 
				+intermediate sizes. The variable \ref STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER
			
 
				+permits to tune the maximum arity between levels of combined workers.
			
 
				 
			
 
				 The combined workers actually produced can be seen in the output of the
			
 
				-tool <c>starpu_machine_display</c> (the environment variable \ref
			
 
				-STARPU_SCHED has to be set to a combined worker-aware scheduler such
			
 
				+tool <c>starpu_machine_display</c> (the environment variable
			
 
				+\ref STARPU_SCHED has to be set to a combined worker-aware scheduler such
			
 
				 as <c>pheft</c> or <c>peager</c>).
			
 
				 
			
 
				 \subsection ConcurrentParallelTasks Concurrent Parallel Tasks
			
@@ -539,7 +538,7 @@ without any data access or execution content: as soon as its dependencies become
 
				 available, it will terminate, call the callbacks, and release dependencies.
			
 
				 
			
 
				 An intermediate solution is to define a codelet with its <c>where</c> field set
			
 
				-to STARPU_NOWHERE, for instance this:
			
 
				+to \ref STARPU_NOWHERE, for instance this:
			
 
				 
			
 
				 \code{.c}
			
 
				 struct starpu_codelet {
			
--- a/doc/doxygen/chapters/07data_management.doxy
+++ b/doc/doxygen/chapters/07data_management.doxy
@@ -111,7 +111,7 @@ starpu_data_idle_prefetch_on_node() variant can be used to issue the transfer
 
				 only when the bus is idle.
			
 
				 
			
 
				 Conversely, one can advise StarPU that some data will not be useful in the
			
 
				-close future by calling starpu_data_wont_use. StarPU will then write its value
			
 
				+close future by calling starpu_data_wont_use(). StarPU will then write its value
			
 
				 back to its home node, and evict it from GPUs when room is needed.
			
 
				 
			
 
				 \section PartitioningData Partitioning Data
			
@@ -194,7 +194,7 @@ but applications can also write their own data interfaces and filters, see
 
				 \section AsynchronousPartitioning Asynchronous Partitioning
			
 
				 
			
 
				 The partitioning functions described in the previous section are synchronous:
			
 
				-starpu_data_partition and starpu_data_unpartition both wait for all the tasks
			
 
				+starpu_data_partition() and starpu_data_unpartition() both wait for all the tasks
			
 
				 currently working on the data.  This can be a bottleneck for the application.
			
 
				 
			
 
				 An asynchronous API also exists, it works only on handles with sequential
			
@@ -217,7 +217,7 @@ struct starpu_data_filter f_vert =
 
				 starpu_data_partition_plan(handle, &f_vert, vert_handle);
			
 
				 \endcode
			
 
				 
			
 
				-starpu_data_partition_plan returns the handles for the partition in vert_handle.
			
 
				+starpu_data_partition_plan() returns the handles for the partition in vert_handle.
			
 
				 
			
 
				 One can submit tasks working on the main handle, but not yet on the vert_handle
			
 
				 handles. Now we submit the partitioning:
			
@@ -244,7 +244,7 @@ to submit unpartitioning (to get back to the initial handle) before submitting
 
				 another partitioning.
			
 
				 
			
 
				 It is also possible to activate several partitioning at the same time, in
			
 
				-read-only mode, by using starpu_data_partition_readonly_submit.  A complete
			
 
				+read-only mode, by using starpu_data_partition_readonly_submit(). A complete
			
 
				 example is available in <c>examples/filters/fmultiple_submit_readonly.c</c>.
			
 
				 
			
 
				 \section ManualPartitioning Manual Partitioning
			
@@ -450,9 +450,9 @@ before them, and the task running cl3 will always be run after them.
 
				 
			
 
				 If a lot of tasks use the commute access on the same set of data and a lot of
			
 
				 them are ready at the same time, it may become interesting to use an arbiter,
			
 
				-see \ref ConcurrentDataAccess .
			
 
				+see \ref ConcurrentDataAccess.
			
 
				 
			
 
				-\section ConcurrentDataAccess Concurrent Data accesses
			
 
				+\section ConcurrentDataAccess Concurrent Data Accesses
			
 
				 
			
 
				 When several tasks are ready and will work on several data, StarPU is faced with
			
 
				 the classical Dining Philosophers problem, and has to determine the order in
			
@@ -476,7 +476,7 @@ will acquire them arbiter by arbiter, in arbiter pointer value order.
 
				 
			
 
				 See the tests/datawizard/test_arbiter.cpp example.
			
 
				 
			
 
				-Arbiters however do not support the STARPU_REDUX flag yet.
			
 
				+Arbiters however do not support the ::STARPU_REDUX flag yet.
			
 
				 
			
 
				 \section TemporaryBuffers Temporary Buffers
			
 
				 
			
@@ -521,8 +521,8 @@ codelet is needed).
 
				 Some kernels sometimes need temporary data to achieve the computations, i.e. a
			
 
				 workspace. The application could allocate it at the start of the codelet
			
 
				 function, and free it at the end, but that would be costly. It could also
			
 
				-allocate one buffer per worker (similarly to \ref
			
 
				-HowToInitializeAComputationLibraryOnceForEachWorker), but that would
			
 
				+allocate one buffer per worker (similarly to \ref HowToInitializeAComputationLibraryOnceForEachWorker),
			
 
				+but that would
			
 
				 make them systematic and permanent. A more  optimized way is to use
			
 
				 the data access mode ::STARPU_SCRATCH, as exemplified below, which
			
 
				 provides per-worker buffers without content consistency. The buffer is
			
@@ -706,7 +706,7 @@ happen that the task kernel would rather have some of the datas kept in the
 
				 main memory instead of copied in the GPU, a pivoting vector for instance.
			
 
				 This can be achieved by setting the starpu_codelet::specific_nodes flag to
			
 
				 1, and then fill the starpu_codelet::nodes array (or starpu_codelet::dyn_nodes when
			
 
				-starpu_codelet::nbuffers is greater than STARPU_NMAXBUFS) with the node numbers
			
 
				+starpu_codelet::nbuffers is greater than \ref STARPU_NMAXBUFS) with the node numbers
			
 
				 where data should be copied to, or -1 to let StarPU copy it to the memory node
			
 
				 where the task will be executed. For instance, with the following codelet:
			
 
				 
			
--- a/doc/doxygen/chapters/08scheduling.doxy
+++ b/doc/doxygen/chapters/08scheduling.doxy
@@ -1,7 +1,7 @@
 
				 /*
			
 
				  * This file is part of the StarPU Handbook.
			
 
				  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
			
 
				  * Copyright (C) 2011, 2012 INRIA
			
 
				  * See the file version.doxy for copying conditions.
			
 
				  */
			
@@ -197,10 +197,10 @@ methods of the policy.
 
				 
			
 
				 Make sure to have a look at the \ref API_Scheduling_Policy section, which
			
 
				 provides a list of the available functions for writing advanced schedulers, such
			
 
				-as starpu_task_expected_length, starpu_task_expected_data_transfer_time,
			
 
				-starpu_task_expected_energy, starpu_prefetch_task_input_node, etc. Other
			
 
				-useful functions include starpu_transfer_bandwidth, starpu_transfer_latency,
			
 
				-starpu_transfer_predict, ...
			
 
				+as starpu_task_expected_length(), starpu_task_expected_data_transfer_time(),
			
 
				+starpu_task_expected_energy(), starpu_prefetch_task_input_node(), etc. Other
			
 
				+useful functions include starpu_transfer_bandwidth(), starpu_transfer_latency(),
			
 
				+starpu_transfer_predict(), ...
			
 
				 
			
 
				 Usual functions can also be used on tasks, for instance one can do
			
 
				 
			
@@ -241,8 +241,8 @@ policy which automatically computes priorities by bottom-up rank.
 
				 
			
 
				 The idea is that while the application submits tasks, they are only pushed
			
 
				 to a bag of tasks. When the application is finished with submitting tasks,
			
 
				-it calls starpu_do_schedule (or starpu_task_wait_for_all, which calls
			
 
				-starpu_do_schedule), and the starpu_sched_policy::do_schedule method of the
			
 
				+it calls starpu_do_schedule() (or starpu_task_wait_for_all(), which calls
			
 
				+starpu_do_schedule()), and the starpu_sched_policy::do_schedule method of the
			
 
				 scheduler is called. This method calls _starpu_graph_compute_depths to compute
			
 
				 the bottom-up ranks, and then uses these ranks to set priorities over tasks.
			
 
				 
			
@@ -257,9 +257,9 @@ All the \ref OnlinePerformanceTools and \ref OfflinePerformanceTools can
 
				 be used to get information about how well the execution proceeded, and thus the
			
 
				 overall quality of the execution.
			
 
				 
			
 
				-Precise debugging can also be performed by using the \ref
			
 
				-STARPU_TASK_BREAK_ON_SCHED, \ref STARPU_TASK_BREAK_ON_PUSH, and \ref
			
 
				-STARPU_TASK_BREAK_ON_POP environment variables. By setting the job_id of a task
			
 
				+Precise debugging can also be performed by using the
			
 
				+\ref STARPU_TASK_BREAK_ON_SCHED, \ref STARPU_TASK_BREAK_ON_PUSH, and
			
 
				+\ref STARPU_TASK_BREAK_ON_POP environment variables. By setting the job_id of a task
			
 
				 in these environment variables, StarPU will raise SIGTRAP when the task is being
			
 
				 scheduled, pushed, or popped by the scheduler. That means that when one notices
			
 
				 that a task is being scheduled in a seemingly odd way, one can just reexecute
			
--- a/doc/doxygen/chapters/09scheduling_contexts.doxy
+++ b/doc/doxygen/chapters/09scheduling_contexts.doxy
@@ -1,7 +1,7 @@
 
				 /*
			
 
				  * This file is part of the StarPU Handbook.
			
 
				 //  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013  CNRS
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
			
 
				  * Copyright (C) 2011, 2012 INRIA
			
 
				  * See the file version.doxy for copying conditions.
			
 
				  */
			
@@ -41,7 +41,7 @@ does not provide any worker list and leaves the Hypervisor assign
 
				 workers to each context according to their needs (\ref SchedulingContextHypervisor)
			
 
				 
			
 
				 Both cases require a call to the function
			
 
				-<c>starpu_sched_ctx_create</c>, which requires as input the worker
			
 
				+starpu_sched_ctx_create(), which requires as input the worker
			
 
				 list (the exact list or a NULL pointer) and a list of optional
			
 
				 parameters such as the scheduling policy, terminated by a 0. The
			
 
				 scheduling policy can be a character list corresponding to the name of
			
@@ -99,13 +99,13 @@ starpu_sched_ctx_remove_workers(workerids, 3, sched_ctx1);
 
				 \section SubmittingTasksToAContext Submitting Tasks To A Context
			
 
				 The application may submit tasks to several contexts either 
			
 
				 simultaneously or sequentially. If several threads of submission
			
 
				-are used the function <c>starpu_sched_ctx_set_context</c> may be called just
			
 
				-before <c>starpu_task_submit</c>. Thus StarPU considers that 
			
 
				+are used the function starpu_sched_ctx_set_context() may be called just
			
 
				+before starpu_task_submit(). Thus StarPU considers that 
			
 
				 the current thread will submit tasks to the corresponding context.
			
 
				  
			
 
				 When the application may not assign a thread of submission to each
			
 
				 context, the id of the context must be indicated by using the
			
 
				-function <c>starpu_task_submit_to_ctx</c> or the field <c>STARPU_SCHED_CTX</c> 
			
 
				+function starpu_task_submit_to_ctx() or the field \ref STARPU_SCHED_CTX 
			
 
				 for starpu_task_insert().
			
 
				 
			
 
				 \section DeletingAContext Deleting A Context
			
@@ -153,7 +153,7 @@ the contexts these tasks start being submitted. However, if resources
 
				 are never allocated to the context the program will not terminate. 
			
 
				 If these tasks have low
			
 
				 priority the programmer can forbid the application to submit them
			
 
				-by calling the function <c>starpu_sched_ctx_stop_task_submission()</c>.
			
 
				+by calling the function starpu_sched_ctx_stop_task_submission().
			
 
				 
			
 
				 \section ContextsSharingWorkers Contexts Sharing Workers
			
 
				 
			
@@ -162,7 +162,7 @@ efficiently enough alone on these workers or when the application
 
				 decides to express a hierarchy of contexts. The workers apply an
			
 
				 algorithm of ``Round-Robin'' to choose the context on which they will
			
 
				 ``pop'' next. By using the function
			
 
				-<c>starpu_sched_ctx_set_turn_to_other_ctx</c>, the programmer can impose
			
 
				+starpu_sched_ctx_set_turn_to_other_ctx(), the programmer can impose
			
 
				 the <c>workerid</c> to ``pop'' in the context <c>sched_ctx_id</c>
			
 
				 next.
			
 
				 
			
--- a/doc/doxygen/chapters/11debugging_tools.doxy
+++ b/doc/doxygen/chapters/11debugging_tools.doxy
@@ -1,7 +1,7 @@
 
				 /*
			
 
				  * This file is part of the StarPU Handbook.
			
 
				  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
			
 
				  * Copyright (C) 2011, 2012 INRIA
			
 
				  * See the file version.doxy for copying conditions.
			
 
				  */
			
@@ -9,8 +9,7 @@
 
				 /*! \page DebuggingTools Debugging Tools
			
 
				 
			
 
				 StarPU provides several tools to help debugging applications. Execution traces
			
 
				-can be generated and displayed graphically, see \ref
			
 
				-GeneratingTracesWithFxT.
			
 
				+can be generated and displayed graphically, see \ref GeneratingTracesWithFxT.
			
 
				 
			
 
				 \section DebuggingInGeneral TroubleShooting In General
			
 
				 
			
--- a/doc/doxygen/chapters/12online_performance_tools.doxy
+++ b/doc/doxygen/chapters/12online_performance_tools.doxy
@@ -1,7 +1,7 @@
 
				 /*
			
 
				  * This file is part of the StarPU Handbook.
			
 
				  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
			
 
				  * Copyright (C) 2011, 2012 INRIA
			
 
				  * See the file version.doxy for copying conditions.
			
 
				  */
			
@@ -82,28 +82,28 @@ wasted in pure StarPU overhead.
 
				 Calling starpu_profiling_worker_get_info() resets the profiling
			
 
				 information associated to a worker.
			
 
				 
			
 
				-To easily display all this information, the environment variable \ref
			
 
				-STARPU_WORKER_STATS can be set to 1 (in addition to setting \ref
			
 
				-STARPU_PROFILING to 1). A summary will then be displayed at program termination:
			
 
				+To easily display all this information, the environment variable
			
 
				+\ref STARPU_WORKER_STATS can be set to 1 (in addition to setting
			
 
				+\ref STARPU_PROFILING to 1). A summary will then be displayed at program termination:
			
 
				 
			
 
				 \verbatim
			
 
				 Worker stats:
			
 
				-CUDA 0.0 (4.7 GiB)      
			
 
				+CUDA 0.0 (4.7 GiB)
			
 
				 	480 task(s)
			
 
				 	total: 1574.82 ms executing: 1510.72 ms sleeping: 0.00 ms overhead 64.10 ms
			
 
				 	325.217970 GFlop/s
			
 
				 
			
 
				-CPU 0                           
			
 
				+CPU 0
			
 
				 	22 task(s)
			
 
				 	total: 1574.82 ms executing: 1364.81 ms sleeping: 0.00 ms overhead 210.01 ms
			
 
				 	7.512057 GFlop/s
			
 
				 
			
 
				-CPU 1                           
			
 
				+CPU 1
			
 
				 	14 task(s)
			
 
				 	total: 1574.82 ms executing: 1500.13 ms sleeping: 0.00 ms overhead 74.69 ms
			
 
				 	6.675853 GFlop/s
			
 
				 
			
 
				-CPU 2                           
			
 
				+CPU 2
			
 
				 	14 task(s)
			
 
				 	total: 1574.82 ms executing: 1553.12 ms sleeping: 0.00 ms overhead 21.70 ms
			
 
				 	7.152886 GFlop/s
			
@@ -113,8 +113,8 @@ The number of GFlops is available because the starpu_task::flops field of the
 
				 tasks were filled (or STARPU_FLOPS used in starpu_task_insert).
			
 
				 
			
 
				 When an FxT trace is generated (see \ref GeneratingTracesWithFxT), it is also
			
 
				-possible to use the tool <c>starpu_workers_activity</c> (see \ref
			
 
				-MonitoringActivity) to generate a graphic showing the evolution of
			
 
				+possible to use the tool <c>starpu_workers_activity</c> (see
			
 
				+\ref MonitoringActivity) to generate a graphic showing the evolution of
			
 
				 these values during the time, for the different workers.
			
 
				 
			
 
				 \subsection Bus-relatedFeedback Bus-related Feedback
			
@@ -139,8 +139,8 @@ CUDA 2  4534.229519     2417.069025     2417.060863     0.000000
 
				 \endverbatim
			
 
				 
			
 
				 Statistics about the data transfers which were performed and temporal average
			
 
				-of bandwidth usage can be obtained by setting the environment variable \ref
			
 
				-STARPU_BUS_STATS to 1; a summary will then be displayed at program termination:
			
 
				+of bandwidth usage can be obtained by setting the environment variable
			
 
				+\ref STARPU_BUS_STATS to 1; a summary will then be displayed at program termination:
			
 
				 
			
 
				 \verbatim
			
 
				 Data transfer stats:
			
@@ -360,7 +360,7 @@ struct starpu_codelet cl = {
 
				 <li>
			
 
				 Measured at runtime and refined by regression (model types
			
 
				 ::STARPU_REGRESSION_BASED and ::STARPU_NL_REGRESSION_BASED). This
			
 
				-still assumes performance regularity, but works 
			
 
				+still assumes performance regularity, but works
			
 
				 with various data input sizes, by applying regression over observed
			
 
				 execution times. ::STARPU_REGRESSION_BASED uses an a*n^b regression
			
 
				 form, ::STARPU_NL_REGRESSION_BASED uses an a*n^b+c (more precise than
			
@@ -380,8 +380,8 @@ so as to feed the performance model for a variety of
 
				 inputs. The application can also provide the measurements explictly by
			
 
				 using the function starpu_perfmodel_update_history(). The tools
			
 
				 <c>starpu_perfmodel_display</c> and <c>starpu_perfmodel_plot</c> can
			
 
				-be used to observe how much the performance model is calibrated (\ref
			
 
				-PerformanceModelCalibration); when their output look good,
			
 
				+be used to observe how much the performance model is calibrated
			
 
				+(\ref PerformanceModelCalibration); when their output looks good,
			
 
				 \ref STARPU_CALIBRATE can be reset to <c>0</c> to let
			
 
				 StarPU use the resulting performance model without recording new measures, and
			
 
				 \ref STARPU_SCHED can be set to <c>dmda</c> to benefit from the performance models. If
			
--- a/doc/doxygen/chapters/13offline_performance_tools.doxy
+++ b/doc/doxygen/chapters/13offline_performance_tools.doxy
@@ -1,7 +1,7 @@
 
				 /*
			
 
				  * This file is part of the StarPU Handbook.
			
 
				  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
			
 
				  * Copyright (C) 2011, 2012 INRIA
			
 
				  * See the file version.doxy for copying conditions.
			
 
				  */
			
@@ -136,12 +136,12 @@ By default, all tasks are displayed using a green color. To display tasks with
 
				 varying colors, pass option <c>-c</c> to <c>starpu_fxt_tool</c>.
			
 
				 
			
 
				 To identify tasks precisely, the application can set the starpu_task::tag_id field of the
			
 
				-task (or use STARPU_TAG_ONLY when using starpu_task_insert()), and with a recent
			
 
				+task (or use \ref STARPU_TAG_ONLY when using starpu_task_insert()), and with a recent
			
 
				 enough version of vite (>= r1430) and the
			
 
				 \ref enable-paje-codelet-details "--enable-paje-codelet-details"
			
 
				 StarPU configure option, the value of the tag will show up in the trace.
			
 
				 
			
 
				-It can also set the starpu_task::name field of the task (or use STARPU_NAME)
			
 
				+It can also set the starpu_task::name field of the task (or use \ref STARPU_NAME
			
 
				 when using starpu_task_insert()), to replace in traces the name of the codelet
			
 
				 with an arbitrarily chosen name.
			
 
				 
			
@@ -183,8 +183,8 @@ This will create a <c>tasks.rec</c> file in the current directory.  This file
 
				 is in the recutils format, i.e. <c>Field: value</c> lines, and empty lines to
			
 
				 separate each task.  This can be used as a convenient input for various ad-hoc
			
 
				 analysis tools.  The performance models can be opened for instance by using
			
 
				-\ref starpu_perfmodel_load_symbol and then using
			
 
				-\ref starpu_perfmodel_history_based_expected_perf
			
 
				+starpu_perfmodel_load_symbol() and then using
			
 
				+starpu_perfmodel_history_based_expected_perf().
			
 
				 
			
 
				 \subsection MonitoringActivity Monitoring Activity
			
 
				 
			
@@ -241,7 +241,7 @@ the time of the whole execution. This can be achieved by calling
 
				 starpu_fxt_autostart_profiling(0);
			
 
				 \endverbatim
			
 
				 
			
 
				-before calling starpu_init, to prevent tracing from starting immediately. Then
			
 
				+before calling starpu_init(), to prevent tracing from starting immediately. Then
			
 
				 
			
 
				 \verbatim
			
 
				 starpu_fxt_start_profiling();
			
@@ -329,8 +329,8 @@ run with the tool <c>gnuplot</c>, which shows the corresponding curve.
 
				 \image html starpu_non_linear_memset_regression_based.png
			
 
				 \image latex starpu_non_linear_memset_regression_based.eps "" width=\textwidth
			
 
				 
			
 
				-When the field starpu_task::flops is set (or STARPU_FLOPS is passed to
			
 
				-starpu_task_insert), <c>starpu_perfmodel_plot</c> can directly draw a GFlops
			
 
				+When the field starpu_task::flops is set (or \ref STARPU_FLOPS is passed to
			
 
				+starpu_task_insert()), <c>starpu_perfmodel_plot</c> can directly draw a GFlops
			
 
				 curve, by simply adding the <c>-f</c> option:
			
 
				 
			
 
				 \verbatim
			
@@ -418,7 +418,7 @@ $ starpu_paje_state_stats native.trace simgrid.trace
 
				 \endverbatim
			
 
				 
			
 
				 An other way to get statistics of StarPU states (without installing R and
			
 
				-pj_dump) is to use the starpu_trace_state_stats.py script which parses the
			
 
				+pj_dump) is to use the <c>starpu_trace_state_stats.py</c> script which parses the
			
 
				 generated trace.rec file instead of the paje.trace file. The output is similar
			
 
				 to the previous script but it doesn't need any dependencies.
			
 
				 
			
@@ -444,7 +444,7 @@ $ python starpu_trace_state_stats.py trace.rec | column -t -s ","
 
				 "chol_model_22"  165	Task	64712.07
			
 
				 \endverbatim
			
 
				 
			
 
				-starpu_trace_state_stats.py can also be used to compute the different
			
 
				+<c>starpu_trace_state_stats.py</c> can also be used to compute the different
			
 
				 efficiencies. Refer to the usage description to show some examples.
			
 
				 
			
 
				 And one can plot histograms of execution times, of several states for instance:
			
--- a/doc/doxygen/chapters/15out_of_core.doxy
+++ b/doc/doxygen/chapters/15out_of_core.doxy
@@ -21,9 +21,9 @@ If the disk backend provides an alloc method, StarPU can then start using it
 
				 to allocate room and store data there with the write method, without user
			
 
				 intervention.
			
 
				 
			
 
				-The user can also use starpu_disk_open to explicitly open an object within the
			
 
				+The user can also use starpu_disk_open() to explicitly open an object within the
			
 
				 disk, e.g. a file name in the stdio or unistd cases, or a database key in the
			
 
				-leveldb case, and then use starpu_*_register functions to turn it into a StarPU
			
 
				+leveldb case, and then use <c>starpu_*_register</c> functions to turn it into a StarPU
			
 
				 data handle. StarPU will then automatically read and write data as appropriate.
			
 
				 
			
 
				 \section UseANewDiskMemory Use a new disk memory
			
@@ -65,7 +65,7 @@ There are various ways to operate a disk memory node, described by the structure
 
				 starpu_disk_ops. For instance, the variable #starpu_disk_unistd_ops
			
 
				 uses read/write functions.
			
 
				 
			
 
				-All structures are in \ref API_Out_Of_Core .
			
 
				+All structures are in \ref API_Out_Of_Core.
			
 
				 
			
 
				 \section ExampleDiskCopy Examples: disk_copy
			
 
				 
			
--- a/doc/doxygen/chapters/16mpi_support.doxy
+++ b/doc/doxygen/chapters/16mpi_support.doxy
@@ -256,13 +256,13 @@ processed in the first step of the next loop.
 
				 </li>
			
 
				 </ol>
			
 
				 
			
 
				-\ref MPIPtpCommunication "Communication" gives the list of all the
			
 
				+\ref MPIPtpCommunication gives the list of all the
			
 
				 point to point communications defined in StarPU-MPI.
			
 
				 
			
 
				 \section ExchangingUserDefinedDataInterface Exchanging User Defined Data Interface
			
 
				 
			
 
				-New data interfaces defined as explained in \ref
			
 
				-DefiningANewDataInterface can also be used within StarPU-MPI and
			
 
				+New data interfaces defined as explained in \ref DefiningANewDataInterface
			
 
				+can also be used within StarPU-MPI and
			
 
				 exchanged between nodes. Two functions needs to be defined through the
			
 
				 type starpu_data_interface_ops. The function
			
 
				 starpu_data_interface_ops::pack_data takes a handle and returns a
			
@@ -374,7 +374,7 @@ exchange the content of the handle. All MPI nodes then process the whole task
 
				 graph, and StarPU automatically determines which node actually execute which
			
 
				 task, and trigger the required MPI transfers.
			
 
				 
			
 
				-The list of functions is described in \ref MPIInsertTask "MPI Insert Task".
			
 
				+The list of functions is described in \ref MPIInsertTask.
			
 
				 
			
 
				 Here an stencil example showing how to use starpu_mpi_task_insert(). One
			
 
				 first needs to define a distribution function which specifies the
			
@@ -607,7 +607,7 @@ latest value on the original home node.
 
				 
			
 
				 \section MPICollective MPI Collective Operations
			
 
				 
			
 
				-The functions are described in \ref MPICollectiveOperations "MPI Collective Operations".
			
 
				+The functions are described in \ref MPICollectiveOperations.
			
 
				 
			
 
				 \code{.c}
			
 
				 if (rank == root)
			
@@ -667,8 +667,8 @@ them!
 
				 
			
 
				 \section MPIDebug Debugging MPI
			
 
				 
			
 
				-Communication trace will be enabled when the environment variable \ref
			
 
				-STARPU_MPI_COMM is set to 1, and StarPU has been configured with the
			
 
				+Communication trace will be enabled when the environment variable
			
 
				+\ref STARPU_MPI_COMM is set to 1, and StarPU has been configured with the
			
 
				 option \ref enable-verbose "--enable-verbose".
			
 
				 
			
 
				 Statistics will be enabled for the communication cache when the
			
--- a/doc/doxygen/chapters/18mic_scc_support.doxy
+++ b/doc/doxygen/chapters/18mic_scc_support.doxy
@@ -1,7 +1,7 @@
 
				 /*
			
 
				  * This file is part of the StarPU Handbook.
			
 
				  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013  CNRS
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
			
 
				  * Copyright (C) 2011, 2012 INRIA
			
 
				  * See the file version.doxy for copying conditions.
			
 
				  */
			
@@ -61,8 +61,8 @@ MIC programs are started from the host. StarPU automatically
 
				 starts the same program on MIC devices. It however needs to get
			
 
				 the MIC-cross-built binary. It will look for the file given by the
			
 
				 environment variable \ref STARPU_MIC_SINK_PROGRAM_NAME or in the
			
 
				-directory given by the environment variable \ref
			
 
				-STARPU_MIC_SINK_PROGRAM_PATH, or in the field
			
 
				+directory given by the environment variable \ref STARPU_MIC_SINK_PROGRAM_PATH,
			
 
				+or in the field
			
 
				 starpu_conf::mic_sink_program_path. It will also look in the current
			
 
				 directory for the same binary name plus the suffix <c>-mic</c> or
			
 
				 <c>_mic</c>.
			
--- a/doc/doxygen/chapters/20socl_opencl_extensions.doxy
+++ b/doc/doxygen/chapters/20socl_opencl_extensions.doxy
@@ -1,7 +1,7 @@
 
				 /*
			
 
				  * This file is part of the StarPU Handbook.
			
 
				  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013  CNRS
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
			
 
				  * Copyright (C) 2011, 2012 INRIA
			
 
				  * See the file version.doxy for copying conditions.
			
 
				  */
			
@@ -75,8 +75,8 @@ Number of platforms:	2
 
				 $
			
 
				 \endverbatim
			
 
				 
			
 
				-To enable the use of CPU cores via OpenCL, one can set the STARPU_OPENCL_ON_CPUS
			
 
				-environment variable to 1 and STARPU_NCPUS to 0 (to avoid using CPUs both via
			
 
				+To enable the use of CPU cores via OpenCL, one can set the \ref STARPU_OPENCL_ON_CPUS
			
 
				+environment variable to 1 and \ref STARPU_NCPUS to 0 (to avoid using CPUs both via
			
 
				 the OpenCL driver and the normal CPU driver).
			
 
				 
			
 
				 */
			
--- a/doc/doxygen/chapters/21simgrid.doxy
+++ b/doc/doxygen/chapters/21simgrid.doxy
@@ -1,7 +1,7 @@
 
				 /*
			
 
				  * This file is part of the StarPU Handbook.
			
 
				  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
			
 
				  * Copyright (C) 2011, 2012 INRIA
			
 
				  * See the file version.doxy for copying conditions.
			
 
				  */
			
@@ -100,10 +100,10 @@ $ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
 
				     matvecmult does not have a perfmodel, or is not calibrated enough
			
 
				 \endverbatim
			
 
				 
			
 
				-The number of devices can be chosen as usual with \ref STARPU_NCPU, \ref
			
 
				-STARPU_NCUDA, and \ref STARPU_NOPENCL, and the amount of GPU memory
			
 
				-with \ref STARPU_LIMIT_CUDA_MEM, \ref STARPU_LIMIT_CUDA_devid_MEM, \ref
			
 
				-STARPU_LIMIT_OPENCL_MEM, and \ref STARPU_LIMIT_OPENCL_devid_MEM.
			
 
				+The number of devices can be chosen as usual with \ref STARPU_NCPU,
			
 
				+\ref STARPU_NCUDA, and \ref STARPU_NOPENCL, and the amount of GPU memory
			
 
				+with \ref STARPU_LIMIT_CUDA_MEM, \ref STARPU_LIMIT_CUDA_devid_MEM,
			
 
				+\ref STARPU_LIMIT_OPENCL_MEM, and \ref STARPU_LIMIT_OPENCL_devid_MEM.
			
 
				 
			
 
				 \section SimulationOnAnotherMachine Simulation On Another Machine
			
 
				 
			
@@ -139,13 +139,13 @@ be extended as well), change the available GPU memory size, PCI memory bandwidth
 
				 
			
 
				 The simulation can be tweaked, to be able to tune it between a very accurate
			
 
				 simulation and a very simple simulation (which is thus close to scheduling
			
 
				-theory results), see the \ref STARPU_SIMGRID_CUDA_MALLOC_COST and \ref
			
 
				-STARPU_SIMGRID_CUDA_QUEUE_COST environment variables.
			
 
				+theory results), see the \ref STARPU_SIMGRID_CUDA_MALLOC_COST and
			
 
				+\ref STARPU_SIMGRID_CUDA_QUEUE_COST environment variables.
			
 
				 
			
 
				 \section SimulationMPIApplications MPI applications
			
 
				 
			
 
				 StarPU-MPI applications can also be run in simgrid mode. It needs to be compiled
			
 
				-with smpicc, and run using the starpu_smpirun script, for instance:
			
 
				+with smpicc, and run using the <c>starpu_smpirun</c> script, for instance:
			
 
				 
			
 
				 \verbatim
			
 
				 $ STARPU_SCHED=dmda starpu_smpirun -platform cluster.xml -hostfile hostfile ./mpi/tests/pingpong
			
@@ -178,7 +178,7 @@ to starpu data registration functions, instead of allocating data. This will
 
				 however require the application to take care of not trying to access the data,
			
 
				 and will not work in MPI mode, which performs transfers.
			
 
				 
			
 
				-Another way is to pass the STARPU_MALLOC_SIMULATION_FOLDED flag to the
			
 
				+Another way is to pass the \ref STARPU_MALLOC_SIMULATION_FOLDED flag to the
			
 
				 starpu_malloc_flags() function. This will make it allocate a memory area which
			
 
				 one can read/write, but optimized so that this does not actually consume
			
 
				 memory. Of course, the values read from such area will be bogus, but this allows
			
@@ -196,8 +196,8 @@ Note however that this folding is done by remapping the same file several times,
 
				 and Linux kernels will also refuse to create too many memory areas. <c>sysctl
			
 
				 vm.max_map_count</c> can be used to check and change the default (65535). By
			
 
				 default, StarPU uses a 1MiB file, so it hopefully fits in the CPU cache. This
			
 
				-however limits the amount of such folded memory to a bit below 64GiB. The \ref
			
 
				-STARPU_MALLOC_SIMULATION_FOLD environment variable can be used to increase the
			
 
				+however limits the amount of such folded memory to a bit below 64GiB. The
			
 
				+\ref STARPU_MALLOC_SIMULATION_FOLD environment variable can be used to increase the
			
 
				 size of the file.
			
 
				 
			
 
				 */
			
--- a/doc/doxygen/chapters/23clustering_a_machine.doxy
+++ b/doc/doxygen/chapters/23clustering_a_machine.doxy
@@ -1,7 +1,7 @@
 
				 /*
			
 
				  * This file is part of the StarPU Handbook.
			
 
				  * Copyright (C) 2015 Universit@'e de Bordeaux
			
 
				- * Copyright (C) 2015 CNRS
			
 
				+ * Copyright (C) 2015, 2016 CNRS
			
 
				  * Copyright (C) 2015 INRIA
			
 
				  * See the file version.doxy for copying conditions.
			
 
				  */
			
@@ -46,7 +46,7 @@ object, of the type <c>hwloc_obj_type_t</c>. More can be found in the
 
				 documentation</a>.
			
 
				 
			
 
				 Once a cluster is created, the full machine is represented with an opaque
			
 
				-structure named <c>starpu_cluster_machine</c>. This can be printed to show the
			
 
				+structure starpu_cluster_machine. This can be printed to show the
			
 
				 current machine state.
			
 
				 
			
 
				 \code{.c}
			
@@ -203,7 +203,8 @@ Note that the OpenMP mode is the default one both for clusters and
 
				 contexts. The result of a cluster creation is a woken up master worker
			
 
				 and sleeping "slaves" which allow the master to run tasks on their
			
 
				 resources. To create a cluster with woken up workers one can use the
			
 
				-flag <c>STARPU_SCHED_CTX_AWAKE_WORKERS</c> with the scheduling context
			
 
				-API and <c>STARPU_CLUSTER_AWAKE_WORKERS</c> with the cluster API as
			
 
				+flag \ref STARPU_SCHED_CTX_AWAKE_WORKERS with the scheduling context
			
 
				+API and \ref STARPU_CLUSTER_AWAKE_WORKERS with the cluster API as
			
 
				 parameter to the creation function.
			
 
				+
			
 
				 */
			
--- a/doc/doxygen/chapters/40environment_variables.doxy
+++ b/doc/doxygen/chapters/40environment_variables.doxy
@@ -85,9 +85,8 @@ execution of all tasks.
 
				 \anchor STARPU_OPENCL_ON_CPUS
			
 
				 \addindex __env__STARPU_OPENCL_ON_CPUS
			
 
				 By default, the OpenCL driver only enables GPU and accelerator
			
 
				-devices. By setting the environment variable \ref
			
 
				-STARPU_OPENCL_ON_CPUS to 1, the OpenCL driver will also enable CPU
			
 
				-devices.
			
 
				+devices. By setting the environment variable \ref STARPU_OPENCL_ON_CPUS
			
 
				+to 1, the OpenCL driver will also enable CPU devices.
			
 
				 </dd>
			
 
				 
			
 
				 <dt>STARPU_OPENCL_ONLY_ON_CPUS</dt>
			
@@ -95,9 +94,8 @@ devices.
 
				 \anchor STARPU_OPENCL_ONLY_ON_CPUS
			
 
				 \addindex __env__STARPU_OPENCL_ONLY_ON_CPUS
			
 
				 By default, the OpenCL driver enables GPU and accelerator
			
 
				-devices. By setting the environment variable \ref
			
 
				-STARPU_OPENCL_ONLY_ON_CPUS to 1, the OpenCL driver will ONLY enable
			
 
				-CPU devices.
			
 
				+devices. By setting the environment variable \ref STARPU_OPENCL_ONLY_ON_CPUS
			
 
				+to 1, the OpenCL driver will ONLY enable CPU devices.
			
 
				 </dd>
			
 
				 
			
 
				 <dt>STARPU_NMIC</dt>
			
@@ -153,8 +151,8 @@ and <c>STARPU_WORKERS_CPUID = "0 2 1 3"</c>, the CUDA device will be controlled
 
				 by logical CPU #0, the OpenCL device will be controlled by logical CPU #2, and
			
 
				 the logical CPUs #1 and #3 will be used by the CPU workers.
			
 
				 
			
 
				-If the number of workers is larger than the array given in \ref
			
 
				-STARPU_WORKERS_CPUID, the workers are bound to the logical CPUs in a
			
 
				+If the number of workers is larger than the array given in
			
 
				+\ref STARPU_WORKERS_CPUID, the workers are bound to the logical CPUs in a
			
 
				 round-robin fashion: if <c>STARPU_WORKERS_CPUID = "0 1"</c>, the first
			
 
				 and the third (resp. second and fourth) workers will be put on CPU #0
			
 
				 (resp. CPU #1).
			
@@ -334,6 +332,13 @@ todo
 
				 todo
			
 
				 </dd>
			
 
				 
			
 
				+<dt>STARPU_MIC_PROGRAM_PATH</dt>
			
 
				+<dd>
			
 
				+\anchor STARPU_MIC_PROGRAM_PATH
			
 
				+\addindex __env__STARPU_MIC_PROGRAM_PATH
			
 
				+todo
			
 
				+</dd>
			
 
				+
			
 
				 </dl>
			
 
				 
			
 
				 \section ConfiguringTheSchedulingEngine Configuring The Scheduling Engine
			
@@ -452,8 +457,8 @@ Enable on-line performance monitoring (\ref EnablingOn-linePerformanceMonitoring
 
				 <dd>
			
 
				 \anchor SOCL_OCL_LIB_OPENCL
			
 
				 \addindex __env__SOCL_OCL_LIB_OPENCL
			
 
				-THE SOCL test suite is only run when the environment variable \ref
			
 
				-SOCL_OCL_LIB_OPENCL is defined. It should contain the location
			
 
				+The SOCL test suite is only run when the environment variable
			
 
				+\ref SOCL_OCL_LIB_OPENCL is defined. It should contain the location
			
 
				 of the file <c>libOpenCL.so</c> of the OCL ICD implementation.
			
 
				 </dd>
			
 
				 
			
@@ -522,6 +527,21 @@ When set to 1 (which is the default), CUDA task and transfer queueing costs are
 
				 taken into account in simgrid mode.
			
 
				 </dd>
			
 
				 
			
 
				+<dt>STARPU_PCI_FLAT</dt>
			
 
				+<dd>
			
 
				+\anchor STARPU_PCI_FLAT
			
 
				+\addindex __env__STARPU_PCI_FLAT
			
 
				+When unset or set to 0, the platform file created for simgrid will
			
 
				+contain PCI bandwidths and routes.
			
 
				+</dd>
			
 
				+
			
 
				+<dt>STARPU_SIMGRID_QUEUE_MALLOC_COST</dt>
			
 
				+<dd>
			
 
				+\anchor STARPU_SIMGRID_QUEUE_MALLOC_COST
			
 
				+\addindex __env__STARPU_SIMGRID_QUEUE_MALLOC_COST
			
 
				+When unset or set to 1, simulate within simgrid the GPU transfer queueing.
			
 
				+</dd>
			
 
				+
			
 
				 <dt>STARPU_MALLOC_SIMULATION_FOLD</dt>
			
 
				 <dd>
			
 
				 \anchor STARPU_MALLOC_SIMULATION_FOLD
			
@@ -546,6 +566,15 @@ configuration files. The default is <c>$HOME</c> on Unix environments,
 
				 and <c>$USERPROFILE</c> on Windows environments.
			
 
				 </dd>
			
 
				 
			
 
				+<dt>STARPU_PATH</dt>
			
 
				+<dd>
			
 
				+\anchor STARPU_PATH
			
 
				+\addindex __env__STARPU_PATH
			
 
				+Only used on Windows environments.
			
 
				+This specifies the main directory in which StarPU is installed
			
 
				+(\ref RunningABasicStarPUApplicationOnMicrosoft).
			
 
				+</dd>
			
 
				+
			
 
				 <dt>STARPU_PERF_MODEL_DIR</dt>
			
 
				 <dd>
			
 
				 \anchor STARPU_PERF_MODEL_DIR
			
@@ -731,8 +760,8 @@ Setting it enables allocation cache buffer reuse in main memory.
 
				 \addindex __env__STARPU_LIMIT_MIN_SUBMITTED_TASKS    
			
 
				 This variable allows the user to control the task submission flow by specifying
			
 
				 to StarPU a submitted task threshold to wait before unblocking task submission. This
			
 
				-variable has to be used in conjunction with \ref
			
 
				-STARPU_LIMIT_MAX_SUBMITTED_TASKS which puts the task submission thread to
			
 
				+variable has to be used in conjunction with \ref STARPU_LIMIT_MAX_SUBMITTED_TASKS
			
 
				+which puts the task submission thread to
			
 
				 sleep.  Setting it enables allocation cache buffer reuse in main memory.
			
 
				 </dd>
			
 
				 
			
@@ -809,8 +838,8 @@ end of the execution of an application (\ref DataStatistics).
 
				 When set to a value other than 0, allows to make StarPU print an error
			
 
				 message whenever StarPU does not terminate any task for the given time (in µs),
			
 
				 but lets the application continue normally. Should
			
 
				-be used in combination with \ref STARPU_WATCHDOG_CRASH (see \ref
			
 
				-DetectionStuckConditions).
			
 
				+be used in combination with \ref STARPU_WATCHDOG_CRASH
			
 
				+(see \ref DetectionStuckConditions).
			
 
				 </dd>
			
 
				 
			
 
				 <dt>STARPU_WATCHDOG_CRASH</dt>
			
@@ -880,6 +909,24 @@ the current time() (unless simgrid mode is enabled, in which case it is always
 
				 0). STARPU_RAND_SEED allows to set the seed to a specific value.
			
 
				 </dd>
			
 
				 
			
 
				+<dt>STARPU_IDLE_TIME</dt>
			
 
				+<dd>
			
 
				+\anchor STARPU_IDLE_TIME
			
 
				+\addindex __env__STARPU_IDLE_TIME
			
 
				+When set to a value being a valid filename, a corresponding file
			
 
				+will be created when shutting down StarPU. The file will contain the
			
 
				+sum of all the workers' idle time.
			
 
				+</dd>
			
 
				+
			
 
				+<dt>STARPU_GLOBAL_ARBITER</dt>
			
 
				+<dd>
			
 
				+\anchor STARPU_GLOBAL_ARBITER
			
 
				+\addindex __env__STARPU_GLOBAL_ARBITER
			
 
				+When set to a positive value, StarPU will create an arbiter, which
			
 
				+implements an advanced but centralized management of concurrent data
			
 
				+accesses, see \ref ConcurrentDataAccess for the details.
			
 
				+</dd>
			
 
				+
			
 
				 </dl>
			
 
				 
			
 
				 \section ConfiguringTheHypervisor Configuring The Hypervisor
			
--- a/doc/doxygen/chapters/41configure_options.doxy
+++ b/doc/doxygen/chapters/41configure_options.doxy
@@ -1,7 +1,7 @@
 
				 /*
			
 
				  * This file is part of the StarPU Handbook.
			
 
				  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  CNRS
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
			
 
				  * Copyright (C) 2011, 2012 INRIA
			
 
				  * See the file version.doxy for copying conditions.
			
 
				  */
			
@@ -553,7 +553,7 @@ Disable the build of tests.
 
				 <dd>
			
 
				 \anchor enable-sc-hypervisor
			
 
				 \addindex __configure__--enable-sc-hypervisor
			
 
				-Enable the Scheduling Context Hypervisor plugin(\ref SchedulingContextHypervisor).
			
 
				+Enable the Scheduling Context Hypervisor plugin (\ref SchedulingContextHypervisor).
			
 
				 By default, it is disabled.
			
 
				 </dd>
			
 
				 
			
--- a/doc/doxygen/chapters/45files.doxy
+++ b/doc/doxygen/chapters/45files.doxy
@@ -1,7 +1,7 @@
 
				 /*
			
 
				  * This file is part of the StarPU Handbook.
			
 
				  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
			
 
				  * Copyright (C) 2011, 2012 INRIA
			
 
				  * See the file version.doxy for copying conditions.
			
 
				 */
			
@@ -13,6 +13,7 @@
 
				 \file starpu.h
			
 
				 \file starpu_bitmap.h
			
 
				 \file starpu_bound.h
			
 
				+\file starpu_clusters_util.h
			
 
				 \file starpu_cublas.h
			
 
				 \file starpu_cuda.h
			
 
				 \file starpu_data_filters.h
			
@@ -25,15 +26,18 @@
 
				 \file starpu_fxt.h
			
 
				 \file starpu_hash.h
			
 
				 \file starpu_mic.h
			
 
				+\file starpu_mod.f90
			
 
				 \file starpu_opencl.h
			
 
				 \file starpu_openmp.h
			
 
				 \file starpu_perfmodel.h
			
 
				 \file starpu_profiling.h
			
 
				 \file starpu_rand.h
			
 
				 \file starpu_scc.h
			
 
				+\file starpu_sched_component.h
			
 
				 \file starpu_sched_ctx.h
			
 
				 \file starpu_sched_ctx_hypervisor.h
			
 
				 \file starpu_scheduler.h
			
 
				+\file starpu_simgrid_wrap.h
			
 
				 \file starpu_sink.h
			
 
				 \file starpu_stdlib.h
			
 
				 \file starpu_task_bundle.h
			
--- a/doc/doxygen/chapters/api/codelet_and_tasks.doxy
+++ b/doc/doxygen/chapters/api/codelet_and_tasks.doxy
@@ -312,8 +312,8 @@ Is an array of ::starpu_data_access_mode. It describes the required
 
				 access modes to the data needed by the codelet (e.g. ::STARPU_RW).
			
 
				 The number of entries in this array must be specified in the field
			
 
				 starpu_codelet::nbuffers. This field should be used for codelets having a
			
 
				-number of datas greater than \ref STARPU_NMAXBUFS (see \ref
			
 
				-SettingManyDataHandlesForATask). When defining a codelet, one
			
 
				+number of datas greater than \ref STARPU_NMAXBUFS
			
 
				+(see \ref SettingManyDataHandlesForATask). When defining a codelet, one
			
 
				 should either define this field or the field starpu_codelet::modes defined above.
			
 
				 
			
 
				 \var unsigned starpu_codelet::specific_nodes
			
@@ -334,8 +334,8 @@ Optional field. When starpu_codelet::specific_nodes is 1, this specifies
 
				 the memory nodes where each data should be sent to for task execution.
			
 
				 The number of entries in this array is starpu_codelet::nbuffers.
			
 
				 This field should be used for codelets having a
			
 
				-number of datas greater than \ref STARPU_NMAXBUFS (see \ref
			
 
				-SettingManyDataHandlesForATask). When defining a codelet, one
			
 
				+number of datas greater than \ref STARPU_NMAXBUFS
			
 
				+(see \ref SettingManyDataHandlesForATask). When defining a codelet, one
			
 
				 should either define this field or the field starpu_codelet::nodes defined
			
 
				 above.
			
 
				 
			
@@ -367,8 +367,8 @@ Various flags for the codelet.
 
				 \fn void starpu_codelet_init(struct starpu_codelet *cl)
			
 
				 \ingroup API_Codelet_And_Tasks
			
 
				 Initialize \p cl with default values. Codelets should
			
 
				-preferably be initialized statically as shown in \ref
			
 
				-DefiningACodelet. However such a initialisation is not always
			
 
				+preferably be initialized statically as shown in
			
 
				+\ref DefiningACodelet. However such an initialisation is not always
			
 
				 possible, e.g. when using C++.
			
 
				 
			
 
				 \struct starpu_data_descr
			
@@ -448,8 +448,8 @@ It is an array of ::starpu_data_access_mode. It describes the required
 
				 access modes to the data needed by the codelet (e.g. ::STARPU_RW).
			
 
				 The number of entries in this array must be specified in the field
			
 
				 starpu_codelet::nbuffers. This field should be used for codelets having a
			
 
				-number of datas greater than \ref STARPU_NMAXBUFS (see \ref
			
 
				-SettingManyDataHandlesForATask). When defining a codelet, one
			
 
				+number of datas greater than \ref STARPU_NMAXBUFS
			
 
				+(see \ref SettingManyDataHandlesForATask). When defining a codelet, one
			
 
				 should either define this field or the field starpu_task::modes defined above.
			
 
				 
			
 
				 \var void *starpu_task::cl_arg
			
@@ -686,8 +686,8 @@ starpu_task::nbuffers if the former is STARPU_VARIABLE_NBUFFERS.
 
				 Return the \p i th data handle of the given task. If the task
			
 
				 is defined with a static or dynamic number of handles, will either
			
 
				 return the \p i th element of the field starpu_task::handles or the \p
			
 
				-i th element of the field starpu_task::dyn_handles (see \ref
			
 
				-SettingManyDataHandlesForATask)
			
 
				+i th element of the field starpu_task::dyn_handles
			
 
				+(see \ref SettingManyDataHandlesForATask)
			
 
				 
			
 
				 \def STARPU_TASK_SET_HANDLE(task, handle, i)
			
 
				 \ingroup API_Codelet_And_Tasks
			
@@ -695,8 +695,8 @@ Set the \p i th data handle of the given task with the given
 
				 data handle. If the task is defined with a static or dynamic number of
			
 
				 handles, will either set the \p i th element of the field
			
 
				 starpu_task::handles or the \p i th element of the field
			
 
				-starpu_task::dyn_handles (see \ref
			
 
				-SettingManyDataHandlesForATask)
			
 
				+starpu_task::dyn_handles
			
 
				+(see \ref SettingManyDataHandlesForATask)
			
 
				 
			
 
				 \def STARPU_CODELET_GET_MODE(codelet, i)
			
 
				 \ingroup API_Codelet_And_Tasks
			
@@ -704,8 +704,8 @@ Return the access mode of the \p i th data handle of the given
 
				 codelet. If the codelet is defined with a static or dynamic number of
			
 
				 handles, will either return the \p i th element of the field
			
 
				 starpu_codelet::modes or the \p i th element of the field
			
 
				-starpu_codelet::dyn_modes (see \ref
			
 
				-SettingManyDataHandlesForATask)
			
 
				+starpu_codelet::dyn_modes
			
 
				+(see \ref SettingManyDataHandlesForATask)
			
 
				 
			
 
				 \def STARPU_CODELET_SET_MODE(codelet, mode, i)
			
 
				 \ingroup API_Codelet_And_Tasks
			
@@ -713,8 +713,8 @@ Set the access mode of the \p i th data handle of the given
 
				 codelet. If the codelet is defined with a static or dynamic number of
			
 
				 handles, will either set the \p i th element of the field
			
 
				 starpu_codelet::modes or the \p i th element of the field
			
 
				-starpu_codelet::dyn_modes (see \ref
			
 
				-SettingManyDataHandlesForATask)
			
 
				+starpu_codelet::dyn_modes
			
 
				+(see \ref SettingManyDataHandlesForATask)
			
 
				 
			
 
				 \def STARPU_TASK_GET_MODE(task, i)
			
 
				 \ingroup API_Codelet_And_Tasks
			
@@ -722,8 +722,8 @@ Return the access mode of the \p i th data handle of the given
 
				 task. If the task is defined with a static or dynamic number of
			
 
				 handles, will either return the \p i th element of the field
			
 
				 starpu_task::modes or the \p i th element of the field
			
 
				-starpu_task::dyn_modes (see \ref
			
 
				-SettingManyDataHandlesForATask)
			
 
				+starpu_task::dyn_modes
			
 
				+(see \ref SettingManyDataHandlesForATask)
			
 
				 
			
 
				 \def STARPU_TASK_SET_MODE(task, mode, i)
			
 
				 \ingroup API_Codelet_And_Tasks
			
@@ -731,8 +731,8 @@ Set the access mode of the \p i th data handle of the given
 
				 task. If the task is defined with a static or dynamic number of
			
 
				 handles, will either set the \p i th element of the field
			
 
				 starpu_task::modes or the \p i th element of the field
			
 
				-starpu_task::dyn_modes (see \ref
			
 
				-SettingManyDataHandlesForATask)
			
 
				+starpu_task::dyn_modes
			
 
				+(see \ref SettingManyDataHandlesForATask)
			
 
				 
			
 
				 \fn struct starpu_task *starpu_task_create(void)
			
 
				 \ingroup API_Codelet_And_Tasks
			
--- a/doc/doxygen/chapters/api/data_interfaces.doxy
+++ b/doc/doxygen/chapters/api/data_interfaces.doxy
@@ -1,7 +1,7 @@
 
				 /*
			
 
				  * This file is part of the StarPU Handbook.
			
 
				  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
			
 
				  * Copyright (C) 2011, 2012 INRIA
			
 
				  * See the file version.doxy for copying conditions.
			
 
				  */
			
@@ -353,8 +353,8 @@ Return the interface associated with \p handle on \p memory_node.
 
				 
			
 
				 Each data interface is provided with a set of field access functions.
			
 
				 The ones using a void * parameter are meant to be used in codelet
			
 
				-implementations (see for example the code in \ref
			
 
				-VectorScalingUsingStarPUAPI).
			
 
				+implementations (see for example the code in
			
 
				+\ref VectorScalingUsingStarPUAPI).
			
 
				 
			
 
				 \fn void *starpu_data_handle_to_pointer(starpu_data_handle_t handle, unsigned node)
			
 
				 \ingroup API_Data_Interfaces
			
@@ -1012,8 +1012,8 @@ designated by \p interface.
 
				 @name Defining Interface
			
 
				 \ingroup API_Data_Interfaces
			
 
				 
			
 
				-Applications can provide their own interface as shown in \ref
			
 
				-DefiningANewDataInterface.
			
 
				+Applications can provide their own interface as shown in
			
 
				+\ref DefiningANewDataInterface.
			
 
				 
			
 
				 \fn uintptr_t starpu_malloc_on_node_flags(unsigned dst_node, size_t size, int flags)
			
 
				 \ingroup API_Data_Interfaces
			
--- a/doc/doxygen/chapters/api/data_management.doxy
+++ b/doc/doxygen/chapters/api/data_management.doxy
@@ -1,7 +1,7 @@
 
				 /*
			
 
				  * This file is part of the StarPU Handbook.
			
 
				  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  CNRS
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
			
 
				  * Copyright (C) 2011, 2012 INRIA
			
 
				  * See the file version.doxy for copying conditions.
			
 
				  */
			
@@ -9,8 +9,8 @@
 
				 /*! \defgroup API_Data_Management Data Management
			
 
				 
			
 
				 \brief This section describes the data management facilities provided
			
 
				-by StarPU. We show how to use existing data interfaces in \ref
			
 
				-API_Data_Interfaces, but developers can design their own data interfaces if
			
 
				+by StarPU. We show how to use existing data interfaces in
			
 
				+\ref API_Data_Interfaces, but developers can design their own data interfaces if
			
 
				 required.
			
 
				 
			
 
				 \typedef starpu_data_handle_t
			
--- a/doc/doxygen/chapters/api/initialization.doxy
+++ b/doc/doxygen/chapters/api/initialization.doxy
@@ -1,7 +1,7 @@
 
				 /*
			
 
				  * This file is part of the StarPU Handbook.
			
 
				  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
			
 
				  * Copyright (C) 2011, 2012 INRIA
			
 
				  * See the file version.doxy for copying conditions.
			
 
				  */
			
@@ -69,8 +69,8 @@ provided by the hwloc library in case it is available.
 
				 If this flag is set, the CUDA workers will be attached to the CUDA
			
 
				 devices specified in the starpu_conf::workers_cuda_gpuid array.
			
 
				 Otherwise, StarPU assigns the CUDA devices in a round-robin fashion.
			
 
				-This can also be specified with the environment variable \ref
			
 
				-STARPU_WORKERS_CUDAID. (default = 0)
			
 
				+This can also be specified with the environment variable
			
 
				+\ref STARPU_WORKERS_CUDAID. (default = 0)
			
 
				 \var unsigned starpu_conf::workers_cuda_gpuid[STARPU_NMAXWORKERS]
			
 
				 If the starpu_conf::use_explicit_workers_cuda_gpuid flag is set, this
			
 
				 array contains the logical identifiers of the CUDA devices (as used by
			
@@ -79,8 +79,8 @@ cudaGetDevice()).
 
				 If this flag is set, the OpenCL workers will be attached to the OpenCL
			
 
				 devices specified in the starpu_conf::workers_opencl_gpuid array.
			
 
				 Otherwise, StarPU assigns the OpenCL devices in a round-robin fashion.
			
 
				-This can also be specified with the environment variable \ref
			
 
				-STARPU_WORKERS_OPENCLID. (default = 0)
			
 
				+This can also be specified with the environment variable
			
 
				+\ref STARPU_WORKERS_OPENCLID. (default = 0)
			
 
				 \var unsigned starpu_conf::workers_opencl_gpuid[STARPU_NMAXWORKERS]
			
 
				 If the starpu_conf::use_explicit_workers_opencl_gpuid flag is set,
			
 
				 this array contains the logical identifiers of the OpenCL devices to
			
@@ -89,8 +89,8 @@ be used.
 
				 If this flag is set, the MIC workers will be attached to the MIC
			
 
				 devices specified in the array starpu_conf::workers_mic_deviceid.
			
 
				 Otherwise, StarPU assigns the MIC devices in a round-robin fashion.
			
 
				-This can also be specified with the environment variable \ref
			
 
				-STARPU_WORKERS_MICID.
			
 
				+This can also be specified with the environment variable
			
 
				+\ref STARPU_WORKERS_MICID.
			
 
				 (default = 0)
			
 
				 \var unsigned starpu_conf::workers_mic_deviceid[STARPU_NMAXWORKERS]
			
 
				 If the flag starpu_conf::use_explicit_workers_mic_deviceid is set, the
			
@@ -103,8 +103,8 @@ devices specified in the array starpu_conf::workers_scc_deviceid.
 
				 If the flag starpu_conf::use_explicit_workers_scc_deviceid is set, the
			
 
				 array contains the logical identifiers of the SCC devices to be used.
			
 
				 Otherwise, StarPU assigns the SCC devices in a round-robin fashion.
			
 
				-This can also be specified with the environment variable \ref
			
 
				-STARPU_WORKERS_SCCID.
			
 
				+This can also be specified with the environment variable
			
 
				+\ref STARPU_WORKERS_SCCID.
			
 
				 
			
 
				 \var int starpu_conf::bus_calibrate
			
 
				 If this flag is set, StarPU will recalibrate the bus.  If this value
			
@@ -141,8 +141,8 @@ host program location.
 
				 \var int starpu_conf::disable_asynchronous_copy
			
 
				 This flag should be set to 1 to disable
			
 
				 asynchronous copies between CPUs and all accelerators. This
			
 
				-can also be specified with the environment variable \ref
			
 
				-STARPU_DISABLE_ASYNCHRONOUS_COPY. The
			
 
				+can also be specified with the environment variable
			
 
				+\ref STARPU_DISABLE_ASYNCHRONOUS_COPY. The
			
 
				 AMD implementation of OpenCL is known to fail when copying
			
 
				 data asynchronously. When using this implementation, it is
			
 
				 therefore necessary to disable asynchronous data transfers.
			
--- a/doc/doxygen/chapters/api/sc_hypervisor/sc_hypervisor_usage.doxy
+++ b/doc/doxygen/chapters/api/sc_hypervisor/sc_hypervisor_usage.doxy
@@ -1,7 +1,7 @@
 
				 /*
			
 
				  * This file is part of the StarPU Handbook.
			
 
				  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
			
 
				  * Copyright (C) 2011, 2012, 2013 INRIA
			
 
				  * See the file version.doxy for copying conditions.
			
 
				  */
			
@@ -152,8 +152,8 @@ workers are not allowed to be moved from the context.
 
				 This macro is used when calling sc_hypervisor_ctl() and must be
			
 
				 followed by 1 argument (int) that indicates the minimum number of
			
 
				 tasks that have to be executed before the context could be resized.
			
 
				-This parameter is ignored for the Application Driven strategy (see \ref 
			
 
				-ResizingStrategies) where the user indicates exactly when the resize
			
 
				+This parameter is ignored for the Application Driven strategy (see
			
 
				+\ref ResizingStrategies) where the user indicates exactly when the resize
			
 
				 should be done.
			
 
				 
			
 
				 \def SC_HYPERVISOR_NEW_WORKERS_MAX_IDLE
			
--- a/doc/doxygen/chapters/api/scheduling_contexts.doxy
+++ b/doc/doxygen/chapters/api/scheduling_contexts.doxy
@@ -178,7 +178,7 @@ Return 1 if the worker belongs to the context and 0 otherwise
 
				 \ingroup API_Scheduling_Contexts
			
 
				 Return the workerid if the worker belongs to the context and -1 otherwise.
			
 
				 If the thread calling this function is not a worker the function returns -1
			
 
				-as it calls the function \ref starpu_worker_get_id()
			
 
				+as it calls the function starpu_worker_get_id().
			
 
				 
			
 
				 \fn unsigned starpu_sched_ctx_overlapping_ctxs_on_worker(int workerid)
			
 
				 \ingroup API_Scheduling_Contexts
			
--- a/doc/doxygen/chapters/api/standard_memory_library.doxy
+++ b/doc/doxygen/chapters/api/standard_memory_library.doxy
@@ -1,7 +1,7 @@
 
				 /*
			
 
				  * This file is part of the StarPU Handbook.
			
 
				  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013, 2015  CNRS
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016  CNRS
			
 
				  * Copyright (C) 2011, 2012 INRIA
			
 
				  * See the file version.doxy for copying conditions.
			
 
				  */
			
@@ -99,20 +99,20 @@ starpu_memory_pin(). Returns 0 on success, -1 on error.
 
				 
			
 
				 \fn ssize_t starpu_memory_get_total(unsigned node)
			
 
				 \ingroup API_Standard_Memory_Library
			
 
				-If a memory limit is defined on the given node (see Section \ref
			
 
				-HowToLimitMemoryPerNode), return the amount of total memory
			
 
				+If a memory limit is defined on the given node (see Section
			
 
				+\ref HowToLimitMemoryPerNode), return the amount of total memory
			
 
				 on the node. Otherwise return -1.
			
 
				 
			
 
				 \fn ssize_t starpu_memory_get_available(unsigned node)
			
 
				 \ingroup API_Standard_Memory_Library
			
 
				-If a memory limit is defined on the given node (see Section \ref
			
 
				-HowToLimitMemoryPerNode), return the amount of available memory
			
 
				+If a memory limit is defined on the given node (see Section
			
 
				+\ref HowToLimitMemoryPerNode), return the amount of available memory
			
 
				 on the node. Otherwise return -1.
			
 
				 
			
 
				 \fn int starpu_memory_allocate(unsigned node, size_t size, int flags)
			
 
				 \ingroup API_Standard_Memory_Library
			
 
				-If a memory limit is defined on the given node (see Section \ref
			
 
				-HowToLimitMemoryPerNode), try to allocate some of it. This does not actually
			
 
				+If a memory limit is defined on the given node (see Section
			
 
				+\ref HowToLimitMemoryPerNode), try to allocate some of it. This does not actually
			
 
				 allocate memory, but only accounts for it. This can be useful when the
			
 
				 application allocates data another way, but wants StarPU to be aware of the
			
 
				 allocation size e.g. for memory reclaiming.
			
@@ -122,8 +122,8 @@ STARPU_MEMORY_OVERFLOW to change this.
 
				 
			
 
				 \fn void starpu_memory_deallocate(unsigned node, size_t size)
			
 
				 \ingroup API_Standard_Memory_Library
			
 
				-If a memory limit is defined on the given node (see Section \ref
			
 
				-HowToLimitMemoryPerNode), free some of it. This does not actually free memory,
			
 
				+If a memory limit is defined on the given node (see Section
			
 
				+\ref HowToLimitMemoryPerNode), free some of it. This does not actually free memory,
			
 
				 but only accounts for it, like starpu_memory_allocate(). The amount does not
			
 
				 have to be exactly the same as what was passed to starpu_memory_allocate(),
			
 
				 only the eventual amount needs to be the same, i.e. one call to
			
--- a/doc/doxygen/chapters/api/threads.doxy
+++ b/doc/doxygen/chapters/api/threads.doxy
@@ -1,7 +1,7 @@
 
				 /*
			
 
				  * This file is part of the StarPU Handbook.
			
 
				  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
			
 
				  * Copyright (C) 2011, 2012 INRIA
			
 
				  * See the file version.doxy for copying conditions.
			
 
				  */
			
@@ -193,8 +193,8 @@ function returns immediately. If the mutex is already locked by
 
				 another thread, the function suspends the calling thread until the
			
 
				 mutex is unlocked.
			
 
				 
			
 
				-This function also produces trace when the configure option \ref
			
 
				-enable-fxt-lock "--enable-fxt-lock" is enabled.
			
 
				+This function also produces trace when the configure option
			
 
				+\ref enable-fxt-lock "--enable-fxt-lock" is enabled.
			
 
				 
			
 
				 \fn int starpu_pthread_mutex_unlock(starpu_pthread_mutex_t *mutex)
			
 
				 \ingroup API_Threads
			
@@ -202,8 +202,8 @@ This function unlocks the given mutex. The mutex is assumed to be
 
				 locked and owned by the calling thread on entrance to
			
 
				 starpu_pthread_mutex_unlock().
			
 
				 
			
 
				-This function also produces trace when the configure option \ref
			
 
				-enable-fxt-lock "--enable-fxt-lock" is enabled.
			
 
				+This function also produces trace when the configure option
			
 
				+\ref enable-fxt-lock "--enable-fxt-lock" is enabled.
			
 
				 
			
 
				 \fn int starpu_pthread_mutex_trylock(starpu_pthread_mutex_t *mutex)
			
 
				 \ingroup API_Threads
			
@@ -213,8 +213,8 @@ already locked by another thread (or by the calling thread in the case
 
				 of a ``fast''  mutex). Instead, the function returns immediately with
			
 
				 the error code EBUSY.
			
 
				 
			
 
				-This function also produces trace when the configure option \ref
			
 
				-enable-fxt-lock "--enable-fxt-lock" is enabled.
			
 
				+This function also produces trace when the configure option
			
 
				+\ref enable-fxt-lock "--enable-fxt-lock" is enabled.
			
 
				 
			
 
				 \typedef STARPU_PTHREAD_MUTEX_INITIALIZER
			
 
				 \ingroup API_Threads
			
@@ -290,8 +290,8 @@ be locked by the calling thread on entrance to
 
				 starpu_pthread_cond_wait(). Before returning to the calling thread, the
			
 
				 function re-acquires mutex (as per starpu_pthread_mutex_lock()).
			
 
				 
			
 
				-This function also produces trace when the configure option \ref
			
 
				-enable-fxt-lock "--enable-fxt-lock" is enabled.
			
 
				+This function also produces trace when the configure option
			
 
				+\ref enable-fxt-lock "--enable-fxt-lock" is enabled.
			
 
				 
			
 
				 \fn starpu_pthread_cond_timedwait(starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex, const struct timespec *abstime)
			
 
				 \ingroup API_Threads
			
--- a/doc/doxygen/doxygen-config.cfg.in
+++ b/doc/doxygen/doxygen-config.cfg.in
@@ -1,7 +1,7 @@
 
				 # StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				 #
			
 
				 # Copyright (C) 2009-2013  Université de Bordeaux
			
 
				-# Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
			
 
				+# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
			
 
				 # Copyright (C) 2011  Télécom-SudParis
			
 
				 # Copyright (C) 2011, 2012  INRIA
			
 
				 #
			
@@ -22,6 +22,7 @@ INPUT                  = @top_srcdir@/doc/doxygen/chapters \
 
				                          @top_builddir@/doc/doxygen/starpu_config.h \
			
 
				 			 @top_srcdir@/include/starpu_bitmap.h \
			
 
				 	 		 @top_srcdir@/include/starpu_bound.h \
			
 
				+	 		 @top_srcdir@/include/starpu_clusters_util.h \
			
 
				 			 @top_srcdir@/include/starpu_cublas.h \
			
 
				 			 @top_srcdir@/include/starpu_cuda.h \
			
 
				 			 @top_srcdir@/include/starpu_data_filters.h \
			
@@ -35,15 +36,18 @@ INPUT                  = @top_srcdir@/doc/doxygen/chapters \
 
				 			 @top_srcdir@/include/starpu.h \
			
 
				 			 @top_srcdir@/include/starpu_hash.h \
			
 
				 			 @top_srcdir@/include/starpu_mic.h \
			
 
				+			 @top_srcdir@/include/starpu_mod.f90 \
			
 
				 			 @top_srcdir@/include/starpu_opencl.h \
			
 
				 			 @top_srcdir@/include/starpu_openmp.h \
			
 
				 			 @top_srcdir@/include/starpu_perfmodel.h \
			
 
				 			 @top_srcdir@/include/starpu_profiling.h \
			
 
				 			 @top_srcdir@/include/starpu_rand.h \
			
 
				 			 @top_srcdir@/include/starpu_scc.h \
			
 
				+			 @top_srcdir@/include/starpu_sched_component.h \
			
 
				 			 @top_srcdir@/include/starpu_sched_ctx.h \
			
 
				 			 @top_srcdir@/include/starpu_sched_ctx_hypervisor.h \
			
 
				 			 @top_srcdir@/include/starpu_scheduler.h \
			
 
				+			 @top_srcdir@/include/starpu_simgrid_wrap.h \
			
 
				 			 @top_srcdir@/include/starpu_sink.h \
			
 
				 			 @top_srcdir@/include/starpu_stdlib.h \
			
 
				 			 @top_srcdir@/include/starpu_task_bundle.h \
			
@@ -56,7 +60,6 @@ INPUT                  = @top_srcdir@/doc/doxygen/chapters \
 
				 			 @top_srcdir@/include/starpu_tree.h \
			
 
				 			 @top_srcdir@/include/starpu_util.h \
			
 
				 			 @top_srcdir@/include/starpu_worker.h \
			
 
				-			 @top_srcdir@/include/starpu_sched_component.h \
			
 
				 			 @top_srcdir@/include/fstarpu_mod.f90 \
			
 
				 			 @top_srcdir@/mpi/include/ \
			
 
				 			 @top_srcdir@/mpi/include/fstarpu_mpi_mod.f90 \
			
--- a/doc/doxygen/refman.tex
+++ b/doc/doxygen/refman.tex
@@ -20,7 +20,7 @@ was last updated on \STARPUUPDATED.\\
 
				 
			
 
				 Copyright © 2009–2013 Université de Bordeaux\\
			
 
				 
			
 
				-Copyright © 2010-2015 CNRS
			
 
				+Copyright © 2010-2016 CNRS
			
 
				 
			
 
				 Copyright © 2011, 2012, 2016 INRIA
			
 
				 
			
@@ -174,6 +174,11 @@ Documentation License”.
 
				 \hypertarget{OpenMPRuntimeSupport}{}
			
 
				 \input{OpenMPRuntimeSupport}
			
 
				 
			
 
				+\chapter{Clustering a Machine}
			
 
				+\label{ClusteringAMachine}
			
 
				+\hypertarget{ClusteringAMachine}{}
			
 
				+\input{ClusteringAMachine}
			
 
				+
			
 
				 \part{StarPU Reference API}
			
 
				 
			
 
				 \chapter{Execution Configuration Through Environment Variables}
			
@@ -244,6 +249,7 @@ Documentation License”.
 
				 \input{starpu_8h}
			
 
				 \input{starpu__bitmap_8h}
			
 
				 \input{starpu__bound_8h}
			
 
				+\input{starpu__clusters__util_8h}
			
 
				 \input{starpu__config_8h}
			
 
				 \input{starpu__cublas_8h}
			
 
				 \input{starpu__cuda_8h}