Bläddra i källkod

merge from trunk

Corentin Salingue 8 år sedan
förälder
incheckning
be30756c0a
100 ändrade filer med 2840 tillägg och 2416 borttagningar
  1. 6 0
      ChangeLog
  2. 1 0
      Makefile.am
  3. 2 1
      configure.ac
  4. 3 2
      doc/doxygen/chapters/210_check_list_performance.doxy
  5. 8 1
      doc/doxygen/chapters/390_faq.doxy
  6. 14 4
      doc/doxygen/chapters/501_environment_variables.doxy
  7. 2 2
      doc/doxygen/chapters/api/bitmap.doxy
  8. 434 411
      doc/doxygen/chapters/api/codelet_and_tasks.doxy
  9. 18 5
      doc/doxygen/chapters/api/cuda_extensions.doxy
  10. 254 240
      doc/doxygen/chapters/api/data_interfaces.doxy
  11. 76 83
      doc/doxygen/chapters/api/data_management.doxy
  12. 42 46
      doc/doxygen/chapters/api/data_partition.doxy
  13. 9 10
      doc/doxygen/chapters/api/explicit_dependencies.doxy
  14. 5 5
      doc/doxygen/chapters/api/fft_support.doxy
  15. 27 16
      doc/doxygen/chapters/api/fxt_support.doxy
  16. 172 167
      doc/doxygen/chapters/api/initialization.doxy
  17. 20 20
      doc/doxygen/chapters/api/insert_task.doxy
  18. 3 3
      doc/doxygen/chapters/api/lower_bound.doxy
  19. 6 7
      doc/doxygen/chapters/api/mic_extensions.doxy
  20. 5 4
      doc/doxygen/chapters/api/misc_helpers.doxy
  21. 3 5
      doc/doxygen/chapters/api/modularized_scheduler.doxy
  22. 31 32
      doc/doxygen/chapters/api/mpi.doxy
  23. 13 6
      doc/doxygen/chapters/api/multiformat_data_interface.doxy
  24. 29 30
      doc/doxygen/chapters/api/opencl_extensions.doxy
  25. 131 118
      doc/doxygen/chapters/api/performance_model.doxy
  26. 26 29
      doc/doxygen/chapters/api/profiling.doxy
  27. 10 10
      doc/doxygen/chapters/api/running_driver.doxy
  28. 2 2
      doc/doxygen/chapters/api/scc_extensions.doxy
  29. 32 32
      doc/doxygen/chapters/api/scheduling_contexts.doxy
  30. 16 16
      doc/doxygen/chapters/api/scheduling_policy.doxy
  31. 12 12
      doc/doxygen/chapters/api/standard_memory_library.doxy
  32. 3 3
      doc/doxygen/chapters/api/task_lists.doxy
  33. 62 81
      doc/doxygen/chapters/api/threads.doxy
  34. 19 22
      doc/doxygen/chapters/api/toolbox.doxy
  35. 73 88
      doc/doxygen/chapters/api/top.doxy
  36. 7 7
      doc/doxygen/chapters/api/tree.doxy
  37. 77 99
      doc/doxygen/chapters/api/workers.doxy
  38. 1 0
      examples/Makefile.am
  39. 5 1
      examples/audio/starpu_audio_processing.c
  40. 4 2
      examples/axpy/axpy.c
  41. 34 0
      examples/basic_examples/topology.c
  42. 0 1
      examples/cg/cg.c
  43. 51 12
      examples/cg/cg_kernels.c
  44. 0 1
      examples/cholesky/cholesky.h
  45. 9 7
      examples/cholesky/cholesky_implicit.c
  46. 44 20
      examples/cholesky/cholesky_kernels.c
  47. 3 3
      examples/heat/dw_factolu.c
  48. 0 1
      examples/heat/dw_factolu.h
  49. 3 0
      examples/heat/dw_factolu_grain.c
  50. 34 13
      examples/heat/dw_factolu_kernels.c
  51. 3 0
      examples/heat/dw_sparse_cg.c
  52. 0 4
      examples/heat/dw_sparse_cg.h
  53. 30 7
      examples/heat/dw_sparse_cg_kernels.c
  54. 0 3
      examples/lu/xlu.h
  55. 86 26
      examples/lu/xlu_kernels.c
  56. 11 3
      examples/mult/xgemm.c
  57. 11 6
      examples/pipeline/pipeline.c
  58. 20 8
      examples/reductions/dot_product.c
  59. 0 4
      examples/sched_ctx/gpu_partition.c
  60. 2 2
      examples/sched_ctx/parallel_code.c
  61. 2 0
      examples/spmv/dw_block_spmv.c
  62. 0 4
      examples/spmv/dw_block_spmv.h
  63. 10 1
      examples/spmv/dw_block_spmv_kernels.c
  64. 1 0
      include/starpu_cublas.h
  65. 34 0
      include/starpu_cublas_v2.h
  66. 1 0
      include/starpu_fxt.h
  67. 2 1
      include/starpu_util.h
  68. 3 1
      include/starpu_worker.h
  69. 23 22
      mpi/src/starpu_mpi.c
  70. 205 202
      mpi/src/starpu_mpi_cache.c
  71. 9 7
      mpi/src/starpu_mpi_cache.h
  72. 1 11
      mpi/src/starpu_mpi_cache_stats.c
  73. 2 2
      mpi/src/starpu_mpi_cache_stats.h
  74. 10 0
      mpi/src/starpu_mpi_private.h
  75. 2 2
      mpi/src/starpu_mpi_tag.c
  76. 7 7
      mpi/src/starpu_mpi_task_insert.c
  77. 5 5
      mpi/tests/cache.c
  78. 8 8
      mpi/tests/cache_disable.c
  79. 8 3
      mpi/tests/insert_task_node_choice.c
  80. 2 0
      mpi/tests/policy_register_toomany.c
  81. 2 0
      mpi/tests/policy_unregister.c
  82. 3 0
      mpi/tests/starpu_redefine.c
  83. 3 1
      socl/src/init.c
  84. 2 0
      src/Makefile.am
  85. 137 139
      src/common/fxt.h
  86. 5 1
      src/common/graph.c
  87. 65 20
      src/core/dependencies/data_arbiter_concurrency.c
  88. 5 1
      src/core/dependencies/data_concurrency.c
  89. 74 0
      src/core/drivers.c
  90. 31 0
      src/core/drivers.h
  91. 2 1
      src/core/perfmodel/perfmodel.c
  92. 1 0
      src/core/perfmodel/perfmodel_history.c
  93. 1 0
      src/core/sched_ctx.c
  94. 1 0
      src/core/sched_policy.c
  95. 39 4
      src/core/simgrid.c
  96. 68 20
      src/core/topology.c
  97. 2 2
      src/core/topology.h
  98. 54 207
      src/core/workers.c
  99. 6 1
      src/core/workers.h
  100. 0 0
      src/datawizard/coherency.c

+ 6 - 0
ChangeLog

@@ -25,6 +25,8 @@ New features:
   * Add support for multiple linear regression performance models
   * Add MPI Master-Slave support to use the cores of remote nodes. Use the
     --enable-mpi-master-slave option to activate it.
+  * Add STARPU_CUDA_THREAD_PER_DEV environment variable to support driving all
+    GPUs from only one thread when almost all kernels are asynchronous.
 
 Small features:
   * Scheduling contexts may now be associated a user data pointer at creation
@@ -35,6 +37,9 @@ Small features:
   * New configure option --enable-mpi-pedantic-isend (disabled by
     default) to acquire data in STARPU_RW (instead of STARPU_R) before
     performing MPI_Isend call
+  * New function starpu_worker_display_names to display the names of
+    all the workers of a specified type.
+  * Arbiters now support concurrent read access.
 
 Changes:
   * Vastly improve simgrid simulation time.
@@ -55,6 +60,7 @@ New features:
   * Add STARPU_PERF_MODEL_HOMOGENEOUS_CUDA/OPENCL/MIC/SCC to share performance
     models between devices, making calibration much faster.
   * Add modular-heft-prio scheduler.
+  * Add starpu_cublas_get_local_handle helper.
 
 Changes:
   * Fix performance regression of lws for small tasks.

+ 1 - 0
Makefile.am

@@ -98,6 +98,7 @@ versinclude_HEADERS = 				\
 	include/starpu_rand.h			\
 	include/starpu_disk.h			\
 	include/starpu_cublas.h			\
+	include/starpu_cublas_v2.h		\
 	include/starpu_driver.h			\
 	include/starpu_stdlib.h			\
 	include/starpu_thread.h			\

+ 2 - 1
configure.ac

@@ -163,7 +163,7 @@ if test x$enable_simgrid = xyes ; then
 	AC_CHECK_HEADERS([simgrid/msg.h], [AC_DEFINE([STARPU_HAVE_SIMGRID_MSG_H], [1], [Define to 1 if you have msg.h in simgrid/.])])
 	AC_CHECK_HEADERS([xbt/synchro.h], [AC_DEFINE([STARPU_HAVE_XBT_SYNCHRO_H], [1], [Define to 1 if you have synchro.h in xbt/.])])
 	AC_CHECK_TYPES([smx_actor_t], [AC_DEFINE([STARPU_HAVE_SMX_ACTOR_T], [1], [Define to 1 if you have the smx_actor_t type.])], [], [[#include <simgrid/simix.h>]])
-   	AC_CHECK_FUNCS([MSG_process_join MSG_process_attach MSG_get_as_by_name MSG_environment_get_routing_root MSG_host_get_speed xbt_mutex_try_acquire smpi_process_set_user_data sg_link_name])
+   	AC_CHECK_FUNCS([MSG_process_join MSG_process_attach MSG_get_as_by_name MSG_environment_get_routing_root MSG_host_get_speed xbt_mutex_try_acquire smpi_process_set_user_data sg_link_name sg_host_route])
 	AC_CHECK_FUNCS([xbt_barrier_init], [AC_DEFINE([STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT], [1], [Define to 1 if you have the `xbt_barrier_init' function.])])
 	AC_CHECK_DECLS([smpi_process_set_user_data], [], [], [[#include <smpi/smpi.h>]])
 	AC_CHECK_FUNCS([SIMIX_process_get_code], [AC_DEFINE([STARPU_SIMGRID_HAVE_SIMIX_PROCESS_GET_CODE], [1], [Define to 1 if you have the `SIMIX_process_get_code' function.])])
@@ -663,6 +663,7 @@ fi
 AC_SEARCH_LIBS([sqrt],[m],,AC_MSG_ERROR([math library unavailable]))
 AC_HAVE_LIBRARY([ws2_32])
 AC_CHECK_FUNCS([sysconf])
+AC_CHECK_FUNCS([getrlimit])
 
 AC_CHECK_FUNC([pthread_spin_lock], have_pthread_spin_lock=yes, have_pthread_spin_lock=no)
 if test x$have_pthread_spin_lock = xyes; then

+ 3 - 2
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -62,10 +62,11 @@ Unfortunately, some CUDA libraries do not have stream variants of
 kernels. That will lower the potential for overlapping.
 
 Calling starpu_cublas_init() makes StarPU already do appropriate calls for the
-CUBLAS library. Some libraries like Magma may however change the current stream,
+CUBLAS library. Some libraries like Magma may however change the current stream of CUBLAS v1,
 one then has to call <c>cublasSetKernelStream(starpu_cuda_get_local_stream())</c> at
 the beginning of the codelet to make sure that CUBLAS is really using the proper
-stream.
+stream. When using CUBLAS v2, starpu_cublas_get_local_handle() can be called to queue CUBLAS
+kernels with the proper configuration.
 
 If the kernel can be made to only use this local stream or other self-allocated
 streams, i.e. the whole kernel submission can be made asynchronous, then

+ 8 - 1
doc/doxygen/chapters/390_faq.doxy

@@ -235,12 +235,19 @@ a task is finished, to feed the GPU with another task (StarPU actually submits
 a couple of tasks in advance so as to pipeline this, but filling the pipeline
 still has to be happening often enough), and thus it has to dedicate threads for
 this, and this is a very CPU-consuming duty. StarPU thus dedicates one CPU core
-for driving each GPU.
+for driving each GPU by default.
 
 Such dedication is also useful when a codelet is hybrid, i.e. while kernels are
 running on the GPU, the codelet can run some computation, which thus be run by
 the CPU core instead of driving the GPU.
 
+One can choose to dedicate only one thread for all the CUDA devices by setting
+the STARPU_CUDA_THREAD_PER_DEV environment variable to 0. The application
+however should use STARPU_CUDA_ASYNC on its CUDA codelets (asynchronous
+execution), otherwise the execution of a synchronous CUDA codelet will
+monopolize the thread, and other CUDA devices will thus starve while it is
+executing.
+
 \section CUDADrivers StarPU does not see my CUDA device
 
 First make sure that CUDA is properly running outside StarPU: build and

+ 14 - 4
doc/doxygen/chapters/501_environment_variables.doxy

@@ -52,13 +52,23 @@ Specify the number of workers per CUDA device, and thus the number of kernels
 which will be concurrently running on the devices. The default value is 1.
 </dd>
 
-<dt>STARPU_NWORKER_PER_CUDA</dt>
+<dt>STARPU_CUDA_THREAD_PER_WORKER</dt>
 <dd>
 \anchor STARPU_CUDA_THREAD_PER_WORKER
 \addindex __env__STARPU_CUDA_THREAD_PER_WORKER
 Specify if the cuda driver should provide a thread per stream or a single thread 
 dealing with all the streams. 1 if one thread per stream, 0 otherwise. The default 
-value is 1.
+value is 0. Setting it to 1 is contradictory with setting STARPU_CUDA_THREAD_PER_DEV to 1.
+</dd>
+
+<dt>STARPU_CUDA_THREAD_PER_DEV</dt>
+<dd>
+\anchor STARPU_CUDA_THREAD_PER_DEV
+\addindex __env__STARPU_CUDA_THREAD_PER_DEV
+Specify if the cuda driver should provide a thread per device or a single thread 
+dealing with all the devices. 1 if one thread per device, 0 otherwise. The default 
+value is 1, unless STARPU_CUDA_THREAD_PER_WORKER is set to 1. Setting it to 1 is
+contradictory with setting STARPU_CUDA_THREAD_PER_WORKER to 1.
 </dd>
 
 <dt>STARPU_CUDA_PIPELINE</dt>
@@ -820,7 +830,7 @@ available to the application on the NUMA node with the OS identifier <c>devid</c
 \addindex __env__STARPU_MINIMUM_AVAILABLE_MEM
 This specifies the minimum percentage of memory that should be available in GPUs
 (or in main memory, when using out of core), below which a reclaiming pass is
-performed. The default is 5%.
+performed. The default is 0%.
 </dd>
 
 <dt>STARPU_TARGET_AVAILABLE_MEM</dt>
@@ -829,7 +839,7 @@ performed. The default is 5%.
 \addindex __env__STARPU_TARGET_AVAILABLE_MEM
 This specifies the target percentage of memory that should be reached in
 GPUs (or in main memory, when using out of core), when performing a periodic
-reclaiming pass. The default is 10%.
+reclaiming pass. The default is 0%.
 </dd>
 
 <dt>STARPU_MINIMUM_CLEAN_BUFFERS</dt>

+ 2 - 2
doc/doxygen/chapters/api/bitmap.doxy

@@ -18,7 +18,7 @@ create a empty starpu_bitmap
 
 \fn void starpu_bitmap_destroy(struct starpu_bitmap *b)
 \ingroup API_Bitmap
-free a starpu_bitmap
+free \p b
 
 \fn void starpu_bitmap_set(struct starpu_bitmap *b, int e)
 \ingroup API_Bitmap
@@ -38,7 +38,7 @@ return true iff bit \p e is set in \p b
 
 \fn void starpu_bitmap_unset_and(struct starpu_bitmap *a, struct starpu_bitmap *b, struct starpu_bitmap *c)
 \ingroup API_Bitmap
-Basically compute starpu_bitmap_unset_all(\p a) ; \p a = \p b & \p c;
+Basically compute \c starpu_bitmap_unset_all(\p a) ; \p a = \p b & \p c;
 
 \fn void starpu_bitmap_or(struct starpu_bitmap *a, struct starpu_bitmap *b)
 \ingroup API_Bitmap

Filskillnaden har hållts tillbaka eftersom den är för stor
+ 434 - 411
doc/doxygen/chapters/api/codelet_and_tasks.doxy


+ 18 - 5
doc/doxygen/chapters/api/cuda_extensions.doxy

@@ -21,7 +21,7 @@ supported by StarPU.
 
 \fn cudaStream_t starpu_cuda_get_local_stream(void)
 \ingroup API_CUDA_Extensions
-This function gets the current worker’s CUDA stream. StarPU
+Return the current worker’s CUDA stream. StarPU
 provides a stream for every CUDA device controlled by StarPU. This
 function is only provided for convenience so that programmers can
 easily use asynchronous operations within codelets without having to
@@ -33,8 +33,7 @@ overlapped.
 
 \fn const struct cudaDeviceProp *starpu_cuda_get_device_properties(unsigned workerid)
 \ingroup API_CUDA_Extensions
-This function returns a pointer to device properties for worker
-\p workerid (assumed to be a CUDA worker).
+Return a pointer to device properties for worker \p workerid (assumed to be a CUDA worker).
 
 \fn void starpu_cuda_report_error(const char *func, const char *file, int line, cudaError_t status)
 \ingroup API_CUDA_Extensions
@@ -48,8 +47,8 @@ Calls starpu_cuda_report_error(), passing the current function, file and line po
 \ingroup API_CUDA_Extensions
 Copy \p ssize bytes from the pointer \p src_ptr on \p src_node
 to the pointer \p dst_ptr on \p dst_node. The function first tries to
-copy the data asynchronous (unless stream is <c>NULL</c>). If the
-asynchronous copy fails or if stream is <c>NULL</c>, it copies the
+copy the data asynchronous (unless \p stream is <c>NULL</c>). If the
+asynchronous copy fails or if \p stream is <c>NULL</c>, it copies the
 data synchronously. The function returns <c>-EAGAIN</c> if the
 asynchronous launch was successful. It returns 0 if the synchronous
 copy was successful, or fails otherwise.
@@ -68,6 +67,20 @@ starpu_cublas_init() will initialize CUBLAS on every CUDA device
 controlled by StarPU. This call blocks until CUBLAS has been properly
 initialized on every device.
 
+\fn void starpu_cublas_set_stream(void)
+\ingroup API_CUDA_Extensions
+This function sets the proper CUBLAS stream for CUBLAS v1. This must be called from the CUDA
+codelet before calling CUBLAS v1 kernels, so that they are queued on the proper
+CUDA stream. When using one thread per CUDA worker, this function does not
+do anything since the CUBLAS stream does not change, and is set once by
+starpu_cublas_init().
+
+\fn cublasHandle_t starpu_cublas_get_local_handle(void)
+\ingroup API_CUDA_Extensions
+This function returns the CUBLAS v2 handle to be used to queue CUBLAS v2
+kernels. It is properly initialized and configured for multistream by
+starpu_cublas_init().
+
 \fn void starpu_cublas_shutdown(void)
 \ingroup API_CUDA_Extensions
 This function synchronously deinitializes the CUBLAS library on

+ 254 - 240
doc/doxygen/chapters/api/data_interfaces.doxy

@@ -9,288 +9,303 @@
 /*! \defgroup API_Data_Interfaces Data Interfaces
 
 \struct starpu_data_interface_ops
-Per-interface data transfer methods.
 \ingroup API_Data_Interfaces
+Per-interface data transfer methods.
 \var void (*starpu_data_interface_ops::register_data_handle)(starpu_data_handle_t handle, unsigned home_node, void *data_interface)
-Register an existing interface into a data handle.
+    Register an existing interface into a data handle.
 
 \var starpu_ssize_t (*starpu_data_interface_ops::allocate_data_on_node)(void *data_interface, unsigned node)
-Allocate data for the interface on a given node.
+    Allocate data for the interface on a given node.
 
 \var void (*starpu_data_interface_ops::free_data_on_node)(void *data_interface, unsigned node)
-Free data of the interface on a given node.
+    Free data of the interface on a given node.
 
 \var const struct starpu_data_copy_methods *starpu_data_interface_ops::copy_methods
-ram/cuda/opencl synchronous and asynchronous transfer methods.
+    ram/cuda/opencl synchronous and asynchronous transfer methods.
 
 \var void *(*starpu_data_interface_ops::handle_to_pointer)(starpu_data_handle_t handle, unsigned node)
-Return the current pointer (if any) for the handle on the given node.
+    Return the current pointer (if any) for the handle on the given node.
 
 \var size_t (*starpu_data_interface_ops::get_size)(starpu_data_handle_t handle)
-Return an estimation of the size of data, for performance models.
+    Return an estimation of the size of data, for performance models.
 
 \var uint32_t (*starpu_data_interface_ops::footprint)(starpu_data_handle_t handle)
-Return a 32bit footprint which characterizes the data size.
+    Return a 32bit footprint which characterizes the data size.
 
 \var int (*starpu_data_interface_ops::compare)(void *data_interface_a, void *data_interface_b)
-Compare the data size of two interfaces.
+    Compare the data size of two interfaces.
 
 \var void (*starpu_data_interface_ops::display)(starpu_data_handle_t handle, FILE *f)
-Dump the sizes of a handle to a file.
+    Dump the sizes of a handle to a file.
 
 \var starpu_ssize_t (*starpu_data_interface_ops::describe)(void *data_interface, char *buf, size_t size)
-Describe the data into a string.
+    Describe the data into a string.
 
 \var enum starpu_data_interface_id starpu_data_interface_ops::interfaceid
-An identifier that is unique to each interface.
+    An identifier that is unique to each interface.
 
 \var size_t starpu_data_interface_ops::interface_size
-The size of the interface data descriptor.
+    The size of the interface data descriptor.
 
 \var char starpu_data_interface_ops::is_multiformat
-todo
+    todo
 
 \var char starpu_data_interface_ops::dontcache
-If set to non-zero, StarPU will never try to reuse an allocated buffer for a
-different handle. This can be notably useful for application-defined interfaces
-which have a dynamic size, and for which it thus does not make sense to reuse
-the buffer since will probably not have the proper size.
+    If set to non-zero, StarPU will never try to reuse an allocated
+    buffer for a  different handle. This can be notably useful for
+    application-defined interfaces which have a dynamic size, and for
+    which it thus does not make sense to reuse the buffer since will
+    probably not have the proper size.
 
 \var struct starpu_multiformat_data_interface_ops* (*starpu_data_interface_ops::get_mf_ops)(void *data_interface)
-todo
+    todo
 
 \var int (*starpu_data_interface_ops::pack_data)(starpu_data_handle_t handle, unsigned node, void **ptr, starpu_ssize_t *count)
-Pack the data handle into a contiguous buffer at the address allocated with
-<c>starpu_malloc_flags(ptr, size, 0)</c> (and thus returned in \p ptr) and
-set the size of the newly created buffer in \p count. If \p ptr is <c>NULL</c>, the
-function should not copy the data in the buffer but just set count to
-the size of the buffer which would have been allocated. The special
-value -1 indicates the size is yet unknown.
+    Pack the data handle into a contiguous buffer at the address
+    allocated with <c>starpu_malloc_flags(ptr, size, 0)</c> (and thus
+    returned in \p ptr) and set the size of the newly created buffer
+    in \p count. If \p ptr is <c>NULL</c>, the function should not
+    copy the data in the buffer but just set count to the size of the
+    buffer which would have been allocated. The special value -1
+    indicates the size is yet unknown.
 
 \var int (*starpu_data_interface_ops::unpack_data) (starpu_data_handle_t handle, unsigned node, void *ptr, size_t count)
-Unpack the data handle from the contiguous buffer at the address \p ptr
-of size \p count
+    Unpack the data handle from the contiguous buffer at the address
+    \p ptr of size \p count
 
 \struct starpu_data_copy_methods
-Defines the per-interface methods. If the any_to_any method is
-provided, it will be used by default if no more specific method is
-provided. It can still be useful to provide more specific method in
-case of e.g. available particular CUDA or OpenCL support.
 \ingroup API_Data_Interfaces
+Defines the per-interface methods. If the
+starpu_data_copy_methods::any_to_any method is provided, it will be
+used by default if no specific method is provided. It can still be
+useful to provide more specific method in case of e.g. available
+particular CUDA or OpenCL support.
 \var int (*starpu_data_copy_methods::can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, unsigned handling_node)
-If defined, allows the interface to declare whether it supports transferring
-from \p src_interface on node \p src_node to \p dst_interface on node \p
-dst_node, run from node \p handling_node. If not defined, it is assumed that the
-interface supports all transfers.
+    If defined, allows the interface to declare whether it supports
+    transferring from \p src_interface on node \p src_node to \p
+    dst_interface on node \p dst_node, run from node \p handling_node.
+    If not defined, it is assumed that the interface supports all
+    transfers.
 
 \var int (*starpu_data_copy_methods::ram_to_ram)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-Define how to copy data from the \p src_interface interface on the \p
-src_node CPU node to the \p dst_interface interface on the \p dst_node
-CPU  node. Return 0 on success.
+    Define how to copy data from the \p src_interface interface on the
+    \p src_node CPU node to the \p dst_interface interface on the \p
+    dst_node CPU node. Return 0 on success.
 
 \var int (*starpu_data_copy_methods::ram_to_cuda)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-Define how to copy data from the \p src_interface interface on the
-\p src_node CPU node to the \p dst_interface interface on the \p dst_node CUDA
-node. Return 0 on success.
+    Define how to copy data from the \p src_interface interface on the
+    \p src_node CPU node to the \p dst_interface interface on the \p
+    dst_node CUDA node. Return 0 on success.
 
 \var int (*starpu_data_copy_methods::ram_to_opencl)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-Define how to copy data from the \p src_interface interface on the
-\p src_node CPU node to the \p dst_interface interface on the \p dst_node
-OpenCL node. Return 0 on success.
+    Define how to copy data from the \p src_interface interface on the
+    \p src_node CPU node to the \p dst_interface interface on the \p
+    dst_node OpenCL node. Return 0 on success.
 
 \var int (*starpu_data_copy_methods::ram_to_mic)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-Define how to copy data from the \p src_interface interface on the
-\p src_node CPU node to the \p dst_interface interface on the \p dst_node MIC
-node. Return 0 on success.
+    Define how to copy data from the \p src_interface interface on the
+    \p src_node CPU node to the \p dst_interface interface on the \p
+    dst_node MIC node. Return 0 on success.
 
 \var int (*starpu_data_copy_methods::cuda_to_ram)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-Define how to copy data from the \p src_interface interface on the
-\p src_node CUDA node to the \p dst_interface interface on the \p dst_node
-CPU node. Return 0 on success.
+    Define how to copy data from the \p src_interface interface on the
+    \p src_node CUDA node to the \p dst_interface interface on the \p
+    dst_node CPU node. Return 0 on success.
 
 \var int (*starpu_data_copy_methods::cuda_to_cuda)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-Define how to copy data from the \p src_interface interface on the
-\p src_node CUDA node to the \p dst_interface interface on the \p dst_node CUDA
-node. Return 0 on success.
+    Define how to copy data from the \p src_interface interface on the
+    \p src_node CUDA node to the \p dst_interface interface on the \p
+    dst_node CUDA node. Return 0 on success.
 
 \var int (*starpu_data_copy_methods::cuda_to_opencl)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-Define how to copy data from the \p src_interface interface on the
-\p src_node CUDA node to the \p dst_interface interface on the \p dst_node
-OpenCL node. Return 0 on success.
+    Define how to copy data from the \p src_interface interface on the
+    \p src_node CUDA node to the \p dst_interface interface on the \p
+    dst_node OpenCL node. Return 0 on success.
 
 \var int (*starpu_data_copy_methods::opencl_to_ram)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-Define how to copy data from the \p src_interface interface on the
-\p src_node OpenCL node to the \p dst_interface interface on the \p dst_node
-CPU node. Return 0 on success.
+    Define how to copy data from the \p src_interface interface on the
+    \p src_node OpenCL node to the \p dst_interface interface on the
+    \p dst_node CPU node. Return 0 on success.
 
 \var int (*starpu_data_copy_methods::opencl_to_cuda)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-Define how to copy data from the \p src_interface interface on the
-\p src_node OpenCL node to the \p dst_interface interface on the \p dst_node
-CUDA node. Return 0 on success.
+    Define how to copy data from the \p src_interface interface on the
+    \p src_node OpenCL node to the \p dst_interface interface on the
+    \p dst_node CUDA node. Return 0 on success.
 
 \var int (*starpu_data_copy_methods::opencl_to_opencl)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-Define how to copy data from the \p src_interface interface on the
-\p src_node OpenCL node to the \p dst_interface interface on the \p dst_node
-OpenCL node. Return 0 on success.
+    Define how to copy data from the \p src_interface interface on the
+    \p src_node OpenCL node to the \p dst_interface interface on the
+    \p dst_node OpenCL node. Return 0 on success.
 
 \var int (*starpu_data_copy_methods::mic_to_ram)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-Define how to copy data from the \p src_interface interface on the
-\p src_node MIC node to the \p dst_interface interface on the \p dst_node CPU
-node. Return 0 on success.
+    Define how to copy data from the \p src_interface interface on the
+    \p src_node MIC node to the \p dst_interface interface on the \p
+    dst_node CPU node. Return 0 on success.
 
 \var int (*starpu_data_copy_methods::scc_src_to_sink)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-Define how to copy data from the \p src_interface interface on the
-\p src_node node to the \p dst_interface interface on the \p dst_node node.
-Must return 0 if the transfer was actually completed completely
-synchronously, or <c>-EAGAIN</c> if at least some transfers are still ongoing
-and should be awaited for by the core.
+    Define how to copy data from the \p src_interface interface on the
+    \p src_node node to the \p dst_interface interface on the \p
+    dst_node node. Must return 0 if the transfer was actually
+    completed completely synchronously, or <c>-EAGAIN</c> if at least
+    some transfers are still ongoing and should be awaited for by the
+    core.
 
 \var int (*starpu_data_copy_methods::scc_sink_to_src)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-Define how to copy data from the \p src_interface interface on the
-\p src_node node to the \p dst_interface interface on the \p dst_node node.
-Must return 0 if the transfer was actually completed completely
-synchronously, or <c>-EAGAIN</c> if at least some transfers are still ongoing
-and should be awaited for by the core.
+    Define how to copy data from the \p src_interface interface on the
+    \p src_node node to the \p dst_interface interface on the \p
+    dst_node node. Must return 0 if the transfer was actually
+    completed completely synchronously, or <c>-EAGAIN</c> if at least
+    some transfers are still ongoing and should be awaited for by the core.
 
 \var int (*starpu_data_copy_methods::scc_sink_to_sink)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-Define how to copy data from the \p src_interface interface on the
-\p src_node node to the \p dst_interface interface on the \p dst_node node.
-Must return 0 if the transfer was actually completed completely
-synchronously, or <c>-EAGAIN</c> if at least some transfers are still ongoing
-and should be awaited for by the core.
+    Define how to copy data from the \p src_interface interface on the
+    \p src_node node to the \p dst_interface interface on the \p
+    dst_node node. Must return 0 if the transfer was actually
+    completed completely synchronously, or <c>-EAGAIN</c> if at least
+    some transfers are still ongoing and should be awaited for by the
+    core.
 
 \var int (*starpu_data_copy_methods::ram_to_mpi_ms)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-Define how to copy data from the \p src_interface interface on the
-\p src_node CPU node to the \p dst_interface interface on the \p dst_node MPI Slave
-node. Return 0 on success.
+    Define how to copy data from the \p src_interface interface on the
+    \p src_node CPU node to the \p dst_interface interface on the \p
+    dst_node MPI Slave node. Return 0 on success.
 
 \var int (*starpu_data_copy_methods::mpi_ms_to_ram)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-Define how to copy data from the \p src_interface interface on the
-\p src_node MPI Slave node to the \p dst_interface interface on the \p dst_node CPU
-node. Return 0 on success.
+    Define how to copy data from the \p src_interface interface on the
+    \p src_node MPI Slave node to the \p dst_interface interface on
+    the \p dst_node CPU node. Return 0 on success.
 
 \var int (*starpu_data_copy_methods::mpi_ms_to_mpi_ms)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-Define how to copy data from the \p src_interface interface on the
-\p src_node MPI Slave node to the \p dst_interface interface on the \p dst_node
-MPI Slave node. Return 0 on success.
+    Define how to copy data from the \p src_interface interface on the
+    \p src_node MPI Slave node to the \p dst_interface interface on
+    the \p dst_node MPI Slave node. Return 0 on success.
 
 \var int (*starpu_data_copy_methods::ram_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
-Define how to copy data from the \p src_interface interface on the
-\p src_node CPU node to the \p dst_interface interface on the \p dst_node CUDA
-node, using the given stream. Must return 0 if the transfer was
-actually completed completely synchronously, or <c>-EAGAIN</c> if at least
-some transfers are still ongoing and should be awaited for by the core.
+    Define how to copy data from the \p src_interface interface on the
+    \p src_node CPU node to the \p dst_interface interface on the \p
+    dst_node CUDA node, using the given stream. Must return 0 if the
+    transfer was actually completed completely synchronously, or
+    <c>-EAGAIN</c> if at least some transfers are still ongoing and
+    should be awaited for by the core.
 
 \var int (*starpu_data_copy_methods::cuda_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
-Define how to copy data from the \p src_interface interface on the
-\p src_node CUDA node to the \p dst_interface interface on the \p dst_node CPU
-node, using the given stream. Must return 0 if the transfer was
-actually completed completely synchronously, or <c>-EAGAIN</c> if at least
-some transfers are still ongoing and should be awaited for by the core.
+    Define how to copy data from the \p src_interface interface on the
+    \p src_node CUDA node to the \p dst_interface interface on the \p
+    dst_node CPU node, using the given stream. Must return 0 if the
+    transfer was actually completed completely synchronously, or
+    <c>-EAGAIN</c> if at least some transfers are still ongoing and
+    should be awaited for by the core.
 
 \var int (*starpu_data_copy_methods::cuda_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
-Define how to copy data from the \p src_interface interface on the
-\p src_node CUDA node to the \p dst_interface interface on the \p dst_node CUDA
-node, using the given stream. Must return 0 if the transfer was
-actually completed completely synchronously, or <c>-EAGAIN</c> if at least
-some transfers are still ongoing and should be awaited for by the core.
+    Define how to copy data from the \p src_interface interface on the
+    \p src_node CUDA node to the \p dst_interface interface on the \p
+    dst_node CUDA node, using the given stream. Must return 0 if the
+    transfer was actually completed completely synchronously, or
+    <c>-EAGAIN</c> if at least some transfers are still ongoing and
+    should be awaited for by the core.
 
 \var int (*starpu_data_copy_methods::ram_to_opencl_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
-Define how to copy data from the \p src_interface interface on the
-\p src_node CPU node to the \p dst_interface interface on the \p dst_node
-OpenCL node, by recording in \p event, a pointer to a <c>cl_event</c>, the event
-of the last submitted transfer. Must return 0 if the transfer was
-actually completed completely synchronously, or <c>-EAGAIN</c> if at least
-some transfers are still ongoing and should be awaited for by the
-core.
+    Define how to copy data from the \p src_interface interface on the
+    \p src_node CPU node to the \p dst_interface interface on the \p
+    dst_node OpenCL node, by recording in \p event, a pointer to a
+    <c>cl_event</c>, the event of the last submitted transfer. Must
+    return 0 if the transfer was actually completed completely
+    synchronously, or <c>-EAGAIN</c> if at least some transfers are
+    still ongoing and should be awaited for by the core.
 
 \var int (*starpu_data_copy_methods::opencl_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
-Define how to copy data from the \p src_interface interface on the
-\p src_node OpenCL node to the \p dst_interface interface on the \p dst_node
-CPU node, by recording in \p event, a pointer to a <c>cl_event</c>, the event of
-the last submitted transfer. Must return 0 if the transfer was
-actually completed completely synchronously, or <c>-EAGAIN</c> if at least
-some transfers are still ongoing and should be awaited for by the
-core.
+    Define how to copy data from the \p src_interface interface on the
+    \p src_node OpenCL node to the \p dst_interface interface on the
+    \p dst_node CPU node, by recording in \p event, a pointer to a
+    <c>cl_event</c>, the event of the last submitted transfer. Must
+    return 0 if the transfer was actually completed completely
+    synchronously, or <c>-EAGAIN</c> if at least some transfers are
+    still ongoing and should be awaited for by the core.
 
 \var int (*starpu_data_copy_methods::opencl_to_opencl_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
-Define how to copy data from the \p src_interface interface on the
-\p src_node OpenCL node to the \p dst_interface interface on the \p dst_node
-OpenCL node, by recording in \p event, a pointer to a <c>cl_event</c>, the event
-of the last submitted transfer. Must return 0 if the transfer was
-actually completed completely synchronously, or <c>-EAGAIN</c> if at least
-some transfers are still ongoing and should be awaited for by the
-core.
+    Define how to copy data from the \p src_interface interface on the
+    \p src_node OpenCL node to the \p dst_interface interface on the
+    \p dst_node OpenCL node, by recording in \p event, a pointer to a
+    <c>cl_event</c>, the event of the last submitted transfer. Must
+    return 0 if the transfer was actually completed completely
+    synchronously, or <c>-EAGAIN</c> if at least some transfers are
+    still ongoing and should be awaited for by the core.
 
 \var int (*starpu_data_copy_methods::ram_to_mpi_ms_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void * event)
-Define how to copy data from the \p src_interface interface on the
-\p src_node CPU node to the \p dst_interface interface on the \p dst_node MPI Slave
-node, with the given even. Must return 0 if the transfer was
-actually completed completely synchronously, or <c>-EAGAIN</c> if at least
-some transfers are still ongoing and should be awaited for by the core.
+    Define how to copy data from the \p src_interface interface on the
+    \p src_node CPU node to the \p dst_interface interface on the \p
+    dst_node MPI Slave node, with the given event. Must return 0 if the
+    transfer was actually completed completely synchronously, or
+    <c>-EAGAIN</c> if at least some transfers are still ongoing and
+    should be awaited for by the core.
 
 \var int (*starpu_data_copy_methods::mpi_ms_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void * event)
-Define how to copy data from the \p src_interface interface on the
-\p src_node MPI Slave node to the \p dst_interface interface on the \p dst_node CPU
-node, with the given event. Must return 0 if the transfer was
-actually completed completely synchronously, or <c>-EAGAIN</c> if at least
-some transfers are still ongoing and should be awaited for by the core.
+    Define how to copy data from the \p src_interface interface on the
+    \p src_node MPI Slave node to the \p dst_interface interface on
+    the \p dst_node CPU node, with the given event. Must return 0 if
+    the transfer was actually completed completely synchronously, or
+    <c>-EAGAIN</c> if at least some transfers are still ongoing and
+    should be awaited for by the core.
 
 \var int (*starpu_data_copy_methods::mpi_ms_to_mpi_ms_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void * event)
-Define how to copy data from the \p src_interface interface on the
-\p src_node MPI Slave node to the \p dst_interface interface on the \p dst_node MPI Slave 
-node, using the given stream. Must return 0 if the transfer was
-actually completed completely synchronously, or <c>-EAGAIN</c> if at least
-some transfers are still ongoing and should be awaited for by the core.
+    Define how to copy data from the \p src_interface interface on the
+    \p src_node MPI Slave node to the \p dst_interface interface on
+    the \p dst_node MPI Slave node, with the given event. Must
+    return 0 if the transfer was actually completed completely
+    synchronously, or <c>-EAGAIN</c> if at least some transfers are
+    still ongoing and should be awaited for by the core.
 
 \var int (*starpu_data_copy_methods::ram_to_mic_async)(void *src_intreface, unsigned src_node, void *dst_interface, unsigned dst_node)
-Define how to copy data from the \p src_interface interface on the
-\p src_node CPU node to the \p dst_interface interface on the \p dst_node
-MIC node. Must return 0 if the transfer was actually completed
-completely synchronously, or <c>-EAGAIN</c> if at least some transfers are
-still ongoing and should be awaited for by the core.
+    Define how to copy data from the \p src_interface interface on the
+    \p src_node CPU node to the \p dst_interface interface on the \p
+    dst_node MIC node. Must return 0 if the transfer was actually
+    completed completely synchronously, or <c>-EAGAIN</c> if at least
+    some transfers are still ongoing and should be awaited for by the
+    core.
 
 \var int (*starpu_data_copy_methods::mic_to_ram_async)(void *src_intreface, unsigned src_node, void *dst_interface, unsigned dst_node)
-Define how to copy data from the \p src_interface interface on the
-\p src_node MIC node to the \p dst_interface interface on the \p dst_node
-CPU node. Must return 0 if the transfer was actually completed
-completely synchronously, or <c>-EAGAIN</c> if at least some transfers are
-still ongoing and should be awaited for by the core.
+    Define how to copy data from the \p src_interface interface on the
+    \p src_node MIC node to the \p dst_interface interface on the \p
+    dst_node CPU node. Must return 0 if the transfer was actually
+    completed completely synchronously, or <c>-EAGAIN</c> if at least
+    some transfers are still ongoing and should be awaited for by the
+    core.
 
 \var int (*starpu_data_copy_methods::any_to_any)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data)
-Define how to copy data from the \p src_interface interface on the
-\p src_node node to the \p dst_interface interface on the \p dst_node node.
-This is meant to be implemented through the starpu_interface_copy()
-helper, to which async_data should be passed as such, and will be used
-to manage asynchronicity. This must return <c>-EAGAIN</c> if any of the
-starpu_interface_copy() calls has returned <c>-EAGAIN</c> (i.e. at least some
-transfer is still ongoing), and return 0 otherwise.
+    Define how to copy data from the \p src_interface interface on the
+    \p src_node node to the \p dst_interface interface on the \p
+    dst_node node. This is meant to be implemented through the
+    starpu_interface_copy() helper, to which async_data should be
+    passed as such, and will be used to manage asynchronicity. This
+    must return <c>-EAGAIN</c> if any of the starpu_interface_copy()
+    calls has returned <c>-EAGAIN</c> (i.e. at least some transfer is
+    still ongoing), and return 0 otherwise.
 
 \enum starpu_data_interface_id
 \ingroup API_Data_Interfaces
 Identifier for all predefined StarPU data interfaces
 \var starpu_data_interface_id::STARPU_UNKNOWN_INTERFACE_ID
-Unknown interface
+    Unknown interface
 \var starpu_data_interface_id::STARPU_MATRIX_INTERFACE_ID
-Identifier for the matrix data interface
+    Identifier for the matrix data interface
 \var starpu_data_interface_id::STARPU_BLOCK_INTERFACE_ID
-Identifier for block data interface
+    Identifier for block data interface
 \var starpu_data_interface_id::STARPU_VECTOR_INTERFACE_ID
-Identifier for the vector data interface
+    Identifier for the vector data interface
 \var starpu_data_interface_id::STARPU_CSR_INTERFACE_ID
-Identifier for the csr data interface
+    Identifier for the csr data interface
 \var starpu_data_interface_id::STARPU_BCSR_INTERFACE_ID
-Identifier for the bcsr data interface
+    Identifier for the bcsr data interface
 \var starpu_data_interface_id::STARPU_VARIABLE_INTERFACE_ID
-Identifier for the variable data interface
+    Identifier for the variable data interface
 \var starpu_data_interface_id::STARPU_VOID_INTERFACE_ID
-Identifier for the void data interface
+    Identifier for the void data interface
 \var starpu_data_interface_id::STARPU_MULTIFORMAT_INTERFACE_ID
-Identifier for the multiformat data interface
+    Identifier for the multiformat data interface
 \var starpu_data_interface_id::STARPU_COO_INTERFACE_ID
-Identifier for the coo data interface
+    Identifier for the coo data interface
 \var starpu_data_interface_id::STARPU_MAX_INTERFACE_ID
-Maximum number of data interfaces
+    Maximum number of data interfaces
 
 @name Registering Data
 \ingroup API_Data_Interfaces
@@ -306,8 +321,8 @@ Register a void interface. There is no data really associated
 to that interface, but it may be used as a synchronization mechanism.
 It also permits to express an abstract piece of data that is managed
 by the application internally: this makes it possible to forbid the
-concurrent execution of different tasks accessing the same <c>void</c> data
-in read-write concurrently. 
+concurrent execution of different tasks accessing the same <c>void</c>
+data in read-write concurrently.
 
 \fn void starpu_variable_data_register(starpu_data_handle_t *handle, int home_node, uintptr_t ptr, size_t size)
 \ingroup API_Data_Interfaces
@@ -329,7 +344,7 @@ buffer located at \p ptr, or device handle \p dev_handle and offset \p offset
 
 \fn void starpu_vector_data_register(starpu_data_handle_t *handle, int home_node, uintptr_t ptr, uint32_t nx, size_t elemsize)
 \ingroup API_Data_Interfaces
-Register the \p nx elemsize-byte elements pointed to by \p ptr and initialize \p handle to represent it.
+Register the \p nx \p elemsize-byte elements pointed to by \p ptr and initialize \p handle to represent it.
 
 Here an example of how to use the function.
 \code{.c}
@@ -395,7 +410,7 @@ Blocks have size \p r * \p c. \p nrow is the number of rows (in terms of
 blocks), \p colind[i] is the block-column index for block i in \p nzval,
 \p rowptr[i] is the block-index (in \p nzval) of the first block of row i.
 \p firstentry is the index of the first entry of the given arrays
-(usually 0 or 1). 
+(usually 0 or 1).
 
 \fn void starpu_csr_data_register(starpu_data_handle_t *handle, int home_node, uint32_t nnz, uint32_t nrow, uintptr_t nzval, uint32_t *colind, uint32_t *rowptr, uint32_t firstentry, size_t elemsize)
 \ingroup API_Data_Interfaces
@@ -429,7 +444,7 @@ if handle’s interface does not support this operation or data for this
 \fn void *starpu_data_get_local_ptr(starpu_data_handle_t handle)
 \ingroup API_Data_Interfaces
 Return the local pointer associated with \p handle or <c>NULL</c> if
-\p handle’s interface does not have data allocated locally 
+\p handle’s interface does not have any data allocated locally.
 
 \fn enum starpu_data_interface_id starpu_data_get_interface_id(starpu_data_handle_t handle)
 \ingroup API_Data_Interfaces
@@ -463,18 +478,18 @@ after calling the data unpacking operation.
 \ingroup API_Data_Interfaces
 
 \struct starpu_variable_interface
-Variable interface for a single data (not a vector, a matrix, a list, ...)
 \ingroup API_Data_Interfaces
+Variable interface for a single data (not a vector, a matrix, a list, ...)
 \var enum starpu_data_interface_id starpu_variable_interface::id
-Identifier of the interface
+    Identifier of the interface
 \var uintptr_t starpu_variable_interface::ptr
-local pointer of the variable
+    local pointer of the variable
 \var uintptr_t starpu_variable_interface::dev_handle
-device handle of the variable.
+    device handle of the variable.
 \var size_t starpu_variable_interface::offset
-offset in the variable
+    offset in the variable
 \var size_t starpu_variable_interface::elemsize
-size of the variable
+    size of the variable
 
 \fn size_t starpu_variable_get_elemsize(starpu_data_handle_t handle)
 \ingroup API_Data_Interfaces
@@ -495,7 +510,7 @@ Return the size of the variable designated by \p interface.
 \def STARPU_VARIABLE_GET_DEV_HANDLE(interface)
 \ingroup API_Data_Interfaces
 Return a device handle for the variable designated by
-\p interface, to be used on OpenCL. The offset documented below has to be
+\p interface, to be used with OpenCL. The offset documented below has to be
 used in addition to this.
 
 \def STARPU_VARIABLE_GET_OFFSET(interface)
@@ -510,19 +525,19 @@ be used with the device handle.
 Vector interface
 \ingroup API_Data_Interfaces
 \var enum starpu_data_interface_id starpu_vector_interface::id
-Identifier of the interface
+    Identifier of the interface
 \var uintptr_t starpu_vector_interface::ptr
-local pointer of the vector
+    local pointer of the vector
 \var uintptr_t starpu_vector_interface::dev_handle
-device handle of the vector.
+    device handle of the vector.
 \var size_t starpu_vector_interface::offset
-offset in the vector
+    offset in the vector
 \var uint32_t starpu_vector_interface::nx
-number of elements on the x-axis of the vector
+    number of elements on the x-axis of the vector
 \var size_t starpu_vector_interface::elemsize
-size of the elements of the vector
+    size of the elements of the vector
 \var uint32_t starpu_vector_interface::slice_base
-vector slice base, used by the StarPU OpenMP runtime support
+    vector slice base, used by the StarPU OpenMP runtime support
 
 \fn uint32_t starpu_vector_get_nx(starpu_data_handle_t handle)
 \ingroup API_Data_Interfaces
@@ -545,7 +560,7 @@ be used instead.
 \def STARPU_VECTOR_GET_DEV_HANDLE(interface)
 \ingroup API_Data_Interfaces
 Return a device handle for the array designated by \p interface,
-to be used on OpenCL. the offset documented below has to be used in
+to be used with OpenCL. The offset documented below has to be used in
 addition to this.
 
 \def STARPU_VECTOR_GET_OFFSET(interface)
@@ -577,22 +592,22 @@ Return the OpenMP slice base annotation of each element of the array designated
 Matrix interface for dense matrices
 \ingroup API_Data_Interfaces
 \var enum starpu_data_interface_id starpu_matrix_interface::id
-Identifier of the interface
+    Identifier of the interface
 \var uintptr_t starpu_matrix_interface::ptr
-local pointer of the matrix
+    local pointer of the matrix
 \var uintptr_t starpu_matrix_interface::dev_handle
-device handle of the matrix.
+    device handle of the matrix.
 \var size_t starpu_matrix_interface::offset
-offset in the matrix
+    offset in the matrix
 \var uint32_t starpu_matrix_interface::nx
-number of elements on the x-axis of the matrix
+    number of elements on the x-axis of the matrix
 \var uint32_t starpu_matrix_interface::ny
-number of elements on the y-axis of the matrix
+    number of elements on the y-axis of the matrix
 \var uint32_t starpu_matrix_interface::ld
-number of elements between each row of the matrix. Maybe be equal to
-starpu_matrix_interface::nx when there is no padding.
+    number of elements between each row of the matrix. May be equal
+    to starpu_matrix_interface::nx when there is no padding.
 \var size_t starpu_matrix_interface::elemsize
-size of the elements of the matrix
+    size of the elements of the matrix
 
 \fn uint32_t starpu_matrix_get_nx(starpu_data_handle_t handle)
 \ingroup API_Data_Interfaces
@@ -627,7 +642,7 @@ and offset need to be used instead.
 \def STARPU_MATRIX_GET_DEV_HANDLE(interface)
 \ingroup API_Data_Interfaces
 Return a device handle for the matrix designated by \p interface,
-to be used on OpenCL. The offset documented below has to be used in
+to be used with OpenCL. The offset documented below has to be used in
 addition to this.
 
 \def STARPU_MATRIX_GET_OFFSET(interface)
@@ -662,25 +677,25 @@ designated by \p interface.
 Block interface for 3D dense blocks
 \ingroup API_Data_Interfaces
 \var enum starpu_data_interface_id starpu_block_interface::id
-identifier of the interface
+    identifier of the interface
 \var uintptr_t starpu_block_interface::ptr
-local pointer of the block
+    local pointer of the block
 \var uintptr_t starpu_block_interface::dev_handle
-device handle of the block.
+    device handle of the block.
 \var size_t starpu_block_interface::offset
-offset in the block.
+    offset in the block.
 \var uint32_t starpu_block_interface::nx
-number of elements on the x-axis of the block.
+    number of elements on the x-axis of the block.
 \var uint32_t starpu_block_interface::ny
-number of elements on the y-axis of the block.
+    number of elements on the y-axis of the block.
 \var uint32_t starpu_block_interface::nz
-number of elements on the z-axis of the block.
+    number of elements on the z-axis of the block.
 \var uint32_t starpu_block_interface::ldy
-number of elements between two lines
+    number of elements between two lines
 \var uint32_t starpu_block_interface::ldz
-number of elements between two planes
+    number of elements between two planes
 \var size_t starpu_block_interface::elemsize
-size of the elements of the block.
+    size of the elements of the block.
 
 \fn uint32_t starpu_block_get_nx(starpu_data_handle_t handle)
 \ingroup API_Data_Interfaces
@@ -770,25 +785,25 @@ BCSR interface for sparse matrices (blocked compressed sparse
 row representation)
 \ingroup API_Data_Interfaces
 \var enum starpu_data_interface_id starpu_bcsr_interface::id
-Identifier of the interface
+    Identifier of the interface
 \var uint32_t starpu_bcsr_interface::nnz
-number of non-zero BLOCKS
+    number of non-zero BLOCKS
 \var uint32_t starpu_bcsr_interface::nrow
-number of rows (in terms of BLOCKS)
+    number of rows (in terms of BLOCKS)
 \var uintptr_t starpu_bcsr_interface::nzval
-non-zero values
+    non-zero values
 \var uint32_t *starpu_bcsr_interface::colind
-position of non-zero entried on the row
+    position of non-zero entries on the row
 \var uint32_t *starpu_bcsr_interface::rowptr
-index (in nzval) of the first entry of the row
+    index (in nzval) of the first entry of the row
 \var starpu_bcsr_interface::firstentry
-k for k-based indexing (0 or 1 usually). Also useful when partitionning the matrix.
+    k for k-based indexing (0 or 1 usually). Also useful when partitioning the matrix.
 \var uint32_t starpu_bcsr_interface::r
-size of the blocks
+    size of the blocks
 \var uint32_t starpu_bcsr_interface::c
-size of the blocks
+    size of the blocks
 \var size_t starpu_bcsr_interface::elemsize;
-size of the elements of the matrix
+    size of the elements of the matrix
 
 \fn uint32_t starpu_bcsr_get_nnz(starpu_data_handle_t handle)
 \ingroup API_Data_Interfaces
@@ -883,21 +898,21 @@ matrix designated by \p interface, to be used with the device handles.
 CSR interface for sparse matrices (compressed sparse row representation)
 \ingroup API_Data_Interfaces
 \var enum starpu_data_interface_id starpu_csr_interface::id
-Identifier of the interface
+    Identifier of the interface
 \var uint32_t starpu_csr_interface::nnz
-number of non-zero entries
+    number of non-zero entries
 \var uint32_t starpu_csr_interface::nrow
-number of rows
+    number of rows
 \var uintptr_t starpu_csr_interface::nzval
-non-zero values
+    non-zero values
 \var uint32_t *starpu_csr_interface::colind
-position of non-zero entries on the row
+    position of non-zero entries on the row
 \var uint32_t *starpu_csr_interface::rowptr
-index (in nzval) of the first entry of the row
+    index (in nzval) of the first entry of the row
 \var uint32_t starpu_csr_interface::firstentry
-k for k-based indexing (0 or 1 usually). also useful when partitionning the matrix.
+    k for k-based indexing (0 or 1 usually). Also useful when partitioning the matrix.
 \var size_t starpu_csr_interface::elemsize
-size of the elements of the matrix
+    size of the elements of the matrix
 
 \fn uint32_t starpu_csr_get_nnz(starpu_data_handle_t handle)
 \ingroup API_Data_Interfaces
@@ -999,21 +1014,21 @@ designated by \p interface.
 COO Matrices
 \ingroup API_Data_Interfaces
 \var enum starpu_data_interface_id starpu_coo_interface::id
-identifier of the interface
+    identifier of the interface
 \var uint32_t  *starpu_coo_interface::columns
-column array of the matrix
+    column array of the matrix
 \var uint32_t  *starpu_coo_interface::rows
-row array of the matrix
+    row array of the matrix
 \var uintptr_t starpu_coo_interface::values
-values of the matrix
+    values of the matrix
 \var uint32_t  starpu_coo_interface::nx
-number of elements on the x-axis of the matrix
+    number of elements on the x-axis of the matrix
 \var uint32_t  starpu_coo_interface::ny
-number of elements on the y-axis of the matrix
+    number of elements on the y-axis of the matrix
 \var uint32_t  starpu_coo_interface::n_values
-number of values registered in the matrix
+    number of values registered in the matrix
 \var size_t starpu_coo_interface::elemsize
-size of the elements of the matrix
+    size of the elements of the matrix
 
 \def STARPU_COO_GET_COLUMNS(interface)
 \ingroup API_Data_Interfaces
@@ -1023,7 +1038,7 @@ by \p interface.
 \def STARPU_COO_GET_COLUMNS_DEV_HANDLE(interface)
 \ingroup API_Data_Interfaces
 Return a device handle for the column array of the matrix
-designated by \p interface, to be used on OpenCL. The offset documented
+designated by \p interface, to be used with OpenCL. The offset documented
 below has to be used in addition to this.
 
 \def STARPU_COO_GET_ROWS(interface)
@@ -1103,7 +1118,7 @@ with starpu_malloc_on_node().
 
 \fn void starpu_malloc_on_node_set_default_flags(unsigned node, int flags)
 \ingroup API_Data_Interfaces
-Define the defaultflags for allocations performed by starpu_malloc_on_node() and
+Define the default flags for allocations performed by starpu_malloc_on_node() and
 starpu_free_on_node(). The default is \ref STARPU_MALLOC_PINNED | \ref STARPU_MALLOC_COUNT.
 
 \fn int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, void *async_data)
@@ -1141,4 +1156,3 @@ Return the next available id for a newly created data interface
 (\ref DefiningANewDataInterface).
 
 */
-

+ 76 - 83
doc/doxygen/chapters/api/data_management.doxy

@@ -30,55 +30,48 @@ concurrent data accesses, see \ref ConcurrentDataAccess for the details.
 \ingroup API_Data_Management
 This datatype describes a data access mode.
 \var starpu_data_access_mode::STARPU_NONE
-\ingroup API_Data_Management
-TODO
+    TODO
 \var starpu_data_access_mode::STARPU_R
-\ingroup API_Data_Management
-read-only mode.
+    read-only mode.
 \var starpu_data_access_mode::STARPU_W
-\ingroup API_Data_Management
-write-only mode.
+    write-only mode.
 \var starpu_data_access_mode::STARPU_RW
-\ingroup API_Data_Management
-read-write mode. This is equivalent to ::STARPU_R|::STARPU_W
+    read-write mode. This is equivalent to ::STARPU_R|::STARPU_W
 \var starpu_data_access_mode::STARPU_SCRATCH
-\ingroup API_Data_Management
-A temporary buffer is allocated for the task, but StarPU does not
-enforce data consistency---i.e. each device has its own buffer,
-independently from each other (even for CPUs), and no data transfer is
-ever performed. This is useful for temporary variables to avoid
-allocating/freeing buffers inside each task. Currently, no behavior is
-defined concerning the relation with the ::STARPU_R and ::STARPU_W modes
-and the value provided at registration --- i.e., the value of the
-scratch buffer is undefined at entry of the codelet function.  It is
-being considered for future extensions at least to define the initial
-value.  For now, data to be used in ::STARPU_SCRATCH mode should be
-registered with node <c>-1</c> and a <c>NULL</c> pointer, since the
-value of the provided buffer is simply ignored for now.
+    A temporary buffer is allocated for the task, but StarPU does not
+    enforce data consistency---i.e. each device has its own buffer,
+    independently from each other (even for CPUs), and no data
+    transfer is ever performed. This is useful for temporary variables
+    to avoid allocating/freeing buffers inside each task. Currently,
+    no behavior is defined concerning the relation with the ::STARPU_R
+    and ::STARPU_W modes and the value provided at registration ---
+    i.e., the value of the scratch buffer is undefined at entry of the
+    codelet function.  It is being considered for future extensions at
+    least to define the initial value.  For now, data to be used in
+    ::STARPU_SCRATCH mode should be registered with node -1 and
+    a <c>NULL</c> pointer, since the value of the provided buffer is
+    simply ignored for now.
 \var starpu_data_access_mode::STARPU_REDUX
-\ingroup API_Data_Management
-todo
+    todo
 \var starpu_data_access_mode::STARPU_COMMUTE
-\ingroup API_Data_Management
-In addition to that, ::STARPU_COMMUTE can be passed along ::STARPU_W
-or ::STARPU_RW to express that StarPU can let tasks commute, which is
-useful e.g. when bringing a contribution into some data, which can be
-done in any order (but still require sequential consistency against
-reads or non-commutative writes).
+    ::STARPU_COMMUTE can be passed along
+    ::STARPU_W or ::STARPU_RW to express that StarPU can let tasks
+    commute, which is useful e.g. when bringing a contribution into
+    some data, which can be done in any order (but still require
+    sequential consistency against reads or non-commutative writes).
 \var starpu_data_access_mode::STARPU_SSEND
-\ingroup API_Data_Management
-used in starpu_mpi_insert_task() to specify the data has to be sent
-using a synchronous and non-blocking mode (see starpu_mpi_issend())
+    used in starpu_mpi_insert_task() to specify the data has to be
+    sent using a synchronous and non-blocking mode (see
+    starpu_mpi_issend())
 \var starpu_data_access_mode::STARPU_LOCALITY
-\ingroup API_Data_Management
-used to tell the scheduler which data is the most important for the task, and
-should thus be used to try to group tasks on the same core or cache, etc. For
-now only the ws and lws schedulers take this flag into account, and only when
-rebuild with USE_LOCALITY flag defined in the
-src/sched_policies/work_stealing_policy.c source code.
+    used to tell the scheduler which data is the most important for
+    the task, and should thus be used to try to group tasks on the
+    same core or cache, etc. For now only the ws and lws schedulers
+    take this flag into account, and only when rebuild with
+    USE_LOCALITY flag defined in the
+    src/sched_policies/work_stealing_policy.c source code.
 \var starpu_data_access_mode::STARPU_ACCESS_MODE_MAX
-\ingroup API_Data_Management
-todo
+    todo
 
 @name Basic Data Management API
 \ingroup API_Data_Management
@@ -145,7 +138,7 @@ same interface as the handle \p handlesrc.
 
 \fn void starpu_data_unregister(starpu_data_handle_t handle)
 \ingroup API_Data_Management
-This function unregisters a data handle from StarPU. If the
+Unregister a data \p handle from StarPU. If the
 data was automatically allocated by StarPU because the home node was
 -1, all automatically allocated buffers are freed. Otherwise, a valid
 copy of the data is put back into the home node in the buffer that was
@@ -162,61 +155,61 @@ buffer that was initially registered.
 
 \fn void starpu_data_unregister_submit(starpu_data_handle_t handle)
 \ingroup API_Data_Management
-Destroy the data handle once it is not needed anymore by any
+Destroy the data \p handle once it is not needed anymore by any
 submitted task. No coherency is assumed.
 
 \fn void starpu_data_invalidate(starpu_data_handle_t handle)
 \ingroup API_Data_Management
-Destroy all replicates of the data handle immediately. After
-data invalidation, the first access to the handle must be performed in
-write-only mode. Accessing an invalidated data in read-mode results in
-undefined behaviour.
+Destroy all replicates of the data \p handle immediately. After
+data invalidation, the first access to \p handle must be performed in
+::STARPU_W mode. Accessing an invalidated data in ::STARPU_R mode
+results in undefined behaviour.
 
 \fn void starpu_data_invalidate_submit(starpu_data_handle_t handle)
 \ingroup API_Data_Management
-Submits invalidation of the data handle after completion of
+Submit invalidation of the data \p handle after completion of
 previously submitted tasks.
 
 \fn void starpu_data_set_wt_mask(starpu_data_handle_t handle, uint32_t wt_mask)
 \ingroup API_Data_Management
-This function sets the write-through mask of a given data (and
+Set the write-through mask of the data \p handle (and
 its children), i.e. a bitmask of nodes where the data should be always
 replicated after modification. It also prevents the data from being
 evicted from these nodes when memory gets scarce. When the data is
-modified, it is automatically transfered into those memory node. For
+modified, it is automatically transferred into those memory nodes. For
 instance a <c>1<<0</c> write-through mask means that the CUDA workers
 will commit their changes in main memory (node 0).
 
 \fn int starpu_data_fetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
 \ingroup API_Data_Management
-Issue a fetch request for a given data to a given node, i.e.
+Issue a fetch request for the data \p handle to \p node, i.e.
 requests that the data be replicated to the given node as soon as possible, so that it is
-available there for tasks. If the \p async parameter is 0, the call will
+available there for tasks. If \p async is 0, the call will
 block until the transfer is achieved, else the call will return immediately,
 after having just queued the request. In the latter case, the request will
 asynchronously wait for the completion of any task writing on the data.
 
 \fn int starpu_data_prefetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
 \ingroup API_Data_Management
-Issue a prefetch request for a given data to a given node, i.e.
-requests that the data be replicated to the given node when there is room for it, so that it is
-available there for tasks. If the \p async parameter is 0, the call will
+Issue a prefetch request for the data \p handle to \p node, i.e.
+requests that the data be replicated to \p node when there is room for it, so that it is
+available there for tasks. If \p async is 0, the call will
 block until the transfer is achieved, else the call will return immediately,
 after having just queued the request. In the latter case, the request will
 asynchronously wait for the completion of any task writing on the data.
 
 \fn int starpu_data_idle_prefetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
 \ingroup API_Data_Management
-Issue an idle prefetch request for a given data to a given node, i.e.
-requests that the data be replicated to the given node, so that it is
-available there for tasks, but only when the bus is really idle. If the \p async parameter is 0, the call will
+Issue an idle prefetch request for the data \p handle to \p node, i.e.
+requests that the data be replicated to \p node, so that it is
+available there for tasks, but only when the bus is really idle. If \p async is 0, the call will
 block until the transfer is achieved, else the call will return immediately,
 after having just queued the request. In the latter case, the request will
 asynchronously wait for the completion of any task writing on the data.
 
 \fn void starpu_data_wont_use(starpu_data_handle_t handle)
 \ingroup API_Data_Management
-Advise StarPU that this handle will not be used in the close future, and is
+Advise StarPU that \p handle will not be used in the close future, and is
 thus a good candidate for eviction from GPUs. StarPU will thus write its value
 back to its home node when the bus is idle, and select this data in priority
 for eviction when memory gets low.
@@ -228,7 +221,7 @@ Return the handle corresponding to the data pointed to by the \p ptr host pointe
 \fn int starpu_data_request_allocation(starpu_data_handle_t handle, unsigned node)
 \ingroup API_Data_Management
 Explicitly ask StarPU to allocate room for a piece of data on
-the specified memory node.
+the specified memory \p node.
 
 \fn void starpu_data_query_status(starpu_data_handle_t handle, int memory_node, int *is_allocated, int *is_valid, int *is_requested)
 \ingroup API_Data_Management
@@ -236,13 +229,12 @@ Query the status of \p handle on the specified \p memory_node.
 
 \fn void starpu_data_advise_as_important(starpu_data_handle_t handle, unsigned is_important)
 \ingroup API_Data_Management
-This function allows to specify that a piece of data can be
-discarded without impacting the application.
+Specify that the data \p handle can be discarded without impacting the application.
 
 \fn void starpu_data_set_reduction_methods(starpu_data_handle_t handle, struct starpu_codelet *redux_cl, struct starpu_codelet *init_cl)
 \ingroup API_Data_Management
-This sets the codelets to be used for \p handle when it is
-accessed in the mode ::STARPU_REDUX. Per-worker buffers will be initialized with
+Set the codelets to be used for \p handle when it is accessed in the
+mode ::STARPU_REDUX. Per-worker buffers will be initialized with
 the codelet \p init_cl, and reduction between per-worker buffers will be
 done with the codelet \p redux_cl.
 
@@ -252,14 +244,14 @@ todo
 
 \fn void starpu_data_set_user_data(starpu_data_handle_t handle, void* user_data)
 \ingroup API_Data_Management
-This sets the "user_data" field for the \p handle to \p user_data . It can
-then be retrieved with starpu_data_get_user_data. \p user_data can be any
+Set the field \c user_data for the \p handle to \p user_data . It can
+then be retrieved with starpu_data_get_user_data(). \p user_data can be any
 application-defined value, for instance a pointer to an object-oriented
 container for the data.
 
 \fn void *starpu_data_get_user_data(starpu_data_handle_t handle)
 \ingroup API_Data_Management
-This retrieves the "user_data" field previously set for the \p handle .
+Retrieve the field \c user_data previously set for the \p handle.
 
 @name Access registered data from the application
 \ingroup API_Data_Management
@@ -268,10 +260,10 @@ This retrieves the "user_data" field previously set for the \p handle .
 \ingroup API_Data_Management
 The application must call this function prior to accessing
 registered data from main memory outside tasks. StarPU ensures that
-the application will get an up-to-date copy of the data in main memory
+the application will get an up-to-date copy of \p handle in main memory
 located where the data was originally registered, and that all
 concurrent accesses (e.g. from tasks) will be consistent with the
-access mode specified in the mode argument. starpu_data_release() must
+access mode specified with \p mode. starpu_data_release() must
 be called once the application does not need to access the piece of
 data anymore. Note that implicit data dependencies are also enforced
 by starpu_data_acquire(), i.e. starpu_data_acquire() will wait for all
@@ -285,9 +277,9 @@ successful completion, this function returns 0.
 \fn int starpu_data_acquire_cb(starpu_data_handle_t handle, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg)
 \ingroup API_Data_Management
 Asynchronous equivalent of starpu_data_acquire(). When the data
-specified in \p handle is available in the appropriate access
-mode, the \p callback function is executed. The application may access
-the requested data during the execution of this \p callback. The \p callback
+specified in \p handle is available in the access \p mode, the \p
+callback function is executed. The application may access
+the requested data during the execution of \p callback. The \p callback
 function must call starpu_data_release() once the application does not
 need to access the piece of data anymore. Note that implicit data
 dependencies are also enforced by starpu_data_acquire_cb() in case they
@@ -298,8 +290,8 @@ completion, this function returns 0.
 \fn int starpu_data_acquire_cb_sequential_consistency(starpu_data_handle_t handle, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg, int sequential_consistency)
 \ingroup API_Data_Management
 Equivalent of starpu_data_acquire_cb() with the possibility of enabling or disabling data dependencies.
-When the data specified in \p handle is available in the appropriate access
-mode, the \p callback function is executed. The application may access
+When the data specified in \p handle is available in the access
+\p mode, the \p callback function is executed. The application may access
 the requested data during the execution of this \p callback. The \p callback
 function must call starpu_data_release() once the application does not
 need to access the piece of data anymore. Note that implicit data
@@ -324,16 +316,16 @@ This is mostly useful inside starpu only.
 This is the same as starpu_data_acquire(), except that the data
 will be available on the given memory node instead of main
 memory.
-::STARPU_ACQUIRE_NO_NODE and ::STARPU_ACQUIRE_NO_NODE_LOCK_ALL can be used instead of an
-explicit node number.
+::STARPU_ACQUIRE_NO_NODE and ::STARPU_ACQUIRE_NO_NODE_LOCK_ALL can be
+used instead of an explicit node number.
 
 \fn int starpu_data_acquire_on_node_cb(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg)
 \ingroup API_Data_Management
 This is the same as starpu_data_acquire_cb(), except that the
 data will be available on the given memory node instead of main
 memory.
-::STARPU_ACQUIRE_NO_NODE and ::STARPU_ACQUIRE_NO_NODE_LOCK_ALL can be used instead of an
-explicit node number.
+::STARPU_ACQUIRE_NO_NODE and ::STARPU_ACQUIRE_NO_NODE_LOCK_ALL can be
+used instead of an explicit node number.
 
 \fn int starpu_data_acquire_on_node_cb_sequential_consistency(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg, int sequential_consistency)
 \ingroup API_Data_Management
@@ -354,7 +346,7 @@ called from task callbacks.
 
 \fn void starpu_data_release(starpu_data_handle_t handle)
 \ingroup API_Data_Management
-This function releases the piece of data acquired by the
+Release the piece of data acquired by the
 application either by starpu_data_acquire() or by
 starpu_data_acquire_cb().
 
@@ -362,19 +354,20 @@ starpu_data_acquire_cb().
 \ingroup API_Data_Management
 This is the same as starpu_data_release(), except that the data
 will be available on the given memory \p node instead of main memory.
-The \p node parameter must be exactly the same as the corresponding starpu_data_acquire_on_node* call.
+The \p node parameter must be exactly the same as the corresponding \c
+starpu_data_acquire_on_node* call.
 
 \fn starpu_arbiter_t starpu_arbiter_create(void)
 \ingroup API_Data_Management
-This creates a data access arbiter, see \ref ConcurrentDataAccess for the details
+Create a data access arbiter, see \ref ConcurrentDataAccess for the details
 
 \fn void starpu_data_assign_arbiter(starpu_data_handle_t handle, starpu_arbiter_t arbiter)
 \ingroup API_Data_Management
-This makes accesses to \p handle managed by \p arbiter
+Make access to \p handle managed by \p arbiter
 
 \fn void starpu_arbiter_destroy(starpu_arbiter_t arbiter)
 \ingroup API_Data_Management
-This destroys the \p arbiter . This must only be called after all data assigned
-to it have been unregistered.
+Destroy the \p arbiter . This must only be called after all data
+assigned to it have been unregistered.
 
 */

+ 42 - 46
doc/doxygen/chapters/api/data_partition.doxy

@@ -13,31 +13,32 @@ The filter structure describes a data partitioning operation, to be
 given to the starpu_data_partition() function.
 \ingroup API_Data_Partition
 \var void (*starpu_data_filter::filter_func)(void *father_interface, void *child_interface, struct starpu_data_filter *, unsigned id, unsigned nparts)
-This function fills the \p child_interface structure with interface
-information for the \p id -th child of the parent \p father_interface (among
-\p nparts).
+    Fill the \p child_interface structure with interface information
+    for the \p id -th child of the parent \p father_interface (among
+    \p nparts).
 \var unsigned starpu_data_filter::nchildren
-This is the number of parts to partition the data into.
+    Number of parts to partition the data into.
 \var unsigned (*starpu_data_filter::get_nchildren)(struct starpu_data_filter *, starpu_data_handle_t initial_handle)
-This returns the number of children. This can be used instead of
-starpu_data_filter::nchildren when the number of children depends on the actual data (e.g.
-the number of blocks in a sparse matrix).
+    Return the number of children. This can be used instead of
+    starpu_data_filter::nchildren when the number of children depends
+    on the actual data (e.g. the number of blocks in a sparse matrix).
 \var struct starpu_data_interface_ops *(*starpu_data_filter::get_child_ops)(struct starpu_data_filter *, unsigned id)
-In case the resulting children use a different data interface, this
-function returns which interface is used by child number \p id.
+    In case the resulting children use a different data interface,
+    this function returns which interface is used by child number \p
+    id.
 \var unsigned starpu_data_filter::filter_arg
-Allow to define an additional parameter for the filter function.
+    Allow to define an additional parameter for the filter function.
 \var void *starpu_data_filter::filter_arg_ptr
-Allow to define an additional pointer parameter for the filter
-function, such as the sizes of the different parts.
+    Allow to define an additional pointer parameter for the filter
+    function, such as the sizes of the different parts.
 
 @name Basic API
 \ingroup API_Data_Partition
 
 \fn void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_data_filter *f)
 \ingroup API_Data_Partition
-This requests partitioning one StarPU data \p initial_handle into
-several subdata according to the filter \p f.
+Request the partitioning of \p initial_handle into several subdata
+according to the filter \p f.
 
 Here an example of how to use the function.
 \code{.c}
@@ -50,10 +51,11 @@ starpu_data_partition(A_handle, &f);
 
 \fn void starpu_data_unpartition(starpu_data_handle_t root_data, unsigned gathering_node)
 \ingroup API_Data_Partition
-This unapplies one filter, thus unpartitioning the data. The
-pieces of data are collected back into one big piece in the
-\p gathering_node (usually ::STARPU_MAIN_RAM). Tasks working on the partitioned data must
-be already finished when calling starpu_data_unpartition().
+Unapply the filter which has been applied to \p root_data, thus
+unpartitioning the data. The pieces of data are collected back into
+one big piece in the \p gathering_node (usually ::STARPU_MAIN_RAM).
+Tasks working on the partitioned data must be already finished when
+calling starpu_data_unpartition().
 
 Here an example of how to use the function.
 \code{.c}
@@ -62,7 +64,7 @@ starpu_data_unpartition(A_handle, STARPU_MAIN_RAM);
 
 \fn int starpu_data_get_nb_children(starpu_data_handle_t handle)
 \ingroup API_Data_Partition
-This function returns the number of children.
+Return the number of children \p handle has been partitioned into.
 
 \fn starpu_data_handle_t starpu_data_get_child(starpu_data_handle_t handle, unsigned i)
 \ingroup API_Data_Partition
@@ -91,13 +93,13 @@ va_list for the parameter list.
 
 \fn void starpu_data_map_filters(starpu_data_handle_t root_data, unsigned nfilters, ...)
 \ingroup API_Data_Partition
-Applies \p nfilters filters to the handle designated by
+Apply \p nfilters filters to the handle designated by
 \p root_data recursively. \p nfilters pointers to variables of the type
 starpu_data_filter should be given.
 
 \fn void starpu_data_vmap_filters(starpu_data_handle_t root_data, unsigned nfilters, va_list pa)
 \ingroup API_Data_Partition
-Applies \p nfilters filters to the handle designated by
+Apply \p nfilters filters to the handle designated by
 \p root_data recursively. It uses a va_list of pointers to variables of
 the type starpu_data_filter.
 
@@ -106,11 +108,12 @@ the type starpu_data_filter.
 
 \fn void starpu_data_partition_plan(starpu_data_handle_t initial_handle, struct starpu_data_filter *f, starpu_data_handle_t *children)
 \ingroup API_Data_Partition
-This plans for partitioning one StarPU data handle \p initial_handle into
-several subdata according to the filter \p f. The handles are returned into
-the \p children array, which has to be the same size as the number of parts
-described in \p f. These handles are not immediately usable,
-starpu_data_partition_submit() has to be called to submit the actual partitioning.
+Plan to partition \p initial_handle into several subdata according to
+the filter \p f.
+The handles are returned into the \p children array, which has to be
+the same size as the number of parts described in \p f. These handles
+are not immediately usable, starpu_data_partition_submit() has to be
+called to submit the actual partitioning.
 
 Here is an example of how to use the function:
 
@@ -125,8 +128,7 @@ starpu_data_partition_plan(A_handle, &f, children);
 
 \fn void starpu_data_partition_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children)
 \ingroup API_Data_Partition
-
-This submits the actual partitioning of \p initial_handle into the \p nparts
+Submit the actual partitioning of \p initial_handle into the \p nparts
 \p children handles. This call is asynchronous, it only submits that the
 partitioning should be done, so that the \p children handles can now be used to
 submit tasks, and \p initial_handle can not be used to submit tasks any more (to
@@ -140,7 +142,6 @@ starpu_data_partition_submit(A_handle, nslicesx, children);
 
 \fn void starpu_data_partition_readonly_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children)
 \ingroup API_Data_Partition
-
 This is the same as starpu_data_partition_submit(), but it does not invalidate \p
 initial_handle. This allows to continue using it, but the application has to be
 careful not to write to \p initial_handle or \p children handles, only read from
@@ -158,7 +159,6 @@ submitted.
 
 \fn void starpu_data_partition_readwrite_upgrade_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children)
 \ingroup API_Data_Partition
-
 This assumes that a partitioning of \p initial_handle has already been submitted
 in readonly mode through starpu_data_partition_readonly_submit(), and will upgrade
 that partitioning into read-write mode for the \p children, by invalidating \p
@@ -166,7 +166,6 @@ initial_handle, and adding the necessary dependencies.
 
 \fn void starpu_data_unpartition_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children, int gathering_node)
 \ingroup API_Data_Partition
-
 This assumes that \p initial_handle is partitioned into \p children, and submits
 an unpartitioning of it, i.e. submitting a gathering of the pieces on the
 requested \p gathering_node memory node, and submitting an invalidation of the
@@ -177,7 +176,6 @@ should be used to gather the pieces.
 
 \fn void starpu_data_unpartition_readonly_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children, int gathering_node)
 \ingroup API_Data_Partition
-
 This assumes that \p initial_handle is partitioned into \p children, and submits
 just a readonly unpartitioning of it, i.e. submitting a gathering of the pieces
 on the requested \p gathering_node memory node. It does not invalidate the
@@ -189,7 +187,6 @@ should be used to gather the pieces.
 
 \fn void starpu_data_partition_clean(starpu_data_handle_t root_data, unsigned nparts, starpu_data_handle_t *children)
 \ingroup API_Data_Partition
-
 This should be used to clear the partition planning established between \p
 root_data and \p children with starpu_data_partition_plan(). This will notably
 submit an unregistration of all the \p children, which can thus not be used any more
@@ -246,13 +243,13 @@ functions for matrix data. Examples on how to use them are shown in
 
 \fn void starpu_matrix_filter_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
-This partitions a dense Matrix along the x dimension, thus
+Partition a dense Matrix along the x dimension, thus
 getting (x/\p nparts ,y) matrices. If \p nparts does not divide x, the
 last submatrix contains the remainder.
 
 \fn void starpu_matrix_filter_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
-This partitions a dense Matrix along the x dimension, with a
+Partition a dense Matrix along the x dimension, with a
 shadow border <c>filter_arg_ptr</c>, thus getting ((x-2*shadow)/\p
 nparts +2*shadow,y) matrices. If \p nparts does not divide x-2*shadow,
 the last submatrix contains the remainder.
@@ -264,13 +261,13 @@ examples/filters/shadow2d.c
 
 \fn void starpu_matrix_filter_vertical_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
-This partitions a dense Matrix along the y dimension, thus
+Partition a dense Matrix along the y dimension, thus
 getting (x,y/\p nparts) matrices. If \p nparts does not divide y, the
 last submatrix contains the remainder.
 
 \fn void starpu_matrix_filter_vertical_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
-This partitions a dense Matrix along the y dimension, with a
+Partition a dense Matrix along the y dimension, with a
 shadow border <c>filter_arg_ptr</c>, thus getting
 (x,(y-2*shadow)/\p nparts +2*shadow) matrices. If \p nparts does not
 divide y-2*shadow, the last submatrix contains the remainder.
@@ -290,13 +287,13 @@ examples/filters/shadow3d.c
 
 \fn void starpu_block_filter_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
-This partitions a block along the X dimension, thus getting
+Partition a block along the X dimension, thus getting
 (x/\p nparts ,y,z) 3D matrices. If \p nparts does not divide x, the last
 submatrix contains the remainder.
 
 \fn void starpu_block_filter_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
-This partitions a block along the X dimension, with a
+Partition a block along the X dimension, with a
 shadow border <c>filter_arg_ptr</c>, thus getting
 ((x-2*shadow)/\p nparts +2*shadow,y,z) blocks. If \p nparts does not
 divide x, the last submatrix contains the remainder.
@@ -307,13 +304,13 @@ enforced for the shadowed parts.
 
 \fn void starpu_block_filter_vertical_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
-This partitions a block along the Y dimension, thus getting
+Partition a block along the Y dimension, thus getting
 (x,y/\p nparts ,z) blocks. If \p nparts does not divide y, the last
 submatrix contains the remainder.
 
 \fn void starpu_block_filter_vertical_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
-This partitions a block along the Y dimension, with a
+Partition a block along the Y dimension, with a
 shadow border <c>filter_arg_ptr</c>, thus getting
 (x,(y-2*shadow)/\p nparts +2*shadow,z) 3D matrices. If \p nparts does not
 divide y, the last submatrix contains the remainder.
@@ -324,13 +321,13 @@ enforced for the shadowed parts.
 
 \fn void starpu_block_filter_depth_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
-This partitions a block along the Z dimension, thus getting
+Partition a block along the Z dimension, thus getting
 (x,y,z/\p nparts) blocks. If \p nparts does not divide z, the last
 submatrix contains the remainder.
 
 \fn void starpu_block_filter_depth_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
-This partitions a block along the Z dimension, with a
+Partition a block along the Z dimension, with a
 shadow border <c>filter_arg_ptr</c>, thus getting
 (x,y,(z-2*shadow)/\p nparts +2*shadow) blocks. If \p nparts does not
 divide z, the last submatrix contains the remainder.
@@ -349,11 +346,10 @@ functions for BCSR data. Examples on how to use them are shown in
 
 \fn void starpu_bcsr_filter_canonical_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
-This partitions a block-sparse matrix into dense matrices.
+Partition a block-sparse matrix into dense matrices.
 
 \fn void starpu_csr_filter_vertical_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
-This partitions a block-sparse matrix into vertical
-block-sparse matrices.
+Partition a block-sparse matrix into vertical block-sparse matrices.
 
 */

+ 9 - 10
doc/doxygen/chapters/api/explicit_dependencies.doxy

@@ -24,7 +24,7 @@ redundancy in the task dependencies.
 
 \fn int starpu_task_get_task_succs(struct starpu_task *task, unsigned ndeps, struct starpu_task *task_array[])
 \ingroup API_Explicit_Dependencies
-Fills \p task_array with the list of tasks which are direct children of \p task.
+Fill \p task_array with the list of tasks which are direct children of \p task.
 \p ndeps is the size of \p task_array.  This function returns the number of
 direct children. \p task_array can be set to <c>NULL</c> if \p ndeps is 0, which allows
 to compute the number of children before allocating an array to store them.
@@ -34,13 +34,13 @@ dependency has been added in the meanwhile.
 
 \fn int starpu_task_get_task_scheduled_succs(struct starpu_task *task, unsigned ndeps, struct starpu_task *task_array[])
 \ingroup API_Explicit_Dependencies
-This behaves like starpu_task_get_task_succs(), except that it only reports
+Behave like starpu_task_get_task_succs(), except that it only reports
 tasks which will go through the scheduler, thus avoiding tasks with no codelet,
 or with explicit placement.
 
 \typedef starpu_tag_t
 \ingroup API_Explicit_Dependencies
-This type defines a task logical identifer. It is possible to
+Define a task logical identifier. It is possible to
 associate a task with a unique <em>tag</em> chosen by the application,
 and to express dependencies between tasks by the means of those tags.
 To do so, fill the field starpu_task::tag_id with a tag number (can be
@@ -74,7 +74,7 @@ starpu_tag_declare_deps((starpu_tag_t)0x1, 2, (starpu_tag_t)0x32, (starpu_tag_t)
 
 \fn void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t *array)
 \ingroup API_Explicit_Dependencies
-This function is similar to starpu_tag_declare_deps(), except
+Similar to starpu_tag_declare_deps(), except
 that it does not take a variable number of arguments but an \p array of
 tags of size \p ndeps.
 
@@ -86,7 +86,7 @@ starpu_tag_declare_deps_array((starpu_tag_t)0x1, 2, tag_array);
 
 \fn int starpu_tag_wait(starpu_tag_t id)
 \ingroup API_Explicit_Dependencies
-This function blocks until the task associated to tag \p id has
+Block until the task associated to tag \p id has
 been executed. This is a blocking call which must therefore not be
 called within tasks or callbacks, but only from the application
 directly. It is possible to synchronize with the same tag multiple
@@ -97,27 +97,26 @@ starpu_task::destroy was enabled).
 
 \fn int starpu_tag_wait_array(unsigned ntags, starpu_tag_t *id)
 \ingroup API_Explicit_Dependencies
-This function is similar to starpu_tag_wait() except that it
+Similar to starpu_tag_wait() except that it
 blocks until all the \p ntags tags contained in the array \p id are
 terminated.
 
 \fn void starpu_tag_restart(starpu_tag_t id)
 \ingroup API_Explicit_Dependencies
-This function can be used to clear the <em>already
-notified</em> status of a tag which is not associated with a task.
+Clear the <em>already notified</em> status of a tag which is not associated with a task.
 Before that, calling starpu_tag_notify_from_apps() again will not
 notify the successors. After that, the next call to
 starpu_tag_notify_from_apps() will notify the successors.
 
 \fn void starpu_tag_remove(starpu_tag_t id)
 \ingroup API_Explicit_Dependencies
-This function releases the resources associated to tag \p id.
+Release the resources associated to tag \p id.
 It can be called once the corresponding task has been executed and
 when there is no other tag that depends on this tag anymore.
 
 \fn void starpu_tag_notify_from_apps(starpu_tag_t id)
 \ingroup API_Explicit_Dependencies
-This function explicitly unlocks tag \p id. It may be useful in
+Explicitly unlock tag \p id. It may be useful in
 the case of applications which execute part of their computation
 outside StarPU tasks (e.g. third-party libraries). It is also provided
 as a convenient tool for the programmer, for instance to entirely

+ 5 - 5
doc/doxygen/chapters/api/fft_support.doxy

@@ -18,7 +18,7 @@ todo
 
 \fn void * starpufft_malloc(size_t n)
 \ingroup API_FFT_Support
-Allocates memory for \p n bytes. This is preferred over \c malloc(),
+Allocate memory for \p n bytes. This is preferred over \c malloc(),
 since it allocates pinned memory, which allows overlapped transfers.
 
 \fn void * starpufft_free(void *p)
@@ -27,12 +27,12 @@ Release memory previously allocated.
 
 \fn struct starpufft_plan * starpufft_plan_dft_1d(int n, int sign, unsigned flags)
 \ingroup API_FFT_Support
-Initializes a plan for 1D FFT of size \p n. \p sign can be STARPUFFT_FORWARD
+Initialize a plan for 1D FFT of size \p n. \p sign can be STARPUFFT_FORWARD
 or STARPUFFT_INVERSE. \p flags must be 0.
 
 \fn struct starpufft_plan * starpufft_plan_dft_2d(int n, int m, int sign, unsigned flags)
 \ingroup API_FFT_Support
-Initializes a plan for 2D FFT of size (\p n, \p m). \p sign can be
+Initialize a plan for 2D FFT of size (\p n, \p m). \p sign can be
 STARPUFFT_FORWARD or STARPUFFT_INVERSE. \p flags must be 0.
 
 \fn struct starpu_task * starpufft_start(starpufft_plan p, void *in, void *out)
@@ -60,11 +60,11 @@ the expected types). This submits and waits for the task.
 
 \fn void starpufft_cleanup(starpufft_plan p)
 \ingroup API_FFT_Support
-Releases data for plan \p p, in the starpufft_start() case.
+Release data for plan \p p, in the starpufft_start() case.
 
 \fn void starpufft_destroy_plan(starpufft_plan p)
 \ingroup API_FFT_Support
-Destroys plan \p p, i.e. release all CPU (fftw) and GPU (cufft)
+Destroy plan \p p, i.e. release all CPU (fftw) and GPU (cufft)
 resources.
 
 */

+ 27 - 16
doc/doxygen/chapters/api/fxt_support.doxy

@@ -12,47 +12,58 @@
 todo
 \ingroup API_FxT_Support
 \var char starpu_fxt_codelet_event::symbol[256]
-name of the codelet
+    name of the codelet
 \var int starpu_fxt_codelet_event::workerid
+    todo
 \var char starpu_fxt_codelet_event::perfmodel_archname[256]
+    todo
 \var uint32_t starpu_fxt_codelet_event::hash
+    todo
 \var size_t starpu_fxt_codelet_event::size
+    todo
 \var float starpu_fxt_codelet_event::time
+    todo
 
 \struct starpu_fxt_options
 todo
 \ingroup API_FxT_Support
 \var unsigned starpu_fxt_options::per_task_colour
+    todo
 \var unsigned starpu_fxt_options::no_counter
+    todo
 \var unsigned starpu_fxt_options::no_bus
+    todo
 \var unsigned starpu_fxt_options::ninputfiles
+    todo
 \var char *starpu_fxt_options::filenames[STARPU_FXT_MAX_FILES]
+    todo
 \var char *starpu_fxt_options::out_paje_path
+    todo
 \var char *starpu_fxt_options::distrib_time_path
+    todo
 \var char *starpu_fxt_options::activity_path
+    todo
 \var char *starpu_fxt_options::dag_path
-
+    todo
 \var char *starpu_fxt_options::file_prefix
-In case we are going to gather multiple traces (e.g in the case of MPI
-processes), we may need to prefix the name of the containers.
+    In case we are going to gather multiple traces (e.g in the case of
+    MPI processes), we may need to prefix the name of the containers.
 \var uint64_t starpu_fxt_options::file_offset
-In case we are going to gather multiple traces (e.g in the case of MPI
-processes), we may need to prefix the name of the containers.
+    In case we are going to gather multiple traces (e.g in the case of
+    MPI processes), we may need to prefix the name of the containers.
 \var int starpu_fxt_options::file_rank
-In case we are going to gather multiple traces (e.g in the case of MPI
-processes), we may need to prefix the name of the containers.
-
+    In case we are going to gather multiple traces (e.g in the case of
+    MPI processes), we may need to prefix the name of the containers.
 \var char starpu_fxt_options::worker_names[STARPU_NMAXWORKERS][256]
-Output parameters
+    Output parameters
 \var struct starpu_perfmodel_arch starpu_fxt_options::worker_archtypes[STARPU_NMAXWORKERS]
-Output parameters
+    Output parameters
 \var int starpu_fxt_options::nworkers
-Output parameters
-
+    Output parameters
 \var struct starpu_fxt_codelet_event **starpu_fxt_options::dumped_codelets
-In case we want to dump the list of codelets to an external tool
+    In case we want to dump the list of codelets to an external tool
 \var long starpu_fxt_options::dumped_codelets_count
-In case we want to dump the list of codelets to an external tool
+    In case we want to dump the list of codelets to an external tool
 
 \fn void starpu_fxt_options_init(struct starpu_fxt_options *options)
 \ingroup API_FxT_Support
@@ -79,7 +90,7 @@ start recording it again, etc.
 
 \fn void starpu_fxt_autostart_profiling(int autostart)
 \ingroup API_FxT_Support
-Determines whether profiling should be started by starpu_init(), or only when
+Determine whether profiling should be started by starpu_init(), or only when
 starpu_fxt_start_profiling() is called. \p autostart should be 1 to do so, or 0 to
 prevent it.
 

+ 172 - 167
doc/doxygen/chapters/api/initialization.doxy

@@ -17,220 +17,226 @@ number of processing units and takes the default scheduling policy.
 The environment variables overwrite the equivalent parameters.
 \var int starpu_conf::magic
 \private
-Will be initialized by starpu_conf_init(). Should not be set by hand.
+    Will be initialized by starpu_conf_init(). Should not be set by
+    hand.
 
 \var const char*starpu_conf::sched_policy_name
-This is the name of the scheduling policy. This can also be specified
-with the environment variable \ref STARPU_SCHED. (default = <c>NULL</c>).
+    Name of the scheduling policy. This can also be specified with the
+    environment variable \ref STARPU_SCHED. (default = <c>NULL</c>).
 
 \var struct starpu_sched_policy *starpu_conf::sched_policy
-This is the definition of the scheduling policy. This field is ignored
-if starpu_conf::sched_policy_name is set. (default = <c>NULL</c>)
+    Definition of the scheduling policy. This field is ignored if
+    starpu_conf::sched_policy_name is set. (default = <c>NULL</c>)
 
 \var void (*starpu_conf::sched_policy_init)(unsigned)
-todo
+    todo
 
 \var int starpu_conf::ncpus
-This is the number of CPU cores that StarPU can use. This can also be
-specified with the environment variable \ref STARPU_NCPU . (default = -1)
+    Number of CPU cores that StarPU can use. This can also be
+    specified with the environment variable \ref STARPU_NCPU .
+    (default = -1)
 \var int starpu_conf::ncuda
-This is the number of CUDA devices that StarPU can use. This can also
-be specified with the environment variable \ref STARPU_NCUDA. (default =
--1)
+    Number of CUDA devices that StarPU can use. This can also be
+    specified with the environment variable \ref STARPU_NCUDA.
+    (default = -1)
 \var int starpu_conf::nopencl
-This is the number of OpenCL devices that StarPU can use. This can
-also be specified with the environment variable \ref STARPU_NOPENCL.
-(default = -1)
+    Number of OpenCL devices that StarPU can use. This can also be
+    specified with the environment variable \ref STARPU_NOPENCL.
+    (default = -1)
 \var int starpu_conf::nmic
-This is the number of MIC devices that StarPU can use. This can also
-be specified with the environment variable \ref STARPU_NMIC.
-(default = -1)
+    Number of MIC devices that StarPU can use. This can also be
+    specified with the environment variable \ref STARPU_NMIC.
+    (default = -1)
 \var int starpu_conf::nscc
-This is the number of SCC devices that StarPU can use. This can also
-be specified with the environment variable \ref STARPU_NSCC.
-(default = -1)
+    Number of SCC devices that StarPU can use. This can also be
+    specified with the environment variable \ref STARPU_NSCC.
+    (default = -1)
 \var int starpu_conf::nmpi_ms
-This is the number of MPI Master Slave devices that StarPU can use. This can also
-be specified with the environment variable \ref STARPU_NMPI_MS.
-(default = -1)
+    Number of MPI Master Slave devices that StarPU can use. This can
+    also be specified with the environment variable \ref
+    STARPU_NMPI_MS. (default = -1)
 
 \var unsigned starpu_conf::use_explicit_workers_bindid
-If this flag is set, the starpu_conf::workers_bindid array indicates
-where the different workers are bound, otherwise StarPU automatically
-selects where to bind the different workers. This can also be
-specified with the environment variable \ref STARPU_WORKERS_CPUID. (default = 0)
+    If this flag is set, the starpu_conf::workers_bindid array
+    indicates where the different workers are bound, otherwise StarPU
+    automatically selects where to bind the different workers. This
+    can also be specified with the environment variable \ref
+    STARPU_WORKERS_CPUID. (default = 0)
 \var unsigned starpu_conf::workers_bindid[STARPU_NMAXWORKERS]
-If the starpu_conf::use_explicit_workers_bindid flag is set, this
-array indicates where to bind the different workers. The i-th entry of
-the starpu_conf::workers_bindid indicates the logical identifier of
-the processor which should execute the i-th worker. Note that the
-logical ordering of the CPUs is either determined by the OS, or
-provided by the hwloc library in case it is available.
+    If the starpu_conf::use_explicit_workers_bindid flag is set, this
+    array indicates where to bind the different workers. The i-th
+    entry of the starpu_conf::workers_bindid indicates the logical
+    identifier of the processor which should execute the i-th worker.
+    Note that the logical ordering of the CPUs is either determined by
+    the OS, or provided by the hwloc library in case it is available.
 \var unsigned starpu_conf::use_explicit_workers_cuda_gpuid
-If this flag is set, the CUDA workers will be attached to the CUDA
-devices specified in the starpu_conf::workers_cuda_gpuid array.
-Otherwise, StarPU affects the CUDA devices in a round-robin fashion.
-This can also be specified with the environment variable
-\ref STARPU_WORKERS_CUDAID. (default = 0)
+    If this flag is set, the CUDA workers will be attached to the CUDA
+    devices specified in the starpu_conf::workers_cuda_gpuid array.
+    Otherwise, StarPU affects the CUDA devices in a round-robin
+    fashion. This can also be specified with the environment variable
+    \ref STARPU_WORKERS_CUDAID. (default = 0)
 \var unsigned starpu_conf::workers_cuda_gpuid[STARPU_NMAXWORKERS]
-If the starpu_conf::use_explicit_workers_cuda_gpuid flag is set, this
-array contains the logical identifiers of the CUDA devices (as used by
-\c cudaGetDevice()).
+    If the starpu_conf::use_explicit_workers_cuda_gpuid flag is set,
+    this array contains the logical identifiers of the CUDA devices
+    (as used by \c cudaGetDevice()).
 \var unsigned starpu_conf::use_explicit_workers_opencl_gpuid
-If this flag is set, the OpenCL workers will be attached to the OpenCL
-devices specified in the starpu_conf::workers_opencl_gpuid array.
-Otherwise, StarPU affects the OpenCL devices in a round-robin fashion.
-This can also be specified with the environment variable
-\ref STARPU_WORKERS_OPENCLID. (default = 0)
+    If this flag is set, the OpenCL workers will be attached to the
+    OpenCL devices specified in the starpu_conf::workers_opencl_gpuid
+    array. Otherwise, StarPU affects the OpenCL devices in a
+    round-robin fashion. This can also be specified with the
+    environment variable \ref STARPU_WORKERS_OPENCLID. (default = 0)
 \var unsigned starpu_conf::workers_opencl_gpuid[STARPU_NMAXWORKERS]
-If the starpu_conf::use_explicit_workers_opencl_gpuid flag is set,
-this array contains the logical identifiers of the OpenCL devices to
-be used.
+    If the starpu_conf::use_explicit_workers_opencl_gpuid flag is set,
+    this array contains the logical identifiers of the OpenCL devices
+    to be used.
 \var unsigned starpu_conf::use_explicit_workers_mic_deviceid
-If this flag is set, the MIC workers will be attached to the MIC
-devices specified in the array starpu_conf::workers_mic_deviceid.
-Otherwise, StarPU affects the MIC devices in a round-robin fashion.
-This can also be specified with the environment variable
-\ref STARPU_WORKERS_MICID.
-(default = 0)
+    If this flag is set, the MIC workers will be attached to the MIC
+    devices specified in the array starpu_conf::workers_mic_deviceid.
+    Otherwise, StarPU affects the MIC devices in a round-robin
+    fashion. This can also be specified with the environment variable
+    \ref STARPU_WORKERS_MICID. (default = 0)
 \var unsigned starpu_conf::workers_mic_deviceid[STARPU_NMAXWORKERS]
-If the flag starpu_conf::use_explicit_workers_mic_deviceid is set, the
-array contains the logical identifiers of the MIC devices to be used.
+    If the flag starpu_conf::use_explicit_workers_mic_deviceid is set,
+    the array contains the logical identifiers of the MIC devices to
+    be used.
 \var unsigned starpu_conf::use_explicit_workers_scc_deviceid
-If this flag is set, the SCC workers will be attached to the SCC
-devices specified in the array starpu_conf::workers_scc_deviceid.
-(default = 0)
+    If this flag is set, the SCC workers will be attached to the SCC
+    devices specified in the array starpu_conf::workers_scc_deviceid.
+    (default = 0)
 \var unsigned starpu_conf::workers_scc_deviceid[STARPU_NMAXWORKERS]
-If the flag starpu_conf::use_explicit_workers_scc_deviceid is set, the
-array contains the logical identifiers of the SCC devices to be used.
-Otherwise, StarPU affects the SCC devices in a round-robin fashion.
-This can also be specified with the environment variable
-\ref STARPU_WORKERS_SCCID.
+    If the flag starpu_conf::use_explicit_workers_scc_deviceid is set,
+    the array contains the logical identifiers of the SCC devices to
+    be used. Otherwise, StarPU affects the SCC devices in a
+    round-robin fashion. This can also be specified with the
+    environment variable \ref STARPU_WORKERS_SCCID.
 \var unsigned starpu_conf::use_explicit_workers_mpi_ms_deviceid
-If this flag is set, the MPI Master Slave workers will be attached to the MPI Master Slave
-devices specified in the array starpu_conf::workers_mpi_ms_deviceid.
-Otherwise, StarPU affects the MPI Master Slave devices in a round-robin fashion.
-(default = 0)
+    If this flag is set, the MPI Master Slave workers will be attached
+    to the MPI Master Slave devices specified in the array
+    starpu_conf::workers_mpi_ms_deviceid. Otherwise, StarPU affects
+    the MPI Master Slave devices in a round-robin fashion. (default =
+    0)
 \var unsigned starpu_conf::workers_mpi_ms_deviceid[STARPU_NMAXWORKERS]
-If the flag starpu_conf::use_explicit_workers_mpi_ms_deviceid is set, the
-array contains the logical identifiers of the MPI Master Slave devices to be used.
+    If the flag starpu_conf::use_explicit_workers_mpi_ms_deviceid is
+    set, the array contains the logical identifiers of the MPI Master
+    Slave devices to be used.
 
 \var int starpu_conf::bus_calibrate
-If this flag is set, StarPU will recalibrate the bus.  If this value
-is equal to <c>-1</c>, the default value is used.  This can also be
-specified with the environment variable \ref STARPU_BUS_CALIBRATE. (default
-= 0)
+    If this flag is set, StarPU will recalibrate the bus.  If this
+    value is equal to -1, the default value is used. This can
+    also be specified with the environment variable \ref
+    STARPU_BUS_CALIBRATE. (default = 0)
 \var int starpu_conf::calibrate
-If this flag is set, StarPU will calibrate the performance models when
-executing tasks. If this value is equal to <c>-1</c>, the default
-value is used. If the value is equal to <c>1</c>, it will force
-continuing calibration. If the value is equal to <c>2</c>, the
-existing performance models will be overwritten. This can also be
-specified with the environment variable \ref STARPU_CALIBRATE. (default =
-0)
+    If this flag is set, StarPU will calibrate the performance models
+    when executing tasks. If this value is equal to -1, the
+    default value is used. If the value is equal to 1, it will
+    force continuing calibration. If the value is equal to 2,
+    the existing performance models will be overwritten. This can also
+    be specified with the environment variable \ref STARPU_CALIBRATE.
+    (default = 0)
 \var int starpu_conf::single_combined_worker
-By default, StarPU executes parallel tasks
-concurrently. Some parallel libraries (e.g. most OpenMP
-implementations) however do not support concurrent calls to
-parallel code. In such case, setting this flag makes StarPU
-only start one parallel task at a time (but other CPU and
-GPU tasks are not affected and can be run concurrently).
-The parallel task scheduler will however
-still try varying combined worker sizes to look for the
-most efficient ones. This can also be specified with the environment
-variable \ref STARPU_SINGLE_COMBINED_WORKER.
-(default = 0)
+    By default, StarPU executes parallel tasks concurrently. Some
+    parallel libraries (e.g. most OpenMP implementations) however do
+    not support concurrent calls to parallel code. In such case,
+    setting this flag makes StarPU only start one parallel task at a
+    time (but other CPU and GPU tasks are not affected and can be run
+    concurrently). The parallel task scheduler will however still try
+    varying combined worker sizes to look for the most efficient ones.
+    This can also be specified with the environment variable \ref
+    STARPU_SINGLE_COMBINED_WORKER. (default = 0)
 
 \var char *starpu_conf::mic_sink_program_path
-Path to the kernel to execute on the MIC device, compiled for MIC
-architecture. When set to <c>NULL</c>, StarPU automatically looks next to the
-host program location.
-(default = <c>NULL</c>)
+    Path to the kernel to execute on the MIC device, compiled for MIC
+    architecture. When set to <c>NULL</c>, StarPU automatically looks
+    next to the host program location. (default = <c>NULL</c>)
 
 \var int starpu_conf::disable_asynchronous_copy
-This flag should be set to 1 to disable
-asynchronous copies between CPUs and all accelerators. This
-can also be specified with the environment variable
-\ref STARPU_DISABLE_ASYNCHRONOUS_COPY. The
-AMD implementation of OpenCL is known to fail when copying
-data asynchronously. When using this implementation, it is
-therefore necessary to disable asynchronous data transfers.
-This can also be specified at compilation time by giving to
-the configure script the option
-\ref disable-asynchronous-copy "--disable-asynchronous-copy". (default = 0)
+    This flag should be set to 1 to disable asynchronous copies
+    between CPUs and all accelerators. This can also be specified with
+    the environment variable \ref STARPU_DISABLE_ASYNCHRONOUS_COPY.
+    The AMD implementation of OpenCL is known to fail when copying
+    data asynchronously. When using this implementation, it is
+    therefore necessary to disable asynchronous data transfers. This
+    can also be specified at compilation time by giving to the
+    configure script the option
+    \ref disable-asynchronous-copy "--disable-asynchronous-copy".
+    (default = 0)
 \var int starpu_conf::disable_asynchronous_cuda_copy
-This flag should be set to 1 to disable
-asynchronous copies between CPUs and CUDA accelerators.
-This can also be specified with the environment variable
-\ref STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY.
-This can also be specified at compilation time by giving to
-the configure script the option
-\ref disable-asynchronous-cuda-copy "--disable-asynchronous-cuda-copy". (default = 0)
+    This flag should be set to 1 to disable asynchronous copies
+    between CPUs and CUDA accelerators. This can also be specified
+    with the environment variable \ref
+    STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY.
+    This can also be specified at compilation time by giving to the
+    configure script the option
+    \ref disable-asynchronous-cuda-copy "--disable-asynchronous-cuda-copy".
+    (default = 0)
 \var int starpu_conf::disable_asynchronous_opencl_copy
-This flag should be set to 1 to disable
-asynchronous copies between CPUs and OpenCL accelerators.
-This can also be specified with the environment
-variable \ref STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY. The AMD
-implementation of OpenCL is known to fail
-when copying data asynchronously. When using this
-implementation, it is therefore necessary to disable
-asynchronous data transfers. This can also be specified at
-compilation time by giving to the configure script the
-option \ref disable-asynchronous-opencl-copy "--disable-asynchronous-opencl-copy".
-(default = 0)
+    This flag should be set to 1 to disable asynchronous copies
+    between CPUs and OpenCL accelerators. This can also be specified
+    with the environment variable \ref
+    STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY. The AMD implementation of
+    OpenCL is known to fail when copying data asynchronously. When
+    using this implementation, it is therefore necessary to disable
+    asynchronous data transfers. This can also be specified at
+    compilation time by giving to the configure script the
+    option
+    \ref disable-asynchronous-opencl-copy "--disable-asynchronous-opencl-copy".
+    (default = 0)
 \var int starpu_conf::disable_asynchronous_mic_copy
-This flag should be set to 1 to disable asynchronous copies between
-CPUs and MIC accelerators. This can also be specified with the
-environment variable \ref STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY.
-This can also be specified at compilation time by giving to the
-configure script the option \ref disable-asynchronous-mic-copy "--disable-asynchronous-mic-copy".
-(default = 0).
+    This flag should be set to 1 to disable asynchronous copies
+    between CPUs and MIC accelerators. This can also be specified with
+    the environment variable \ref
+    STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY. This can also be specified
+    at compilation time by giving to the configure script the option
+    \ref disable-asynchronous-mic-copy "--disable-asynchronous-mic-copy".
+    (default = 0).
 \var int starpu_conf::disable_asynchronous_mpi_ms_copy
-This flag should be set to 1 to disable asynchronous copies between
-CPUs and MPI Master Slave devices. This can also be specified with the
-environment variable \ref STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY.
-This can also be specified at compilation time by giving to the
-configure script the option \ref disable-asynchronous-mpi-master-slave-copy "--disable-asynchronous-mpi-master-slave-copy".
-(default = 0).
+    This flag should be set to 1 to disable asynchronous copies
+    between CPUs and MPI Master Slave devices. This can also be
+    specified with the environment variable \ref
+    STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY. This can also be
+    specified at compilation time by giving to the configure script
+    the option
+    \ref disable-asynchronous-mpi-master-slave-copy "--disable-asynchronous-mpi-master-slave-copy".
+    (default = 0).
 
 \var unsigned *starpu_conf::cuda_opengl_interoperability
-Enable CUDA/OpenGL interoperation on these CUDA
-devices. This can be set to an array of CUDA device
-identifiers for which cudaGLSetGLDevice() should be called
-instead of \c cudaSetDevice(). Its size is specified by the
-starpu_conf::n_cuda_opengl_interoperability field below
-(default = <c>NULL</c>)
+    Enable CUDA/OpenGL interoperation on these CUDA devices. This can
+    be set to an array of CUDA device identifiers for which
+    \c cudaGLSetGLDevice() should be called instead of
+    \c cudaSetDevice(). Its size is specified by the
+    starpu_conf::n_cuda_opengl_interoperability field below
+    (default = <c>NULL</c>)
 \var unsigned starpu_conf::n_cuda_opengl_interoperability
-todo
+    todo
 
 \var struct starpu_driver *starpu_conf::not_launched_drivers
-Array of drivers that should not be launched by
-StarPU. The application will run in one of its own
-threads. (default = <c>NULL</c>)
+    Array of drivers that should not be launched by StarPU. The
+    application will run in one of its own threads. (default =
+    <c>NULL</c>)
 \var unsigned starpu_conf::n_not_launched_drivers
-The number of StarPU drivers that should not be
-launched by StarPU. (default = 0)
+    The number of StarPU drivers that should not be launched by
+    StarPU. (default = 0)
+
 \var starpu_conf::trace_buffer_size
-Specifies the buffer size used for FxT tracing.
-Starting from FxT version 0.2.12, the buffer will
-automatically be flushed when it fills in, but it may still
-be interesting to specify a bigger value to avoid any
-flushing (which would disturb the trace).
+    Specify the buffer size used for FxT tracing. Starting from FxT
+    version 0.2.12, the buffer will automatically be flushed when it
+    fills in, but it may still be interesting to specify a bigger
+    value to avoid any flushing (which would disturb the trace).
 
 \var starpu_conf::global_sched_ctx_min_priority
-todo
+    todo
 \var starpu_conf::global_sched_ctx_max_priority
-todo
+    todo
 
 \fn int starpu_init(struct starpu_conf *conf)
 \ingroup API_Initialization_and_Termination
 This is StarPU initialization method, which must be called prior to
 any other StarPU call. It is possible to specify StarPU’s
 configuration (e.g. scheduling policy, number of cores, ...) by
-passing a non-<c>NULL</c> argument. Default configuration is used if the
-passed argument is <c>NULL</c>. Upon successful completion, this function
-returns 0. Otherwise, <c>-ENODEV</c> indicates that no worker was available
-(so that StarPU was not initialized).
+passing a non-<c>NULL</c> \p conf. Default configuration is used if \p
+conf is <c>NULL</c>. Upon successful completion, this function
+returns 0. Otherwise, <c>-ENODEV</c> indicates that no worker was
+available (and thus StarPU was not initialized).
 
 \fn int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 \ingroup API_Initialization_and_Termination
@@ -242,12 +248,12 @@ same program.
 
 \fn int starpu_conf_init(struct starpu_conf *conf)
 \ingroup API_Initialization_and_Termination
-This function initializes the conf structure passed as argument with
-the default values. In case some configuration parameters are already
+Initialize the \p conf structure with the default values. In case some
+configuration parameters are already
 specified through environment variables, starpu_conf_init() initializes
-the fields of the structure according to the environment variables.
+the fields of \p conf according to the environment variables.
 For instance if \ref STARPU_CALIBRATE is set, its value is put in the
-field starpu_conf::calibrate of the structure passed as argument. Upon successful
+field starpu_conf::calibrate of \p conf. Upon successful
 completion, this function returns 0. Otherwise, <c>-EINVAL</c> indicates that
 the argument was <c>NULL</c>.
 
@@ -259,7 +265,7 @@ are not guaranteed to be available until this method has been called.
 
 \fn void starpu_pause(void)
 \ingroup API_Initialization_and_Termination
-This call is used to suspend the processing of new tasks by
+Suspend the processing of new tasks by
 workers. It can be used in a program where StarPU is used during only
 a part of the execution. Without this call, the workers continue to
 poll for new tasks in a tight loop, wasting CPU time. The symmetric
@@ -297,7 +303,6 @@ devices are disabled.
 
 \fn void starpu_topology_print(FILE *f)
 \ingroup API_Initialization_and_Termination
-Prints a description of the topology on f.
+Print a description of the topology on \p f.
 
 */
-

+ 20 - 20
doc/doxygen/chapters/api/insert_task.doxy

@@ -10,12 +10,12 @@
 
 \fn int starpu_insert_task(struct starpu_codelet *cl, ...)
 \ingroup API_Insert_Task
-This function does the same as the function starpu_task_insert(). It has been kept to avoid breaking old codes.
+Similar to starpu_task_insert(). Kept to avoid breaking old code.
 
 \fn int starpu_task_insert(struct starpu_codelet *cl, ...)
 \ingroup API_Insert_Task
 Create and submit a task corresponding to \p cl with the
-following arguments. The argument list must be zero-terminated.
+given arguments. The argument list must be zero-terminated.
 
 The arguments following the codelet can be of the following types:
 <ul>
@@ -47,13 +47,13 @@ implementation to retrieve them.
 
 \def STARPU_VALUE
 \ingroup API_Insert_Task
-this macro is used when calling starpu_task_insert(), and must
+Used when calling starpu_task_insert(), must
 be followed by a pointer to a constant value and the size of the
 constant
 
 \def STARPU_CL_ARGS
 \ingroup API_Insert_Task
-this macro is used when calling starpu_task_insert(), and must
+Used when calling starpu_task_insert(), must
 be followed by a memory buffer containing the arguments to be given to
 the task, and by the size of the arguments. The memory buffer should
 be the result of a previous call to starpu_codelet_pack_args(), and will be
@@ -61,12 +61,12 @@ freed (i.e. starpu_task::cl_arg_free will be set to 1)
 
 \def STARPU_CALLBACK
 \ingroup API_Insert_Task
-this macro is used when calling starpu_task_insert(), and must
+Used when calling starpu_task_insert(), must
 be followed by a pointer to a callback function
 
 \def STARPU_CALLBACK_WITH_ARG
 \ingroup API_Insert_Task
-this macro is used when calling starpu_task_insert(), and must
+Used when calling starpu_task_insert(), must
 be followed by two pointers: one to a callback function, and the other
 to be given as an argument to the callback function; this is
 equivalent to using both ::STARPU_CALLBACK and
@@ -74,13 +74,13 @@ equivalent to using both ::STARPU_CALLBACK and
 
 \def STARPU_CALLBACK_ARG
 \ingroup API_Insert_Task
-this macro is used when calling starpu_task_insert(), and must
+Used when calling starpu_task_insert(), must
 be followed by a pointer to be given as an argument to the callback
 function
 
 \def STARPU_PRIORITY
 \ingroup API_Insert_Task
-this macro is used when calling starpu_task_insert(), and must
+Used when calling starpu_task_insert(), must
 be followed by a integer defining a priority level
 
 \def STARPU_DATA_ARRAY
@@ -93,42 +93,42 @@ TODO
 
 \def STARPU_EXECUTE_ON_WORKER
 \ingroup API_Insert_Task
-this macro is used when calling starpu_task_insert(), and must be
+Used when calling starpu_task_insert(), must be
 followed by an integer value specifying the worker on which to execute
 the task (as specified by starpu_task::execute_on_a_specific_worker)
 
 \def STARPU_WORKER_ORDER
 \ingroup API_Insert_Task
-this macro is used when calling starpu_task_insert(), and must be
+Used when calling starpu_task_insert(), must be
 followed by an integer value specifying the worker order in which to execute
 the tasks (as specified by starpu_task::workerorder)
 
 \def STARPU_TAG
 \ingroup API_Insert_Task
-this macro is used when calling starpu_task_insert(), and must be followed by a tag.
+Used when calling starpu_task_insert(), must be followed by a tag.
 
 \def STARPU_TAG_ONLY
 \ingroup API_Insert_Task
-this macro is used when calling starpu_task_insert(), and must be followed by a tag.
-It sets starpu_task::tag_id, but leaves starpu_task::use_tag as 0.
+Used when calling starpu_task_insert(), must be followed by a tag
+stored in starpu_task::tag_id. Leave starpu_task::use_tag as 0.
 
 \def STARPU_NAME
 \ingroup API_Insert_Task
-this macro is used when calling starpu_task_insert(), and must be followed by a char *.
-It sets starpu_task::name to it.
+Used when calling starpu_task_insert(), must be followed by a char *
+stored in starpu_task::name.
 
 \def STARPU_FLOPS
 \ingroup API_Insert_Task
-this macro is used when calling starpu_task_insert(), and must
+Used when calling starpu_task_insert(), must
 be followed by an amount of floating point operations, as a double.
 Users <b>MUST</b> explicitly cast into double, otherwise parameter
 passing will not work.
 
 \def STARPU_SCHED_CTX
 \ingroup API_Insert_Task
-this macro is used when calling starpu_task_insert(), and must
-be followed by the id of the scheduling context to which we want to
-submit the task.
+Used when calling starpu_task_insert(), must
+be followed by the id of the scheduling context to which to submit the
+task.
 
 \fn void starpu_codelet_pack_args(void **arg_buffer, size_t *arg_buffer_size, ...)
 \ingroup API_Insert_Task
@@ -146,7 +146,7 @@ parameters.
 \fn void starpu_codelet_unpack_args_and_copyleft(void *cl_arg, void *buffer, size_t buffer_size, ...)
 \ingroup API_Insert_Task
 Similar to starpu_codelet_unpack_args(), but if any parameter is
-0, copy the part of cl_arg that has not been read in buffer which
+0, copy the part of \p cl_arg that has not been read in \p buffer which
 can then be used in a later call to one of the unpack functions.
 
 \fn struct starpu_task *starpu_task_build(struct starpu_codelet *cl, ...)

+ 3 - 3
doc/doxygen/chapters/api/lower_bound.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2017  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -22,7 +22,7 @@ Stop recording tasks
 
 \fn void starpu_bound_print_dot(FILE *output)
 \ingroup API_Theoretical_Lower_Bound_on_Execution_Time
-Print the DAG that was recorded
+Emit on \p output the DAG that was recorded.
 
 \fn void starpu_bound_compute(double *res, double *integer_res, int integer)
 \ingroup API_Theoretical_Lower_Bound_on_Execution_Time
@@ -42,7 +42,7 @@ tasks, in the mps format
 
 \fn void starpu_bound_print(FILE *output, int integer)
 \ingroup API_Theoretical_Lower_Bound_on_Execution_Time
-Emit statistics of actual execution vs theoretical upper bound.
+Emit on \p output the statistics of actual execution vs theoretical upper bound.
 \p integer permits to choose between integer solving (which takes a
 long time but is correct), and relaxed solving (which provides an
 approximate solution).

+ 6 - 7
doc/doxygen/chapters/api/mic_extensions.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2017  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -10,13 +10,12 @@
 
 \def STARPU_USE_MIC
 \ingroup API_MIC_Extensions
-This macro is defined when StarPU has been installed with MIC support.
+Defined when StarPU has been installed with MIC support.
 It should be used in your code to detect the availability of MIC.
 
 \def STARPU_MAXMICDEVS
 \ingroup API_MIC_Extensions
-This macro defines the maximum number of MIC devices that are
-supported by StarPU.
+Define the maximum number of MIC devices that are supported by StarPU.
 
 \typedef starpu_mic_func_symbol_t
 \ingroup API_MIC_Extensions
@@ -24,13 +23,13 @@ Type for MIC function symbols
 
 \fn int starpu_mic_register_kernel(starpu_mic_func_symbol_t *symbol, const char *func_name)
 \ingroup API_MIC_Extensions
-Initiate a lookup on each MIC device to find the adress of the
-function named \p func_name, store them in the global array kernels
+Initiate a lookup on each MIC device to find the address of the
+function named \p func_name, store it in the global array kernels
 and return the index in the array through \p symbol.
 
 \fn starpu_mic_kernel_t starpu_mic_get_kernel(starpu_mic_func_symbol_t symbol)
 \ingroup API_MIC_Extensions
-If success, return the pointer to the function defined by \p symbol on
+If successful, return the pointer to the function defined by \p symbol on
 the device linked to the called device. This can for instance be used
 in a starpu_mic_func_t implementation.
 

+ 5 - 4
doc/doxygen/chapters/api/misc_helpers.doxy

@@ -20,16 +20,17 @@ the handle has been copied, and it is given the pointer \p callback_arg as argum
 
 \fn void starpu_execute_on_each_worker(void (*func)(void *), void *arg, uint32_t where)
 \ingroup API_Miscellaneous_Helpers
-This function executes the given function on a subset of workers. When
+Execute the given function \p func on a subset of workers. When
 calling this method, the offloaded function \p func is executed by
-every StarPU worker that may execute the function. The argument \p arg
+every StarPU worker that is eligible to execute the function.
+The argument \p arg
 is passed to the offloaded function. The argument \p where specifies
 on which types of processing units the function should be executed.
 Similarly to the field starpu_codelet::where, it is possible to
 specify that the function should be executed on every CUDA device and
 every CPU by passing ::STARPU_CPU|::STARPU_CUDA. This function blocks
-until the function has been executed on every appropriate processing
-units, so that it may not be called from a callback function for
+until \p func has been executed on every appropriate processing
+unit, and thus may not be called from a callback function for
 instance.
 
 \fn void starpu_execute_on_each_worker_ex(void (*func)(void *), void *arg, uint32_t where, const char *name)

+ 3 - 5
doc/doxygen/chapters/api/modularized_scheduler.doxy

@@ -13,18 +13,16 @@
 \ingroup API_Modularized_Scheduler
 flags for starpu_sched_component::properties
 \var starpu_sched_component_properties::STARPU_SCHED_COMPONENT_HOMOGENEOUS
-\ingroup API_Modularized_Scheduler
      indicate that all workers have the same starpu_worker_archtype
 \var starpu_sched_component_properties::STARPU_SCHED_COMPONENT_SINGLE_MEMORY_NODE
-\ingroup API_Modularized_Scheduler
      indicate that all workers have the same memory component
 
 \def STARPU_SCHED_COMPONENT_IS_HOMOGENEOUS
 \ingroup API_Modularized_Scheduler
-     indicate if component is homogeneous
+indicate if component is homogeneous
 \def STARPU_SCHED_COMPONENT_IS_SINGLE_MEMORY_NODE
 \ingroup API_Modularized_Scheduler
-     indicate if all workers have the same memory component
+indicate if all workers have the same memory component
 
 \struct starpu_sched_component
 \ingroup API_Modularized_Scheduler
@@ -42,7 +40,7 @@ like <c>component->push_task(component,task)</c>
 \var starpu_sched_component::workers_in_ctx
      this member contain the subset of starpu_sched_component::workers that is currently available in the context
      The push method should take this member into account.
-     this member is set with :	
+     this member is set with:
      component->workers UNION tree->workers UNION
      component->child[i]->workers_in_ctx iff exist x such as component->children[i]->parents[x] == component
 \var void *starpu_sched_component::data

+ 31 - 32
doc/doxygen/chapters/api/mpi.doxy

@@ -13,13 +13,12 @@
 
 \def STARPU_USE_MPI
 \ingroup API_MPI_Support
-This macro is defined when StarPU has been installed with MPI
-support. It should be used in your code to detect the availability of
-MPI.
+Defined when StarPU has been installed with MPI support. It should be
+used in your code to detect the availability of MPI.
 
 \fn int starpu_mpi_init_comm(int *argc, char ***argv, int initialize_mpi, MPI_Comm comm)
 \ingroup API_MPI_Support
-Initializes the starpumpi library with the given communicator.
+Initialize the starpumpi library with the given communicator \p comm.
 \p initialize_mpi indicates if MPI should be initialized or not by StarPU.
 If the value is not 0, MPI will be initialized by calling
 <c>MPI_Init_Thread(argc, argv, MPI_THREAD_SERIALIZED, ...)</c>.
@@ -46,8 +45,8 @@ calling <c>MPI_Init_Thread(argc, argv, MPI_THREAD_SERIALIZED,
 
 \fn int starpu_mpi_shutdown(void)
 \ingroup API_MPI_Support
-Cleans the starpumpi library. This must be called between calling
-starpu_mpi functions and starpu_shutdown(). \c MPI_Finalize() will be
+Clean the starpumpi library. This must be called between calling
+\c starpu_mpi functions and starpu_shutdown(). \c MPI_Finalize() will be
 called if StarPU-MPI has been initialized by starpu_mpi_init().
 
 \fn void starpu_mpi_comm_amounts_retrieve(size_t *comm_amounts)
@@ -79,33 +78,33 @@ Return the size of the communicator \c MPI_COMM_WORLD
 
 \fn int starpu_mpi_send(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm)
 \ingroup API_MPI_Support
-Performs a standard-mode, blocking send of \p data_handle to the node
+Perform a standard-mode, blocking send of \p data_handle to the node
 \p dest using the message tag \p mpi_tag within the communicator \p
 comm.
 
 \fn int starpu_mpi_recv(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, MPI_Status *status)
 \ingroup API_MPI_Support
-Performs a standard-mode, blocking receive in \p data_handle from the
+Perform a standard-mode, blocking receive in \p data_handle from the
 node \p source using the message tag \p mpi_tag within the
 communicator \p comm.
 
 \fn int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *req, int dest, int mpi_tag, MPI_Comm comm)
 \ingroup API_MPI_Support
-Posts a standard-mode, non blocking send of \p data_handle to the node
+Post a standard-mode, non blocking send of \p data_handle to the node
 \p dest using the message tag \p mpi_tag within the communicator \p
 comm. After the call, the pointer to the request \p req can be used to
 test or to wait for the completion of the communication.
 
 \fn int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *req, int source, int mpi_tag, MPI_Comm comm)
 \ingroup API_MPI_Support
-Posts a nonblocking receive in \p data_handle from the node \p source
+Post a nonblocking receive in \p data_handle from the node \p source
 using the message tag \p mpi_tag within the communicator \p comm.
 After the call, the pointer to the request \p req can be used to test
 or to wait for the completion of the communication.
 
 \fn int starpu_mpi_isend_detached(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
 \ingroup API_MPI_Support
-Posts a standard-mode, non blocking send of \p data_handle to the node
+Post a standard-mode, non blocking send of \p data_handle to the node
 \p dest using the message tag \p mpi_tag within the communicator \p
 comm. On completion, the \p callback function is called with the
 argument \p arg.
@@ -116,7 +115,7 @@ of the request.
 
 \fn int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
 \ingroup API_MPI_Support
-Posts a nonblocking receive in \p data_handle from the node \p source
+Post a nonblocking receive in \p data_handle from the node \p source
 using the message tag \p mpi_tag within the communicator \p comm. On
 completion, the \p callback function is called with the argument \p
 arg.
@@ -127,7 +126,7 @@ of the request.
 
 \fn int starpu_mpi_irecv_detached_sequential_consistency(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg, int sequential_consistency)
 \ingroup API_MPI_Support
-Posts a nonblocking receive in \p data_handle from the node \p source
+Post a nonblocking receive in \p data_handle from the node \p source
 using the message tag \p mpi_tag within the communicator \p comm. On
 completion, the \p callback function is called with the argument \p
 arg.
@@ -143,13 +142,13 @@ of the request.
 
 \fn int starpu_mpi_issend(starpu_data_handle_t data_handle, starpu_mpi_req *req, int dest, int mpi_tag, MPI_Comm comm)
 \ingroup API_MPI_Support
-Performs a synchronous-mode, non-blocking send of \p data_handle to the node
+Perform a synchronous-mode, non-blocking send of \p data_handle to the node
 \p dest using the message tag \p mpi_tag within the communicator \p
 comm.
 
 \fn int starpu_mpi_issend_detached(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
 \ingroup API_MPI_Support
-Performs a synchronous-mode, non-blocking send of \p data_handle to the node
+Perform a synchronous-mode, non-blocking send of \p data_handle to the node
 \p dest using the message tag \p mpi_tag within the communicator \p
 comm. On completion, the \p callback function is called with the argument \p
 arg.
@@ -160,7 +159,7 @@ of the request.
 
 \fn int starpu_mpi_wait(starpu_mpi_req *req, MPI_Status *status)
 \ingroup API_MPI_Support
-Returns when the operation identified by request \p req is complete.
+Return when the operation identified by request \p req is complete.
 
 \fn int starpu_mpi_test(starpu_mpi_req *req, int *flag, MPI_Status *status)
 \ingroup API_MPI_Support
@@ -170,7 +169,7 @@ operation.
 
 \fn int starpu_mpi_barrier(MPI_Comm comm)
 \ingroup API_MPI_Support
-Blocks the caller until all group members of the communicator \p comm
+Block the caller until all group members of the communicator \p comm
 have called it.
 
 \fn int starpu_mpi_wait_for_all(MPI_Comm comm)
@@ -179,19 +178,19 @@ Wait until all StarPU tasks and communications for the given communicator are co
 
 \fn int starpu_mpi_isend_detached_unlock_tag(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm, starpu_tag_t tag)
 \ingroup API_MPI_Support
-Posts a standard-mode, non blocking send of \p data_handle to the node
+Post a standard-mode, non blocking send of \p data_handle to the node
 \p dest using the message tag \p mpi_tag within the communicator \p
 comm. On completion, \p tag is unlocked.
 
 \fn int starpu_mpi_irecv_detached_unlock_tag(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, starpu_tag_t tag)
 \ingroup API_MPI_Support
-Posts a nonblocking receive in \p data_handle from the node \p source
+Post a nonblocking receive in \p data_handle from the node \p source
 using the message tag \p mpi_tag within the communicator \p comm. On
 completion, \p tag is unlocked.
 
 \fn int starpu_mpi_isend_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *dest, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag)
 \ingroup API_MPI_Support
-Posts \p array_size standard-mode, non blocking send. Each post sends
+Post \p array_size standard-mode, non blocking sends. Each post sends
 the n-th data of the array \p data_handle to the n-th node of the
 array \p dest using the n-th message tag of the array \p mpi_tag
 within the n-th communicator of the array \p comm. On completion of
@@ -199,7 +198,7 @@ the all the requests, \p tag is unlocked.
 
 \fn int starpu_mpi_irecv_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *source, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag)
 \ingroup API_MPI_Support
-Posts \p array_size nonblocking receive. Each post receives in the n-th
+Post \p array_size nonblocking receives. Each post receives in the n-th
 data of the array \p data_handle from the n-th node of the array \p
 source using the n-th message tag of the array \p mpi_tag within the
 n-th communicator of the array \p comm. On completion of all the
@@ -327,25 +326,25 @@ flushes the cache for this data to avoid incoherencies.
 
 \def STARPU_EXECUTE_ON_NODE
 \ingroup API_MPI_Support
-this macro is used when calling starpu_mpi_task_insert(), and must be
+Used when calling starpu_mpi_task_insert(), must be
 followed by an integer value which specifies the node on which to
 execute the codelet.
 
 \def STARPU_EXECUTE_ON_DATA
 \ingroup API_MPI_Support
-this macro is used when calling starpu_mpi_task_insert(), and must be
+Used when calling starpu_mpi_task_insert(), must be
 followed by a data handle to specify that the node owning the given
 data will execute the codelet.
 
 \def STARPU_NODE_SELECTION_POLICY
 \ingroup API_MPI_Support
-this macro is used when calling starpu_mpi_task_insert(), and must be
+Used when calling starpu_mpi_task_insert(), must be
 followed by an identifier of a node selection policy. This is needed when several
 nodes own data in ::STARPU_W mode.
 
 \fn int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 \ingroup API_MPI_Support
-This function does the same as the function starpu_mpi_task_insert(). It has been kept to avoid breaking old codes.
+Call starpu_mpi_task_insert(). Symbol kept for backward compatibility.
 
 \fn int starpu_mpi_task_insert(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 \ingroup API_MPI_Support
@@ -390,7 +389,7 @@ has been modified. The cache can be disabled (see \ref STARPU_MPI_CACHE).
 
 \fn struct starpu_task *starpu_mpi_task_build(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 \ingroup API_MPI_Support
-Create a task corresponding to codelet with the following arguments.
+Create a task corresponding to \p codelet with the given arguments.
 The argument list must be zero-terminated. The function performs the
 first two steps of the function starpu_mpi_task_insert(). Only the MPI
 node selected in the first step of the algorithm will return a valid
@@ -400,9 +399,9 @@ the task on the node which creates it, with the SAME list of arguments.
 
 \fn int starpu_mpi_task_post_build(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 \ingroup API_MPI_Support
-This function MUST be called after a call to starpu_mpi_task_build(),
-with the SAME list of arguments. It performs the fourth -- last -- step of the algorithm described in
-starpu_mpi_task_insert().
+MUST be called after a call to starpu_mpi_task_build(),
+with the SAME list of arguments. Perform the fourth -- last -- step of
+the algorithm described in starpu_mpi_task_insert().
 
 \fn void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle, int node)
 \ingroup API_MPI_Support
@@ -442,7 +441,7 @@ data to be transfered.
 
 \fn int starpu_mpi_node_selection_register_policy(starpu_mpi_select_node_policy_func_t policy_func)
 \ingroup API_MPI_Support
+Register a new policy which can then be used when there are several nodes owning data in ::STARPU_W mode.
+Register a new policy which can then be used when there is several nodes owning data in ::STARPU_W mode.
 Here an example of function defining a node selection policy.
 The codelet will be executed on the node owning the first data with a size bigger than 1M, or on the node
 0 if no data fits the given size.
@@ -479,7 +478,7 @@ Unregister a previously registered policy.
 
 \fn void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
 \ingroup API_MPI_Support
-Perform a reduction on the given data. All nodes send the data to its
+Perform a reduction on the given data \p data_handle. All nodes send the data to its
 owner node which will perform a reduction.
 
 \fn int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg)
@@ -509,7 +508,7 @@ function is called with the argument \p sarg on any other process.
 
 \def STARPU_USE_MPI_MASTER_SLAVE
 \ingroup API_MPI_Support
-This macro is defined when StarPU has been installed with MPI Master Slave
+Defined when StarPU has been installed with MPI Master Slave
 support. It should be used in your code to detect the availability of
 MPI Master Slave.
 

+ 13 - 6
doc/doxygen/chapters/api/multiformat_data_interface.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2017  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -36,12 +36,19 @@ The different fields are:
 todo
 \ingroup API_Multiformat_Data_Interface
 \var enum starpu_data_interface_id starpu_multiformat_interface::id
+    todo
 \var void *starpu_multiformat_interface::cpu_ptr
+    todo
 \var void *starpu_multiformat_interface::cuda_ptr
+    todo
 \var void *starpu_multiformat_interface::opencl_ptr
+    todo
 \var void *starpu_multiformat_interface::mic_ptr
+    todo
 \var uint32_t starpu_multiformat_interface::nx
+    todo
 \var struct starpu_multiformat_data_interface_ops *starpu_multiformat_interface::ops
+    todo
 
 \fn void starpu_multiformat_data_register(starpu_data_handle_t *handle, int home_node, void *ptr, uint32_t nobjects, struct starpu_multiformat_data_interface_ops *format_ops)
 \ingroup API_Multiformat_Data_Interface
@@ -54,22 +61,22 @@ describes the format.
 
 \def STARPU_MULTIFORMAT_GET_CPU_PTR(interface)
 \ingroup API_Multiformat_Data_Interface
-returns the local pointer to the data with CPU format.
+Return the local pointer to the data with CPU format.
 
 \def STARPU_MULTIFORMAT_GET_CUDA_PTR(interface)
 \ingroup API_Multiformat_Data_Interface
-returns the local pointer to the data with CUDA format.
+Return the local pointer to the data with CUDA format.
 
 \def STARPU_MULTIFORMAT_GET_OPENCL_PTR(interface)
 \ingroup API_Multiformat_Data_Interface
-returns the local pointer to the data with OpenCL format.
+Return the local pointer to the data with OpenCL format.
 
 \def STARPU_MULTIFORMAT_GET_MIC_PTR(interface)
 \ingroup API_Multiformat_Data_Interface
-returns the local pointer to the data with MIC format.
+Return the local pointer to the data with MIC format.
 
 \def STARPU_MULTIFORMAT_GET_NX(interface)
 \ingroup API_Multiformat_Data_Interface
-returns the number of elements in the data.
+Return the number of elements in the data.
 
 */

+ 29 - 30
doc/doxygen/chapters/api/opencl_extensions.doxy

@@ -10,42 +10,42 @@
 
 \def STARPU_USE_OPENCL
 \ingroup API_OpenCL_Extensions
-This macro is defined when StarPU has been installed with
+Defined when StarPU has been installed with
 OpenCL support. It should be used in your code to detect the
 availability of OpenCL as shown in \ref FullSourceCodeVectorScal.
 
 \def STARPU_MAXOPENCLDEVS
 \ingroup API_OpenCL_Extensions
-This macro defines the maximum number of OpenCL devices that are
+Define the maximum number of OpenCL devices that are
 supported by StarPU.
 
 \def STARPU_OPENCL_DATADIR
 \ingroup API_OpenCL_Extensions
-This macro defines the directory in which the OpenCL codelets of the
+Define the directory in which the OpenCL codelets of the
 applications provided with StarPU have been installed.
 
 \struct starpu_opencl_program
 \ingroup API_OpenCL_Extensions
-Stores the OpenCL programs as compiled for the different OpenCL
+Store the OpenCL programs as compiled for the different OpenCL
 devices.
 \var cl_program starpu_opencl_program::programs[STARPU_MAXOPENCLDEVS]
-Stores each program for each OpenCL device.
+    Store each program for each OpenCL device.
 
 @name Writing OpenCL kernels
 \ingroup API_OpenCL_Extensions
 
 \fn void starpu_opencl_get_context(int devid, cl_context *context)
 \ingroup API_OpenCL_Extensions
-Places the OpenCL context of the device designated by \p devid
-into \p context.
+Return the OpenCL context of the device designated by \p devid
+in \p context.
 
 \fn void starpu_opencl_get_device(int devid, cl_device_id *device)
 \ingroup API_OpenCL_Extensions
-Places the cl_device_id corresponding to \p devid in \p device.
+Return the cl_device_id corresponding to \p devid in \p device.
 
 \fn void starpu_opencl_get_queue(int devid, cl_command_queue *queue)
 \ingroup API_OpenCL_Extensions
-Places the command queue of the device designated by \p devid
+Return the command queue of the device designated by \p devid
 in \p queue.
 
 \fn void starpu_opencl_get_current_context(cl_context *context)
@@ -59,12 +59,12 @@ worker.
 
 \fn int starpu_opencl_set_kernel_args(cl_int *err, cl_kernel *kernel, ...)
 \ingroup API_OpenCL_Extensions
-Sets the arguments of a given kernel. The list of arguments
+Set the arguments of a given kernel. The list of arguments
 must be given as <c>(size_t size_of_the_argument, cl_mem *
-pointer_to_the_argument)</c>. The last argument must be 0. Returns the
+pointer_to_the_argument)</c>. The last argument must be 0. Return the
 number of arguments that were successfully set. In case of failure,
-returns the id of the argument that could not be set and err is set to
-the error returned by OpenCL. Otherwise, returns the number of
+return the id of the argument that could not be set and \p err is set to
+the error returned by OpenCL. Otherwise, return the number of
 arguments that were set.
 
 Here an example:
@@ -94,15 +94,15 @@ purpose for instance).
 
 \fn int starpu_opencl_load_opencl_from_file(const char *source_file_name, struct starpu_opencl_program *opencl_programs, const char *build_options)
 \ingroup API_OpenCL_Extensions
-This function compiles an OpenCL source code stored in a file.
+Compile an OpenCL source code stored in a file.
 
 \fn int starpu_opencl_load_opencl_from_string(const char *opencl_program_source, struct starpu_opencl_program *opencl_programs, const char *build_options)
 \ingroup API_OpenCL_Extensions
-This function compiles an OpenCL source code stored in a string.
+Compile an OpenCL source code stored in a string.
 
 \fn int starpu_opencl_unload_opencl(struct starpu_opencl_program *opencl_programs)
 \ingroup API_OpenCL_Extensions
-This function unloads an OpenCL compiled code.
+Unload an OpenCL compiled code.
 
 \fn void starpu_opencl_load_program_source(const char *source_file_name, char *located_file_name, char *located_dir_name, char *opencl_program_source)
 \ingroup API_OpenCL_Extensions
@@ -119,12 +119,12 @@ string.
 
 \fn void starpu_opencl_load_program_source_malloc(const char *source_file_name, char **located_file_name, char **located_dir_name, char **opencl_program_source)
 \ingroup API_OpenCL_Extensions
-Similar to function starpu_opencl_load_program_source() but it allocates the buffers located_file_name, located_dir_name and opencl_program_source.
+Similar to function starpu_opencl_load_program_source() but allocate the buffers \p located_file_name, \p located_dir_name and \p opencl_program_source.
 
 \fn int starpu_opencl_compile_opencl_from_file(const char *source_file_name, const char *build_options)
 \ingroup API_OpenCL_Extensions
 Compile the OpenCL kernel stored in the file \p source_file_name
-with the given options \p build_options and stores the result in the
+with the given options \p build_options and store the result in the
 directory <c>$STARPU_HOME/.starpu/opencl</c> with the same filename as
 \p source_file_name. The compilation is done for every OpenCL device,
 and the filename is suffixed with the vendor id and the device id of
@@ -133,7 +133,7 @@ the OpenCL device.
 \fn int starpu_opencl_compile_opencl_from_string(const char *opencl_program_source, const char *file_name, const char *build_options)
 \ingroup API_OpenCL_Extensions
 Compile the OpenCL kernel in the string \p opencl_program_source
-with the given options \p build_options and stores the result in the
+with the given options \p build_options and store the result in the
 directory <c>$STARPU_HOME/.starpu/opencl</c> with the filename \p
 file_name. The compilation is done for every OpenCL device, and the
 filename is suffixed with the vendor id and the device id of the
@@ -163,9 +163,9 @@ Release the given \p kernel, to be called after kernel execution.
 
 \fn int starpu_opencl_collect_stats(cl_event event)
 \ingroup API_OpenCL_Extensions
-This function allows to collect statistics on a kernel execution.
+Collect statistics on a kernel execution.
 After termination of the kernels, the OpenCL codelet should call this
-function to pass it the even returned by \c clEnqueueNDRangeKernel(), to
+function with the event returned by \c clEnqueueNDRangeKernel(), to
 let StarPU collect statistics about the kernel execution (used cycles,
 consumed energy).
 
@@ -179,13 +179,13 @@ error code.
 
 \fn void starpu_opencl_display_error(const char *func, const char *file, int line, const char *msg, cl_int status)
 \ingroup API_OpenCL_Extensions
-Given a valid error status, prints the corresponding error message on
-stdout, along with the given function name \p func, the given filename
-\p file, the given line number \p line and the given message \p msg.
+Given a valid error status, print the corresponding error message on
+\c stdout, along with the function name \p func, the filename
+\p file, the line number \p line and the message \p msg.
 
 \def STARPU_OPENCL_DISPLAY_ERROR(status)
 \ingroup API_OpenCL_Extensions
-Call the function starpu_opencl_display_error() with the given error
+Call the function starpu_opencl_display_error() with the error
 \p status, the current function name, current file and line number,
 and a empty message.
 
@@ -195,15 +195,14 @@ Call the function starpu_opencl_display_error() and abort.
 
 \def STARPU_OPENCL_REPORT_ERROR(status)
 \ingroup API_OpenCL_Extensions
-Call the function starpu_opencl_report_error() with the given error \p
-status, with the current function name, current file and line number,
+Call the function starpu_opencl_report_error() with the error \p
+status, the current function name, current file and line number,
 and an empty message.
 
 \def STARPU_OPENCL_REPORT_ERROR_WITH_MSG(msg, status)
 \ingroup API_OpenCL_Extensions
-Call the function starpu_opencl_report_error() with the given \p msg
-and the given error \p status, with the current function name, current
-file and line number.
+Call the function starpu_opencl_report_error() with \p msg
+and \p status, the current function name, current file and line number.
 
 \fn cl_int starpu_opencl_allocate_memory(int devid, cl_mem *addr, size_t size, cl_mem_flags flags)
 \ingroup API_OpenCL_Extensions

+ 131 - 118
doc/doxygen/chapters/api/performance_model.doxy

@@ -12,46 +12,44 @@
 \ingroup API_Performance_Model
 TODO
 \var starpu_perfmodel_type::STARPU_PERFMODEL_INVALID
-todo
+    todo
 \var starpu_perfmodel_type::STARPU_PER_ARCH
-\ingroup API_Performance_Model
-Application-provided per-arch cost model function
+    Application-provided per-arch cost model function
 \var starpu_perfmodel_type::STARPU_COMMON
-\ingroup API_Performance_Model
-Application-provided common cost model function, with per-arch factor
+    Application-provided common cost model function, with per-arch
+    factor
 \var starpu_perfmodel_type::STARPU_HISTORY_BASED
-\ingroup API_Performance_Model
-Automatic history-based cost model
+    Automatic history-based cost model
 \var starpu_perfmodel_type::STARPU_REGRESSION_BASED
-\ingroup API_Performance_Model
-Automatic linear regression-based cost model  (alpha * size ^ beta)
+    Automatic linear regression-based cost model  (alpha * size ^
+    beta)
 \var starpu_perfmodel_type::STARPU_NL_REGRESSION_BASED
-\ingroup API_Performance_Model
-Automatic non-linear regression-based cost model (a * size ^ b + c)
+    Automatic non-linear regression-based cost model (a * size ^ b +
+    c)
 \var starpu_perfmodel_type::STARPU_MULTIPLE_REGRESSION_BASED
-\ingroup API_Performance_Model
-Automatic multiple linear regression-based cost model. Application provides parameters, their combinations and exponents
+    Automatic multiple linear regression-based cost model. Application
+    provides parameters, their combinations and exponents.
 
 \struct starpu_perfmodel_device
 todo
 \ingroup API_Performance_Model
 \var enum starpu_worker_archtype starpu_perfmodel_device::type
-is the type of the device
+    type of the device
 \var int starpu_perfmodel_device::devid
-is the identifier of the precise device
+    identifier of the precise device
 \var int starpu_perfmodel_device::ncore
-is the number of execution in parallel, minus 1
+    number of executions in parallel, minus 1
 
 \struct starpu_perfmodel_arch
 todo
 \ingroup API_Performance_Model
 \var int starpu_perfmodel_arch::ndevices
-is the number of the devices for the given arch
+    number of the devices for the given arch
 \var struct starpu_perfmodel_device *starpu_perfmodel_arch::devices
-is the list of the devices for the given arch
+    list of the devices for the given arch
 
 \struct starpu_perfmodel
-Contains all information about a performance model. At least the
+Contain all information about a performance model. At least the
 type and symbol fields have to be filled when defining a performance
 model for a codelet. For compatibility, make sure to initialize the
 whole structure to zero, either by using explicit memset, or by
@@ -59,161 +57,176 @@ letting the compiler implicitly do it in e.g. static storage case. If
 not provided, other fields have to be zero.
 \ingroup API_Performance_Model
 \var enum starpu_perfmodel_type starpu_perfmodel::type
-is the type of performance model
-<ul>
-<li>::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED,
-::STARPU_NL_REGRESSION_BASED: No other fields needs to be provided,
-this is purely history-based.
-</li>
-<li> ::STARPU_MULTIPLE_REGRESSION_BASED: Need to provide fields starpu_perfmodel::nparameters (number of different parameters),  starpu_perfmodel::ncombinations (number of parameters combinations-tuples) and table starpu_perfmodel::combinations which defines exponents of the equation. Function cl_perf_func also needs to define how to extract parameters from the task.
-</li>
-<li> ::STARPU_PER_ARCH: either field starpu_perfmodel::arch_cost_function has to be
-filled with a function that returns the cost in micro-seconds on the arch given
-as parameter, or field starpu_perfmodel::per_arch has to be
-filled with functions which return the cost in micro-seconds.
-</li>
-<li> ::STARPU_COMMON: field starpu_perfmodel::cost_function has to be
-filled with a function that returns the cost in micro-seconds on a
-CPU, timing on other archs will be determined by multiplying by an
-arch-specific factor.
-</li>
-</ul>
+    type of performance model
+    <ul>
+    <li>
+    ::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED,
+    ::STARPU_NL_REGRESSION_BASED: No other fields needs to be
+    provided, this is purely history-based.
+    </li>
+    <li>
+    ::STARPU_MULTIPLE_REGRESSION_BASED: Need to provide fields
+    starpu_perfmodel::nparameters (number of different parameters),
+    starpu_perfmodel::ncombinations (number of parameters
+    combinations-tuples) and table starpu_perfmodel::combinations
+    which defines exponents of the equation. Function cl_perf_func
+    also needs to define how to extract parameters from the task. 
+    </li>
+    <li>
+    ::STARPU_PER_ARCH: either field
+    starpu_perfmodel::arch_cost_function has to be filled with a
+    function that returns the cost in micro-seconds on the arch given
+    as parameter, or field starpu_perfmodel::per_arch has to be filled
+    with functions which return the cost in micro-seconds.
+    </li>
+    <li>
+    ::STARPU_COMMON: field starpu_perfmodel::cost_function has to be
+    filled with a function that returns the cost in micro-seconds on a
+    CPU, timing on other archs will be determined by multiplying by an
+    arch-specific factor.
+    </li>
+    </ul>
 \var const char *starpu_perfmodel::symbol
-is the symbol name for the performance model, which will be used as
-file name to store the model. It must be set otherwise the model will
-be ignored.
+    symbol name for the performance model, which will be used as file
+    name to store the model. It must be set otherwise the model will
+    be ignored.
 \var double (*starpu_perfmodel::cost_function)(struct starpu_task *, unsigned nimpl)
-Used by ::STARPU_COMMON takes a task and implementation number, and
-must return a task duration estimation in micro-seconds.
+    Used by ::STARPU_COMMON. Take a task and implementation number,
+    and must return a task duration estimation in micro-seconds.
 \var double (*starpu_perfmodel::arch_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch* arch, unsigned nimpl)
-Used by ::STARPU_COMMON takes a task, an arch and implementation number, and
-must return a task duration estimation in micro-seconds on that arch.
+    Used by ::STARPU_COMMON. Take a task, an arch and implementation
+    number, and must return a task duration estimation in
+    micro-seconds on that arch.
 \var size_t (*starpu_perfmodel::size_base)(struct starpu_task *, unsigned nimpl)
-Used by ::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED and
-::STARPU_NL_REGRESSION_BASED. If not <c>NULL</c>, takes a task and
-implementation number, and returns the size to be used as index to distinguish
-histories and as a base for regressions.
+    Used by ::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED and
+    ::STARPU_NL_REGRESSION_BASED. If not <c>NULL</c>, take a task and
+    implementation number, and return the size to be used as index to
+    distinguish histories and as a base for regressions.
 \var uint32_t (*starpu_perfmodel::footprint)(struct starpu_task *)
-Used by ::STARPU_HISTORY_BASED. If not <c>NULL</c>, takes a task and returns the
-footprint to be used as index to distinguish histories. The default is to use
-the starpu_task_data_footprint() function.
+    Used by ::STARPU_HISTORY_BASED. If not <c>NULL</c>, take a task
+    and return the footprint to be used as index to distinguish
+    histories. The default is to use the starpu_task_data_footprint()
+    function.
 \var unsigned starpu_perfmodel::is_loaded
 \private
-Whether the performance model is already loaded from the disk.
+    Whether the performance model is already loaded from the disk.
 \var unsigned starpu_perfmodel::benchmarking
 \private
-todo
+    todo
 \var unsigned starpu_perfmodel::is_init
-todo
+    todo
 \var starpu_perfmodel_state_t starpu_perfmodel::state
 \private
-todo
+    todo
 \var void (*starpu_perfmodel::parameters)(struct starpu_task * task, double *parameters);
-todo
+    todo
 \var const char ** starpu_perfmodel::parameters_names
 \private
-Names of parameters used for multiple linear regression models (M, N, K)
+    Names of parameters used for multiple linear regression models (M,
+    N, K)
 \var unsigned starpu_perfmodel::nparameters
 \private
-Number of parameters used for multiple linear regression models
+    Number of parameters used for multiple linear regression models
 \var unsigned ** starpu_perfmodel::combinations
 \private
-Table of combinations of parameters (and the exponents) used for multiple linear regression models
+    Table of combinations of parameters (and the exponents) used for
+    multiple linear regression models
 \var unsigned starpu_perfmodel::ncombinations
 \private
-Number of combination of parameters used for multiple linear regression models
-
+    Number of combination of parameters used for multiple linear
+    regression models
 
 \struct starpu_perfmodel_regression_model
-...
+todo
 \ingroup API_Performance_Model
 \var double starpu_perfmodel_regression_model::sumlny
-sum of ln(measured)
+    sum of ln(measured)
 \var double starpu_perfmodel_regression_model::sumlnx
-sum of ln(size)
+    sum of ln(size)
 \var double starpu_perfmodel_regression_model::sumlnx2
-sum of ln(size)^2
+    sum of ln(size)^2
 \var unsigned long starpu_perfmodel_regression_model::minx
-minimum size
+    minimum size
 \var unsigned long starpu_perfmodel_regression_model::maxx
-maximum size
+    maximum size
 \var double starpu_perfmodel_regression_model::sumlnxlny
-sum of ln(size)*ln(measured)
+    sum of ln(size)*ln(measured)
 \var double starpu_perfmodel_regression_model::alpha
-estimated = alpha * size ^ beta
+    estimated = alpha * size ^ beta
 \var double starpu_perfmodel_regression_model::beta
-estimated = alpha * size ^ beta
+    estimated = alpha * size ^ beta
 \var unsigned starpu_perfmodel_regression_model::valid
-whether the linear regression model is valid (i.e. enough measures)
+    whether the linear regression model is valid (i.e. enough measures)
 \var double starpu_perfmodel_regression_model::a
-estimated = a size ^b + c
+    estimated = a size ^b + c
 \var double starpu_perfmodel_regression_model::b
-estimated = a size ^b + c
+    estimated = a size ^b + c
 \var double starpu_perfmodel_regression_model::c
-estimated = a size ^b + c
+    estimated = a size ^b + c
 \var unsigned starpu_perfmodel_regression_model::nl_valid
-whether the non-linear regression model is valid (i.e. enough measures)
+    whether the non-linear regression model is valid (i.e. enough measures)
 \var unsigned starpu_perfmodel_regression_model::nsample
-number of sample values for non-linear regression
+    number of sample values for non-linear regression
 \var double starpu_perfmodel_regression_model::coeff[]
-list of computed coefficients for multiple linear regression model
+    list of computed coefficients for multiple linear regression model
 \var double starpu_perfmodel_regression_model::ncoeff
-number of coefficients for multiple linear regression model
+    number of coefficients for multiple linear regression model
 \var double starpu_perfmodel_regression_model::multi_valid
-whether the multiple linear regression model is valid
+    whether the multiple linear regression model is valid
 
 \struct starpu_perfmodel_per_arch
 contains information about the performance model of a given
 arch.
 \ingroup API_Performance_Model
 \var starpu_perfmodel_per_arch_cost_function starpu_perfmodel_per_arch::cost_function
-Used by ::STARPU_PER_ARCH, must point to functions which take a task,
-the target arch and implementation number (as mere conveniency, since
-the array is already indexed by these), and must return a task
-duration estimation in micro-seconds.
+    Used by ::STARPU_PER_ARCH, must point to functions which take a
+    task, the target arch and implementation number (as mere
+    conveniency, since the array is already indexed by these), and
+    must return a task duration estimation in micro-seconds.
 \var starpu_perfmodel_per_arch_size_base starpu_perfmodel_per_arch::size_base
-Same as in structure starpu_perfmodel, but per-arch, in case it
-depends on the architecture-specific implementation.
+    Same as in structure starpu_perfmodel, but per-arch, in case it
+    depends on the architecture-specific implementation.
 \var struct starpu_perfmodel_history_table *starpu_perfmodel_per_arch::history
 \private
-The history of performance measurements.
+    The history of performance measurements.
 \var struct starpu_perfmodel_history_list *starpu_perfmodel_per_arch::list
 \private
-Used by ::STARPU_HISTORY_BASED, ::STARPU_NL_REGRESSION_BASED and ::STARPU_MULTIPLE_REGRESSION_BASED,
-records all execution history measures.
+    Used by ::STARPU_HISTORY_BASED, ::STARPU_NL_REGRESSION_BASED and
+    ::STARPU_MULTIPLE_REGRESSION_BASED, records all execution history
+    measures.
 \var struct starpu_perfmodel_regression_model starpu_perfmodel_per_arch::regression
 \private
-Used by ::STARPU_REGRESSION_BASED, 
-::STARPU_NL_REGRESSION_BASED and ::STARPU_MULTIPLE_REGRESSION_BASED, contains the estimated factors of the
-regression.
+    Used by ::STARPU_REGRESSION_BASED, ::STARPU_NL_REGRESSION_BASED
+    and ::STARPU_MULTIPLE_REGRESSION_BASED, contains the estimated
+    factors of the regression.
 
 \struct starpu_perfmodel_history_list
 todo
 \ingroup API_Performance_Model
 \var struct starpu_perfmodel_history_list *starpu_perfmodel_history_list::next
-todo
+    todo
 \var struct starpu_perfmodel_history_entry *starpu_perfmodel_history_list::entry
-todo
+    todo
 
 \struct starpu_perfmodel_history_entry
 todo
 \ingroup API_Performance_Model
 \var double starpu_perfmodel_history_entry::mean
-mean_n = 1/n sum
+    mean_n = 1/n sum
 \var double starpu_perfmodel_history_entry::deviation
-n dev_n = sum2 - 1/n (sum)^2
+    n dev_n = sum2 - 1/n (sum)^2
 \var double starpu_perfmodel_history_entry::sum
-sum of samples (in µs)
+    sum of samples (in µs)
 \var double starpu_perfmodel_history_entry::sum2
-sum of samples^2
+    sum of samples^2
 \var unsigned starpu_perfmodel_history_entry::nsample
-number of samples
+    number of samples
 \var uint32_t starpu_perfmodel_history_entry::footprint
-data footprint
+    data footprint
 \var size_t starpu_perfmodel_history_entry::size
-in bytes
+    in bytes
 \var double starpu_perfmodel_history_entry::flops
-Provided by the application
+    Provided by the application
 
 \fn void starpu_perfmodel_init(struct starpu_perfmodel *model)
 \ingroup API_Performance_Model
@@ -221,31 +234,31 @@ todo
 
 \fn void starpu_perfmodel_free_sampling_directories(void)
 \ingroup API_Performance_Model
-this function frees internal memory used for sampling directory
+Free internal memory used for sampling directory
 management. It should only be called by an application which is not
 calling starpu_shutdown() as this function already calls it. See for
 example <c>tools/starpu_perfmodel_display.c</c>.
 
 \fn int starpu_perfmodel_load_file(const char *filename, struct starpu_perfmodel *model)
 \ingroup API_Performance_Model
-loads the performance model found in the given file. The model structure has to be
+Load the performance model found in the file named \p filename. \p model has to be
 completely zero, and will be filled with the information stored in the given file.
 
 \fn int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *model)
 \ingroup API_Performance_Model
-loads a given performance model. The model structure has to be
-completely zero, and will be filled with the information saved in
+Load a given performance model. \p model has to be
+completely zero, and will be filled with the information stored in
 <c>$STARPU_HOME/.starpu</c>. The function is intended to be used by
-external tools that should read the performance model files.
+external tools that want to read the performance model files.
 
 \fn int starpu_perfmodel_unload_model(struct starpu_perfmodel *model)
 \ingroup API_Performance_Model
-unloads the given model which has been previously loaded
+Unload \p model which has been previously loaded
 through the function starpu_perfmodel_load_symbol()
 
 \fn void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model, struct starpu_perfmodel_arch *arch, char *path, size_t maxlen, unsigned nimpl)
 \ingroup API_Performance_Model
-returns the path to the debugging information for the performance model.
+Return the path to the debugging information for the performance model.
 
 \fn char* starpu_perfmodel_get_archtype_name(enum starpu_worker_archtype archtype)
 \ingroup API_Performance_Model
@@ -253,19 +266,19 @@ todo
 
 \fn void starpu_perfmodel_get_arch_name(struct starpu_perfmodel_arch *arch, char *archname, size_t maxlen, unsigned nimpl)
 \ingroup API_Performance_Model
-returns the architecture name for \p arch
+Return the architecture name for \p arch
 
 \fn struct starpu_perfmodel_arch *starpu_worker_get_perf_archtype(int workerid, unsigned sched_ctx_id)
 \ingroup API_Performance_Model
-returns the architecture type of a given worker.
+Return the architecture type of the worker \p workerid.
 
 \fn int starpu_perfmodel_list(FILE *output)
 \ingroup API_Performance_Model
-prints a list of all performance models on \p output
+Print a list of all performance models on \p output
 
 \fn void starpu_perfmodel_directory(FILE *output)
 \ingroup API_Performance_Model
-prints the directory name storing performance models on \p output
+Print the directory name storing performance models on \p output
 
 \fn void starpu_perfmodel_print(struct starpu_perfmodel *model, struct starpu_perfmodel_arch *arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output)
 \ingroup API_Performance_Model
@@ -281,23 +294,23 @@ todo
 
 \fn void starpu_bus_print_bandwidth(FILE *f)
 \ingroup API_Performance_Model
-prints a matrix of bus bandwidths on \p f.
+Print a matrix of bus bandwidths on \p f.
 
 \fn void starpu_bus_print_affinity(FILE *f)
 \ingroup API_Performance_Model
-prints the affinity devices on \p f.
+Print the affinity devices on \p f.
 
 \fn void starpu_bus_print_filenames(FILE *f)
 \ingroup API_Performance_Model
-prints on \p f the name of the files containing the matrix of bus bandwidths, the affinity devices and the latency.
+Print on \p f the name of the files containing the matrix of bus bandwidths, the affinity devices and the latency.
 
 \fn void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned cpuid, unsigned nimpl, double measured);
 \ingroup API_Performance_Model
-This feeds the performance model model with an explicit
+Feed the performance model model with an explicit
 measurement measured (in µs), in addition to measurements done by StarPU
 itself. This can be useful when the application already has an
 existing set of measurements done in good conditions, that StarPU
-could benefit from instead of doing on-line measurements. And example
+could benefit from instead of doing on-line measurements. An example
 of use can be seen in \ref PerformanceModelExample.
 
 \fn double starpu_transfer_bandwidth(unsigned src_node, unsigned dst_node)

+ 26 - 29
doc/doxygen/chapters/api/profiling.doxy

@@ -14,52 +14,52 @@ This structure contains information about the execution of a
 task. It is accessible from the field starpu_task::profiling_info if
 profiling was enabled.
 \var struct timespec starpu_profiling_task_info::submit_time
-Date of task submission (relative to the initialization of StarPU).
+    Date of task submission (relative to the initialization of StarPU).
 
 \var struct timespec starpu_profiling_task_info::push_start_time
-Time when the task was submitted to the scheduler.
+    Time when the task was submitted to the scheduler.
 
 \var struct timespec starpu_profiling_task_info::push_end_time
-Time when the scheduler finished with the task submission.
+    Time when the scheduler finished with the task submission.
 
 \var struct timespec starpu_profiling_task_info::pop_start_time
-Time when the scheduler started to be requested for a task, and eventually gave that task.
+    Time when the scheduler started to be requested for a task, and eventually gave that task.
 
 \var struct timespec starpu_profiling_task_info::pop_end_time
-Time when the scheduler finished providing the task for execution.
+    Time when the scheduler finished providing the task for execution.
 
 \var struct timespec starpu_profiling_task_info::acquire_data_start_time
-Time when the worker started fetching input data.
+    Time when the worker started fetching input data.
 
 \var struct timespec starpu_profiling_task_info::acquire_data_end_time
-Time when the worker finished fetching input data.
+    Time when the worker finished fetching input data.
 
 \var struct timespec starpu_profiling_task_info::start_time
-Date of task execution beginning (relative to the initialization of StarPU).
+    Date of task execution beginning (relative to the initialization of StarPU).
 
 \var struct timespec starpu_profiling_task_info::end_time
-Date of task execution termination (relative to the initialization of StarPU).
+    Date of task execution termination (relative to the initialization of StarPU).
 
 \var struct timespec starpu_profiling_task_info::release_data_start_time
-Time when the worker started releasing data.
+    Time when the worker started releasing data.
 
 \var struct timespec starpu_profiling_task_info::release_data_end_time
-Time when the worker finished releasing data.
+    Time when the worker finished releasing data.
 
 \var struct timespec starpu_profiling_task_info::callback_start_time
-Time when the worker started the application callback for the task.
+    Time when the worker started the application callback for the task.
 
 \var struct timespec starpu_profiling_task_info::callback_end_time
-Time when the worker finished the application callback for the task.
+    Time when the worker finished the application callback for the task.
 
 \var int starpu_profiling_task_info::workerid
-Identifier of the worker which has executed the task.
+    Identifier of the worker which has executed the task.
 
 \var uint64_t starpu_profiling_task_info::used_cycles
-Number of cycles used by the task, only available in the MoviSim
+    Number of cycles used by the task, only available in the MoviSim
 
 \var uint64_t starpu_profiling_task_info::stall_cycles
-Number of cycles stalled within the task, only available in the MoviSim
+    Number of cycles stalled within the task, only available in the MoviSim
 
 \var double starpu_profiling_task_info::energy_consumed
 Energy consumed by the task, only available in the MoviSim
@@ -100,18 +100,16 @@ todo
 
 \typedef STARPU_PROFILING_DISABLE
 \ingroup API_Profiling
-This value is used when calling the function
-starpu_profiling_status_set() to disable profiling.
+Used when calling the function starpu_profiling_status_set() to disable profiling.
 
 \typedef STARPU_PROFILING_ENABLE
 \ingroup API_Profiling
-This value is used when calling the function
-starpu_profiling_status_set() to enable profiling.
+Used when calling the function starpu_profiling_status_set() to enable profiling.
 
 \fn int starpu_profiling_status_set(int status)
 \ingroup API_Profiling
-This function sets the profiling status. Profiling is activated
-by passing \ref STARPU_PROFILING_ENABLE in status. Passing
+Set the profiling status. Profiling is activated
+by passing \ref STARPU_PROFILING_ENABLE in \p status. Passing
 \ref STARPU_PROFILING_DISABLE disables profiling. Calling this function
 resets all profiling measurements. When profiling is enabled, the
 field starpu_task::profiling_info points to a valid structure
@@ -126,13 +124,12 @@ there was an error.
 
 \fn void starpu_profiling_init(void)
 \ingroup API_Profiling
-This function resets performance counters and enable profiling if the
+Reset performance counters and enable profiling if the
 environment variable \ref STARPU_PROFILING is set to a positive value.
 
 \fn void starpu_profiling_set_id(int new_id)
 \ingroup API_Profiling
-This function sets the ID used for profiling trace filename. It
-needs to be called before starpu_init().
+Set the ID used for profiling trace filename. HAS to be called before starpu_init().
 
 \fn int starpu_profiling_worker_get_info(int workerid, struct starpu_profiling_worker_info *worker_info)
 \ingroup API_Profiling
@@ -164,21 +161,21 @@ Return the destination point of bus \p busid
 
 \fn double starpu_timing_timespec_delay_us(struct timespec *start, struct timespec *end)
 \ingroup API_Profiling
-Returns the time elapsed between \p start and \p end in microseconds.
+Return the time elapsed between \p start and \p end in microseconds.
 
 \fn double starpu_timing_timespec_to_us(struct timespec *ts)
 \ingroup API_Profiling
-Converts the given timespec \p ts into microseconds
+Convert the given timespec \p ts into microseconds
 
 \fn void starpu_profiling_bus_helper_display_summary(void)
 \ingroup API_Profiling
-Displays statistics about the bus on stderr. if the environment
+Display statistics about the bus on \c stderr. if the environment
 variable \ref STARPU_BUS_STATS is defined. The function is called
 automatically by starpu_shutdown().
 
 \fn void starpu_profiling_worker_helper_display_summary(void)
 \ingroup API_Profiling
-Displays statistics about the workers on stderr if the
+Displays statistic about the workers on \c stderr if the
 environment variable \ref STARPU_WORKER_STATS is defined. The function is
 called automatically by starpu_shutdown().
 

+ 10 - 10
doc/doxygen/chapters/api/running_driver.doxy

@@ -12,38 +12,38 @@
 structure for a driver
 \ingroup API_Running_Drivers
 \var enum starpu_worker_archtype starpu_driver::type
-The type of the driver. Only ::STARPU_CPU_WORKER,
-::STARPU_CUDA_WORKER and ::STARPU_OPENCL_WORKER are currently supported.
+    Type of the driver. Only ::STARPU_CPU_WORKER, ::STARPU_CUDA_WORKER
+    and ::STARPU_OPENCL_WORKER are currently supported.
 \var union starpu_driver::id
-The identifier of the driver.
+    Identifier of the driver.
 
 \fn int starpu_driver_run(struct starpu_driver *d)
 \ingroup API_Running_Drivers
 Initialize the given driver, run it until it receives a request to
-terminate, deinitialize it and return 0 on success. It returns
+terminate, deinitialize it and return 0 on success. Return
 <c>-EINVAL</c> if starpu_driver::type is not a valid StarPU device type
 (::STARPU_CPU_WORKER, ::STARPU_CUDA_WORKER or ::STARPU_OPENCL_WORKER).
 
 This is the same as using the following functions: calling
 starpu_driver_init(), then calling starpu_driver_run_once() in a loop,
-and eventually starpu_driver_deinit().
+and finally starpu_driver_deinit().
 
 \fn int starpu_driver_init(struct starpu_driver *d)
 \ingroup API_Running_Drivers
-Initialize the given driver. Returns 0 on success, <c>-EINVAL</c> if
-starpu_driver::type is not a valid ::starpu_worker_archtype.
+Initialize the given driver. Return 0 on success, <c>-EINVAL</c>
+if starpu_driver::type is not a valid ::starpu_worker_archtype.
 
 \fn int starpu_driver_run_once(struct starpu_driver *d)
 \ingroup API_Running_Drivers
-Run the driver once, then returns 0 on success, <c>-EINVAL</c> if starpu_driver::type is not a valid ::starpu_worker_archtype.
+Run the driver once, then return 0 on success, <c>-EINVAL</c> if starpu_driver::type is not a valid ::starpu_worker_archtype.
 
 \fn int starpu_driver_deinit(struct starpu_driver *d)
 \ingroup API_Running_Drivers
-Deinitialize the given driver. Returns 0 on success, <c>-EINVAL</c> if
+Deinitialize the given driver. Return 0 on success, <c>-EINVAL</c> if
 starpu_driver::type is not a valid ::starpu_worker_archtype.
 
 \fn void starpu_drivers_request_termination(void)
 \ingroup API_Running_Drivers
-Notify all running drivers they should terminate.
+Notify all running drivers that they should terminate.
 
 */

+ 2 - 2
doc/doxygen/chapters/api/scc_extensions.doxy

@@ -10,12 +10,12 @@
 
 \def STARPU_USE_SCC
 \ingroup API_SCC_Extensions
-This macro is defined when StarPU has been installed with SCC support.
+Defined when StarPU has been installed with SCC support.
 It should be used in your code to detect the availability of SCC.
 
 \def STARPU_MAXSCCDEVS
 \ingroup API_SCC_Extensions
-This macro defines the maximum number of SCC devices that are
+Define the maximum number of SCC devices that are
 supported by StarPU.
 
 \typedef starpu_scc_func_symbol_t

+ 32 - 32
doc/doxygen/chapters/api/scheduling_contexts.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * Copyright (C) 2016 Uppsala University
  * See the file version.doxy for copying conditions.
@@ -22,17 +22,17 @@ Performance counters used by the starpu to indicate the
 hypervisor how the application and the resources are executing.
 \ingroup API_Scheduling_Contexts
 \var void (*starpu_sched_ctx_performance_counters::notify_idle_cycle)(unsigned sched_ctx_id, int worker, double idle_time)
-        Informs the hypervisor for how long a worker has been idle in the specified context
+        Inform the hypervisor for how long a worker has been idle in the specified context
 \var void (*starpu_sched_ctx_performance_counters::notify_pushed_task)(unsigned sched_ctx_id, int worker)
-        Notifies the hypervisor that a task has been scheduled on the queue of the worker corresponding to the specified context
+        Notify the hypervisor that a task has been scheduled on the queue of the worker corresponding to the specified context
 \var void (*starpu_sched_ctx_performance_counters::notify_poped_task)(unsigned sched_ctx_id, int worker)
-        Informs the hypervisor that a task executing a specified number of instructions has been poped from the worker
+        Inform the hypervisor that a task executing a specified number of instructions has been poped from the worker
 \var void (*starpu_sched_ctx_performance_counters::notify_post_exec_task)(struct starpu_task *task, size_t data_size, uint32_t footprint, int hypervisor_tag, double flops)
-        Notifies the hypervisor that a task has just been executed
+        Notify the hypervisor that a task has just been executed
 \var void (*starpu_sched_ctx_performance_counters::notify_submitted_job)(struct starpu_task *task, uint32_t footprint, size_t data_size)
-        Notifies the hypervisor that a task has just been submitted
+        Notify the hypervisor that a task has just been submitted
 \var void (*starpu_sched_ctx_performance_counters::notify_delete_context)(unsigned sched_ctx)
-        Notifies the hypervisor that the context was deleted
+        Notify the hypervisor that the context was deleted
 
 
 @name Scheduling Contexts Basic API
@@ -45,7 +45,7 @@ modified at configure by using the option \ref enable-max-sched-ctxs "--enable-m
 
 \fn unsigned starpu_sched_ctx_create(int *workerids_ctx, int nworkers_ctx, const char *sched_ctx_name, ...)
 \ingroup API_Scheduling_Contexts
-This function creates a scheduling context with the given parameters
+Create a scheduling context with the given parameters
 (see below) and assigns the workers in \p workerids_ctx to execute the
 tasks submitted to it. The return value represents the identifier of
 the context that has just been created. It will be further used to
@@ -79,47 +79,47 @@ to a custom user data structure, to be retrieved by \ref starpu_sched_ctx_get_us
 
 \def STARPU_SCHED_CTX_POLICY_NAME
 \ingroup API_Scheduling_Contexts
-This macro is used when calling starpu_sched_ctx_create() to specify a
+Used when calling starpu_sched_ctx_create() to specify a
 name for a scheduling policy
 
 \def STARPU_SCHED_CTX_POLICY_STRUCT
 \ingroup API_Scheduling_Contexts
-This macro is used when calling starpu_sched_ctx_create() to specify a
+Used when calling starpu_sched_ctx_create() to specify a
 pointer to a scheduling policy
 
 \def STARPU_SCHED_CTX_POLICY_MIN_PRIO
 \ingroup API_Scheduling_Contexts
-This macro is used when calling starpu_sched_ctx_create() to specify a
+Used when calling starpu_sched_ctx_create() to specify a
 minimum scheduler priority value.
 
 \def STARPU_SCHED_CTX_POLICY_MAX_PRIO
 \ingroup API_Scheduling_Contexts
-This macro is used when calling starpu_sched_ctx_create() to specify a
+Used when calling starpu_sched_ctx_create() to specify a
 maximum scheduler priority value.
 
 \def STARPU_SCHED_CTX_AWAKE_WORKERS
 \ingroup API_Scheduling_Contexts
-This macro is used when calling starpu_sched_ctx_create() to specify a
+Used when calling starpu_sched_ctx_create() to specify a
 pointer to a scheduling policy
 
 \def STARPU_SCHED_CTX_POLICY_INIT
 \ingroup API_Scheduling_Contexts
-This macro is used when calling starpu_sched_ctx_create() to specify a
+Used when calling starpu_sched_ctx_create() to specify a
 function pointer allowing to initialize the scheduling policy.
 
 \def STARPU_SCHED_CTX_USER_DATA
 \ingroup API_Scheduling_Contexts
-This macro is used when calling starpu_sched_ctx_create() to specify a
+Used when calling starpu_sched_ctx_create() to specify a
 pointer to some user data related to the context being created.
 
 \def STARPU_SCHED_CTX_SUB_CTXS
 \ingroup API_Scheduling_Contexts
-This macro is used when calling starpu_sched_ctx_create() to specify 
+Used when calling starpu_sched_ctx_create() to specify
 a list of sub contextes of the current context.
 
 \def STARPU_SCHED_CTX_CUDA_NSMS
 \ingroup API_Scheduling_Contexts
-This macro is used when calling starpu_sched_ctx_create() in order
+Used when calling starpu_sched_ctx_create() in order
 to create a context on the NVIDIA GPU to specify the number of SMs
 the context should have
 
@@ -129,24 +129,24 @@ Create a context indicating an approximate interval of resources
 
 \fn void starpu_sched_ctx_register_close_callback(unsigned sched_ctx_id, void (*close_callback)(unsigned sched_ctx_id, void* args), void *args)
 \ingroup API_Scheduling_Contexts
-Execute the callback whenever the last task of the context finished executing, it is called with the pramaters: sched_ctx and any other paramter needed
-by the application (packed in a void*)
+Execute the callback whenever the last task of the context finishes executing; it is called with the parameters \p sched_ctx and any other parameter needed
+by the application (packed in \p args)
 
 \fn void starpu_sched_ctx_add_workers(int *workerids_ctx, int nworkers_ctx, unsigned sched_ctx_id)
 \ingroup API_Scheduling_Contexts
-This function adds dynamically the workers in \p workerids_ctx to the
+Add dynamically the workers in \p workerids_ctx to the
 context \p sched_ctx_id. The last argument cannot be greater than
 \ref STARPU_NMAX_SCHED_CTXS.
 
 \fn void starpu_sched_ctx_remove_workers(int *workerids_ctx, int nworkers_ctx, unsigned sched_ctx_id)
 \ingroup API_Scheduling_Contexts
-This function removes the workers in \p workerids_ctx from the context
+Remove the workers in \p workerids_ctx from the context
 \p sched_ctx_id. The last argument cannot be greater than
 STARPU_NMAX_SCHED_CTXS.
 
 \fn void starpu_sched_ctx_display_workers(unsigned sched_ctx_id, FILE *f)
 \ingroup API_Scheduling_Contexts
-This function prints on the file \p f the worker names belonging to the context \p sched_ctx_id
+Print on the file \p f the worker names belonging to the context \p sched_ctx_id
 
 \fn void starpu_sched_ctx_delete(unsigned sched_ctx_id)
 \ingroup API_Scheduling_Contexts
@@ -181,13 +181,13 @@ possible.
 
 \fn unsigned starpu_sched_ctx_get_workers_list(unsigned sched_ctx_id, int **workerids)
 \ingroup API_Scheduling_Contexts
-Returns the list of workers in the array \p workerids, the returned value is the 
+Return the list of workers in the array \p workerids, the returned value is the
 number of workers. The user should free the \p workerids table after finishing
 using it (it is allocated inside the function with the proper size)
 
 \fn unsigned starpu_sched_ctx_get_workers_list_raw(unsigned sched_ctx_id, int **workerids)
 \ingroup API_Scheduling_Contexts
-Returns the list of workers in the array \p workerids, the returned value is the 
+Return the list of workers in the array \p workerids, the returned value is the
 number of workers. This list is provided in raw order, i.e. not sorted by tree or list order,
 and the user should not free the \p workerids table.
 This function is thus much less costly than starpu_sched_ctx_get_workers_list.
@@ -200,7 +200,7 @@ blocked)
 
 \fn unsigned starpu_sched_ctx_get_nshared_workers(unsigned sched_ctx_id, unsigned sched_ctx_id2)
 \ingroup API_Scheduling_Contexts
-    Return the number of workers shared by two contexts.
+Return the number of workers shared by two contexts.
 
 \fn unsigned starpu_sched_ctx_contains_worker(int workerid, unsigned sched_ctx_id)
 \ingroup API_Scheduling_Contexts
@@ -234,7 +234,7 @@ statically allocate tasks with a default priority.
 
 \fn int starpu_sched_ctx_set_min_priority(unsigned sched_ctx_id, int min_prio)
 \ingroup API_Scheduling_Contexts
-Defines the minimum task priority level supported by the scheduling
+Define the minimum task priority level supported by the scheduling
 policy of the given scheduler context. The default minimum priority
 level is the same as the default priority level which is 0 by
 convention. The application may access that value by calling the function
@@ -244,7 +244,7 @@ should not be used directly from the application.
 
 \fn int starpu_sched_ctx_set_max_priority(unsigned sched_ctx_id, int max_prio)
 \ingroup API_Scheduling_Contexts
-Defines the maximum priority level supported by the scheduling policy
+Define the maximum priority level supported by the scheduling policy
 of the given scheduler context. The default maximum priority level is
 1. The application may access that value by calling the
 starpu_sched_ctx_get_max_priority function. This function should only
@@ -253,12 +253,12 @@ should not be used directly from the application.
 
 \fn int starpu_sched_ctx_get_min_priority(unsigned sched_ctx_id)
 \ingroup API_Scheduling_Contexts
-Returns the current minimum priority level supported by the scheduling
+Return the current minimum priority level supported by the scheduling
 policy of the given scheduler context.
 
 \fn int starpu_sched_ctx_get_max_priority(unsigned sched_ctx_id)
 \ingroup API_Scheduling_Contexts
-Returns the current maximum priority level supported by the scheduling
+Return the current maximum priority level supported by the scheduling
 policy of the given scheduler context.
 
 \fn int starpu_sched_ctx_min_priority_is_set(unsigned sched_ctx_id)
@@ -294,7 +294,7 @@ Return the worker collection managed by the indicated context
 
 \fn void starpu_sched_ctx_set_perf_counters(unsigned sched_ctx_id, void *perf_counters)
 \ingroup API_Scheduling_Contexts
-Indicates to starpu the pointer to the performance counter
+Indicate to starpu the pointer to the performance counter
 
 \fn void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id)
 \ingroup API_Scheduling_Contexts
@@ -316,12 +316,12 @@ additional condition variables) the context
 
 \fn void *starpu_sched_ctx_get_policy_data(unsigned sched_ctx_id)
 \ingroup API_Scheduling_Contexts
-Return the scheduling policy data (private information of the scheduler) of the contexts previously 
+Return the scheduling policy data (private information of the scheduler) of the contexts previously
 assigned to.
 
 \fn void *starpu_sched_ctx_exec_parallel_code(void* (*func)(void*), void *param, unsigned sched_ctx_id)
 \ingroup API_Scheduling_Contexts
-execute any parallel code on the workers of the sched_ctx (workers are blocked)
+Execute any parallel code on the workers of the sched_ctx (workers are blocked)
 
 \fn int starpu_sched_ctx_get_nready_tasks(unsigned sched_ctx_id)
 \ingroup API_Scheduling_Contexts

+ 16 - 16
doc/doxygen/chapters/api/scheduling_policy.doxy

@@ -20,7 +20,7 @@ configure by using the option \ref enable-maximplementations "--enable-maximplem
 
 \struct starpu_sched_policy
 \ingroup API_Scheduling_Policy
-This structure contains all the methods that implement a
+Contain all the methods that implement a
 scheduling policy. An application may specify which scheduling
 strategy in the field starpu_conf::sched_policy passed to the function
 starpu_init().
@@ -30,9 +30,9 @@ For each task going through the scheduler, the following methods get called in t
 <ul>
 <li>starpu_sched_policy::submit_hook when the task is submitted</li>
 <li>starpu_sched_policy::push_task when the task becomes ready. The scheduler is here <b>given</b> the task</li>
-<li>starpu_sched_policy::pop_task when a worker is idle. The scheduler here <b>gives</b> back the task to the core</li>
+<li>starpu_sched_policy::pop_task when the worker is idle. The scheduler here <b>gives</b> back the task to the core</li>
 <li>starpu_sched_policy::pre_exec_hook right before the worker actually starts the task computation (after transferring any missing data).</li>
-<li>starpu_sched_policy::post_exec_hook right after the worker actually completed the task computation.</li>
+<li>starpu_sched_policy::post_exec_hook right after the worker actually completes the task computation.</li>
 </ul>
 
 For each task not going through the scheduler (because starpu_task::execute_on_a_specific_worker was set), these get called:
@@ -41,7 +41,7 @@ For each task not going through the scheduler (because starpu_task::execute_on_a
 <li>starpu_sched_policy::submit_hook when the task is submitted</li>
 <li>starpu_sched_policy::push_task_notify when the task becomes ready. This is just a notification, the scheduler does not have to do anything about the task.</li>
 <li>starpu_sched_policy::pre_exec_hook right before the worker actually starts the task computation (after transferring any missing data).</li>
-<li>starpu_sched_policy::post_exec_hook right after the worker actually completed the task computation.</li>
+<li>starpu_sched_policy::post_exec_hook right after the worker actually completes the task computation.</li>
 </ul>
 
 
@@ -110,7 +110,7 @@ block and wake up all workers.
 \fn int starpu_sched_set_min_priority(int min_prio)
 \ingroup API_Scheduling_Policy
 TODO: check if this is correct
-Defines the minimum task priority level supported by the scheduling
+Define the minimum task priority level supported by the scheduling
 policy. The default minimum priority level is the same as the default
 priority level which is 0 by convention. The application may access
 that value by calling the function starpu_sched_get_min_priority().
@@ -121,7 +121,7 @@ application.
 \fn int starpu_sched_set_max_priority(int max_prio)
 \ingroup API_Scheduling_Policy
 TODO: check if this is correct
-Defines the maximum priority level supported by the scheduling policy.
+Define the maximum priority level supported by the scheduling policy.
 The default maximum priority level is 1. The application may access
 that value by calling the function starpu_sched_get_max_priority().
 This function should only be called from the initialization method of
@@ -131,13 +131,13 @@ application.
 \fn int starpu_sched_get_min_priority(void)
 \ingroup API_Scheduling_Policy
 TODO: check if this is correct
-Returns the current minimum priority level supported by the scheduling
+Return the current minimum priority level supported by the scheduling
 policy
 
 \fn int starpu_sched_get_max_priority(void)
 \ingroup API_Scheduling_Policy
 TODO: check if this is correct
-Returns the current maximum priority level supported by the scheduling
+Return the current maximum priority level supported by the scheduling
 policy
 
 \fn int starpu_push_local_task(int workerid, struct starpu_task *task, int back)
@@ -150,7 +150,7 @@ Setting \p back to 0 therefore ensures a FIFO ordering.
 
 \fn int starpu_push_task_end(struct starpu_task *task)
 \ingroup API_Scheduling_Policy
-This function must be called by a scheduler to notify that the given
+Must be called by a scheduler to notify that the given
 task has just been pushed.
 
 \fn int starpu_worker_can_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl)
@@ -181,24 +181,24 @@ check for at least one implementation without determining which.
 
 \fn uint32_t starpu_task_footprint(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl)
 \ingroup API_Scheduling_Policy
-Returns the footprint for a given task, taking into account user-provided
+Return the footprint for a given task, taking into account user-provided
 perfmodel footprint or size_base functions.
 
 \fn uint32_t starpu_task_data_footprint(struct starpu_task *task)
 \ingroup API_Scheduling_Policy
-Returns the raw footprint for the data of a given task (without taking into account user-provided functions).
+Return the raw footprint for the data of a given task (without taking into account user-provided functions).
 
 \fn double starpu_task_expected_length(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl)
 \ingroup API_Scheduling_Policy
-Returns expected task duration in micro-seconds.
+Return expected task duration in micro-seconds.
 
 \fn double starpu_worker_get_relative_speedup(struct starpu_perfmodel_arch *perf_arch)
 \ingroup API_Scheduling_Policy
-Returns an estimated speedup factor relative to CPU speed
+Return an estimated speedup factor relative to CPU speed
 
 \fn double starpu_task_expected_data_transfer_time(unsigned memory_node, struct starpu_task *task)
 \ingroup API_Scheduling_Policy
-Returns expected data transfer time in micro-seconds.
+Return expected data transfer time in micro-seconds.
 
 \fn double starpu_data_expected_transfer_time(starpu_data_handle_t handle, unsigned memory_node, enum starpu_data_access_mode mode)
 \ingroup API_Scheduling_Policy
@@ -206,11 +206,11 @@ Predict the transfer time (in micro-seconds) to move \p handle to a memory node
 
 \fn double starpu_task_expected_energy(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl)
 \ingroup API_Scheduling_Policy
-Returns expected energy consumption in J
+Return expected energy consumption in J
 
 \fn double starpu_task_expected_conversion_time(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl)
 \ingroup API_Scheduling_Policy
-Returns expected conversion time in ms (multiformat interface only)
+Return expected conversion time in ms (multiformat interface only)
 
 \fn int starpu_get_prefetch_flag(void)
 \ingroup API_Scheduling_Policy

+ 12 - 12
doc/doxygen/chapters/api/standard_memory_library.doxy

@@ -45,7 +45,8 @@ to indicate that while the memory allocation should be kept in the limits
 defined for ::STARPU_MALLOC_COUNT, no reclaiming should be performed by
 starpu_malloc_flags() itself, thus potentially overflowing the
 memory node a bit. StarPU will reclaim memory after next task termination,
-according to \ref STARPU_MINIMUM_AVAILABLE_MEM and \ref STARPU_TARGET_AVAILABLE_MEM
+according to the \ref STARPU_MINIMUM_AVAILABLE_MEM, \ref STARPU_TARGET_AVAILABLE_MEM,
+\ref STARPU_MINIMUM_CLEAN_BUFFERS, and \ref STARPU_TARGET_CLEAN_BUFFERS
 environment variables. If ::STARPU_MEMORY_WAIT is set, no overflowing will happen,
 starpu_malloc_flags() will wait for other eviction mechanisms to release enough memory.
 
@@ -60,20 +61,20 @@ and write to normally, but get bogus values.
 
 \fn int starpu_malloc_flags(void **A, size_t dim, int flags)
 \ingroup API_Standard_Memory_Library
-Performs a memory allocation based on the constraints defined
+Perform a memory allocation based on the constraints defined
 by the given flag.
 
 \fn void starpu_malloc_set_align(size_t align)
 \ingroup API_Standard_Memory_Library
-This function sets an alignment constraints for starpu_malloc()
+Set an alignment constraint for starpu_malloc()
 allocations. \p align must be a power of two. This is for instance called
 automatically by the OpenCL driver to specify its own alignment
 constraints.
 
 \fn int starpu_malloc(void **A, size_t dim)
 \ingroup API_Standard_Memory_Library
-This function allocates data of the given size \p dim in main memory, and
-returns the pointer to the allocated data through \p A.
+Allocate data of the given size \p dim in main memory, and
+return the pointer to the allocated data through \p A.
 It will also try to pin it in CUDA or OpenCL, so that data transfers
 from this buffer can be asynchronous, and thus permit data transfer
 and computation overlapping. The allocated buffer must be freed thanks
@@ -81,24 +82,23 @@ to the starpu_free() function.
 
 \fn int starpu_free(void *A)
 \ingroup API_Standard_Memory_Library
-This function frees memory which has previously been allocated
-with starpu_malloc().
+Free memory which has previously been allocated with starpu_malloc().
 
 \fn int starpu_free_flags(void *A, size_t dim, int flags)
 \ingroup API_Standard_Memory_Library
-This function frees memory by specifying its size. The given
+Free memory by specifying its size. The given
 flags should be consistent with the ones given to starpu_malloc_flags()
 when allocating the memory.
 
 \fn int starpu_memory_pin(void *addr, size_t size)
 \ingroup API_Standard_Memory_Library
-This function pins the given memory area, so that CPU-GPU transfers can be done
+Pin the given memory area, so that CPU-GPU transfers can be done
 asynchronously with DMAs. The memory must be unpinned with
 starpu_memory_unpin() before being freed. Returns 0 on success, -1 on error.
 
 \fn int starpu_memory_unpin(void *addr, size_t size)
 \ingroup API_Standard_Memory_Library
-This function unpins the given memory area previously pinned with
+Unpin the given memory area previously pinned with
 starpu_memory_pin(). Returns 0 on success, -1 on error.
 
 \fn ssize_t starpu_memory_get_total(unsigned node)
@@ -109,7 +109,7 @@ on the node. Otherwise return -1.
 
 \fn ssize_t starpu_memory_get_total_all_nodes()
 \ingroup API_Standard_Memory_Library
-return the amount of total memory on all memory nodes for whose a memory limit
+Return the amount of total memory on all memory nodes for which a memory limit
 is defined (see Section \ref HowToLimitMemoryPerNode).
 
 \fn ssize_t starpu_memory_get_available(unsigned node)
@@ -120,7 +120,7 @@ on the node. Otherwise return -1.
 
 \fn ssize_t starpu_memory_get_available_all_nodes()
 \ingroup API_Standard_Memory_Library
-return the amount of available memory on all memory nodes for whose a memory limit
+Return the amount of available memory on all memory nodes for whose a memory limit
 is defined (see Section \ref HowToLimitMemoryPerNode).
 
 \fn int starpu_memory_allocate(unsigned node, size_t size, int flags)

+ 3 - 3
doc/doxygen/chapters/api/task_lists.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -12,9 +12,9 @@
 Stores a double-chained list of tasks
 \ingroup API_Task_Lists
 \var struct starpu_task *starpu_task_list::head
-head of the list
+    head of the list
 \var struct starpu_task *starpu_task_list::tail
-tail of the list
+    tail of the list
 
 \fn void starpu_task_list_init(struct starpu_task_list *list)
 \ingroup API_Task_Lists

+ 62 - 81
doc/doxygen/chapters/api/threads.doxy

@@ -15,190 +15,171 @@ mode is enabled (\ref SimGridSupport).
 
 \def STARPU_PTHREAD_CREATE_ON
 \ingroup API_Threads
-This macro calls the function starpu_pthread_create_on() and aborts on error.
+Call starpu_pthread_create_on() and abort on error.
 
 \def STARPU_PTHREAD_CREATE
 \ingroup API_Threads
-This macro calls the function starpu_pthread_create() and aborts on error.
+Call starpu_pthread_create() and abort on error.
 
 \def STARPU_PTHREAD_MUTEX_INIT
 \ingroup API_Threads
-This macro calls the function starpu_pthread_mutex_init() and aborts
-on error.
+Call starpu_pthread_mutex_init() and abort on error.
 
 \def STARPU_PTHREAD_MUTEX_DESTROY
 \ingroup API_Threads
-This macro calls the function starpu_pthread_mutex_destroy() and
-aborts on error.
+Call starpu_pthread_mutex_destroy() and abort on error.
 
 \def STARPU_PTHREAD_MUTEX_LOCK
 \ingroup API_Threads
-This macro calls the function starpu_pthread_mutex_lock() and aborts
-on error.
+Call starpu_pthread_mutex_lock() and abort on error.
 
 \def STARPU_PTHREAD_MUTEX_UNLOCK
 \ingroup API_Threads
-This macro calls the function starpu_pthread_mutex_unlock() and aborts
-on error.
+Call starpu_pthread_mutex_unlock() and abort on error.
 
 \def STARPU_PTHREAD_KEY_CREATE
 \ingroup API_Threads
-This macro calls the function starpu_pthread_key_create() and aborts
-on error.
+Call starpu_pthread_key_create() and abort on error.
 
 \def STARPU_PTHREAD_KEY_DELETE
 \ingroup API_Threads
-This macro calls the function starpu_pthread_key_delete() and aborts
-on error.
+Call starpu_pthread_key_delete() and abort on error.
 
 \def STARPU_PTHREAD_SETSPECIFIC
 \ingroup API_Threads
-This macro calls the function starpu_pthread_setspecific() and aborts
-on error.
+Call starpu_pthread_setspecific() and abort on error.
 
 \def STARPU_PTHREAD_GETSPECIFIC
 \ingroup API_Threads
-This macro calls the function starpu_pthread_getspecific() and aborts
-on error.
+Call starpu_pthread_getspecific() and abort on error.
 
 \def STARPU_PTHREAD_RWLOCK_INIT
 \ingroup API_Threads
-This macro calls the function starpu_pthread_rwlock_init() and aborts
-on error.
+Call starpu_pthread_rwlock_init() and abort on error.
 
 \def STARPU_PTHREAD_RWLOCK_RDLOCK
 \ingroup API_Threads
-This macro calls the function starpu_pthread_rwlock_rdlock() and
-aborts on error.
+Call starpu_pthread_rwlock_rdlock() and abort on error.
 
 \def STARPU_PTHREAD_RWLOCK_WRLOCK
 \ingroup API_Threads
-This macro calls the function starpu_pthread_rwlock_wrlock() and
-aborts on error.
+Call starpu_pthread_rwlock_wrlock() and abort on error.
 
 \def STARPU_PTHREAD_RWLOCK_UNLOCK
 \ingroup API_Threads
-This macro calls the function starpu_pthread_rwlock_unlock() and
-aborts on error.
+Call starpu_pthread_rwlock_unlock() and abort on error.
 
 \def STARPU_PTHREAD_RWLOCK_DESTROY
 \ingroup API_Threads
-This macro calls the function starpu_pthread_rwlock_destroy() and
-aborts on error.
+Call starpu_pthread_rwlock_destroy() and abort on error.
 
 \def STARPU_PTHREAD_COND_INIT
 \ingroup API_Threads
-This macro calls the function starpu_pthread_cond_init() and aborts on error.
+Call starpu_pthread_cond_init() and abort on error.
 
 \def STARPU_PTHREAD_COND_DESTROY
 \ingroup API_Threads
-This macro calls the function starpu_pthread_cond_destroy() and aborts
-on error.
+Call starpu_pthread_cond_destroy() and abort on error.
 
 \def STARPU_PTHREAD_COND_SIGNAL
 \ingroup API_Threads
-This macro calls the function starpu_pthread_cond_signal() and aborts
-on error.
+Call starpu_pthread_cond_signal() and abort on error.
 
 \def STARPU_PTHREAD_COND_BROADCAST
 \ingroup API_Threads
-This macro calls the function starpu_pthread_cond_broadcast() and
-aborts on error.
+Call starpu_pthread_cond_broadcast() and abort on error.
 
 \def STARPU_PTHREAD_COND_WAIT
 \ingroup API_Threads
-This macro calls the function starpu_pthread_cond_wait() and aborts on error.
+Call starpu_pthread_cond_wait() and abort on error.
 
 \def STARPU_PTHREAD_BARRIER_INIT
 \ingroup API_Threads
-This macro calls the function starpu_pthread_barrier_init() and aborts
-on error.
+Call starpu_pthread_barrier_init() and abort on error.
 
 \def STARPU_PTHREAD_BARRIER_DESTROY
 \ingroup API_Threads
-This macro calls the function starpu_pthread_barrier_destroy() and
-aborts on error.
+Call starpu_pthread_barrier_destroy() and abort on error.
 
 \def STARPU_PTHREAD_BARRIER_WAIT
 \ingroup API_Threads
-This macro calls the function starpu_pthread_barrier_wait() and aborts
-on error.
+Call starpu_pthread_barrier_wait() and abort on error.
 
 \fn int starpu_pthread_create_on(char *name, starpu_pthread_t *thread, const starpu_pthread_attr_t *attr, void *(*start_routine) (void *), void *arg, int where)
 \ingroup API_Threads
 
 \fn int starpu_pthread_create(starpu_pthread_t *thread, const starpu_pthread_attr_t *attr, void *(*start_routine) (void *), void *arg)
 \ingroup API_Threads
-This function starts a new thread in the calling process.  The new
+Start a new thread in the calling process. The new
 thread starts execution by invoking \p start_routine; \p arg is passed
 as the sole argument of \p start_routine.
 
 \fn int starpu_pthread_join(starpu_pthread_t thread, void **retval)
 \ingroup API_Threads
-This function waits for the thread specified by \p thread to
+Wait for the thread specified by \p thread to
 terminate.  If that thread has already terminated, then the function
 returns immediately. The thread specified by \p thread must be
 joinable.
 
 \fn int starpu_pthread_exit(void *retval)
 \ingroup API_Threads
-This function terminates the calling thread and returns a value via
+Terminate the calling thread and return a value via
 \p retval that (if the thread is joinable) is available to another thread
 in the same process that calls starpu_pthread_join().
 
 \fn int starpu_pthread_attr_init(starpu_pthread_attr_t *attr)
 \ingroup API_Threads
-This function initializes the thread attributes object pointed to by
+Initialize the thread attributes object pointed to by
 \p attr with default attribute values.
 
-It does not do anything when the simulated performance mode is enabled
+Do not do anything when the simulated performance mode is enabled
 (\ref SimGridSupport).
 
 \fn int starpu_pthread_attr_destroy(starpu_pthread_attr_t *attr)
 \ingroup API_Threads
-This function destroys a thread attributes object which is no longer
+Destroy a thread attributes object which is no longer
 required. Destroying a thread attributes object has no effect on
 threads that were created using that object.
 
-It does not do anything when the simulated performance mode is enabled
+Do not do anything when the simulated performance mode is enabled
 (\ref SimGridSupport).
 
 \fn int starpu_pthread_attr_setdetachstate(starpu_pthread_attr_t *attr, int detachstate)
 \ingroup API_Threads
-This function sets the detach state attribute of the thread attributes
+Set the detach state attribute of the thread attributes
 object referred to by \p attr to the value specified in \p
 detachstate.  The detach state attribute determines whether a thread
 created using the thread attributes object \p attr will be created in
 a joinable or a detached state.
 
-It does not do anything when the simulated performance mode is enabled
+Do not do anything when the simulated performance mode is enabled
 (\ref SimGridSupport).
 
 \fn int starpu_pthread_mutex_init(starpu_pthread_mutex_t *mutex, const starpu_pthread_mutexattr_t *mutexattr)
 \ingroup API_Threads
-This function initializes the mutex object pointed to by \p mutex
+Initialize the mutex object pointed to by \p mutex
 according to the mutex attributes specified in \p mutexattr.  If \p
 mutexattr is <c>NULL</c>, default attributes are used instead.
 
 \fn int starpu_pthread_mutex_destroy(starpu_pthread_mutex_t *mutex)
 \ingroup API_Threads
-This function destroys a mutex object, freeing the resources it might
+Destroy a mutex object, and free the resources it might
 hold. The mutex must be unlocked on entrance.
 
 \fn int starpu_pthread_mutex_lock(starpu_pthread_mutex_t *mutex)
 \ingroup API_Threads
-This function locks the given mutex. If the mutex is currently
+Lock the given \p mutex. If \p mutex is currently
 unlocked, it becomes locked and owned by the calling thread, and the
-function returns immediately. If the mutex is already locked by
-another thread, the function suspends the calling thread until the
-mutex is unlocked.
+function returns immediately. If \p mutex is already locked by
+another thread, the function suspends the calling thread until
+\p mutex is unlocked.
 
 This function also produces trace when the configure option
 \ref enable-fxt-lock "--enable-fxt-lock" is enabled.
 
 \fn int starpu_pthread_mutex_unlock(starpu_pthread_mutex_t *mutex)
 \ingroup API_Threads
-This function unlocks the given mutex. The mutex is assumed to be
+Unlock the given \p mutex. The mutex is assumed to be
 locked and owned by the calling thread on entrance to
 starpu_pthread_mutex_unlock().
 
@@ -207,7 +188,7 @@ This function also produces trace when the configure option
 
 \fn int starpu_pthread_mutex_trylock(starpu_pthread_mutex_t *mutex)
 \ingroup API_Threads
-This function behaves identically to starpu_pthread_mutex_lock(),
+Behave identically to starpu_pthread_mutex_lock(),
 except that it does not block the calling thread if the mutex is
 already locked by another thread (or by the calling thread in the case
 of a ``fast''  mutex). Instead, the function returns immediately with
@@ -218,7 +199,7 @@ This function also produces trace when the configure option
 
 \typedef STARPU_PTHREAD_MUTEX_INITIALIZER
 \ingroup API_Threads
-This macro initializes the mutex given in parameter.
+Initialize the mutex given in parameter.
 
 \fn int starpu_pthread_mutexattr_gettype(const starpu_pthread_mutexattr_t *attr, int *type)
 \ingroup API_Threads
@@ -238,52 +219,52 @@ todo
 
 \fn int starpu_pthread_key_create(starpu_pthread_key_t *key, void (*destr_function) (void *))
 \ingroup API_Threads
-This function allocates a new TSD key. The key is stored in the
+Allocate a new TSD key. The key is stored in the
 location pointed to by \p key.
 
 \fn int starpu_pthread_key_delete(starpu_pthread_key_t key)
 \ingroup API_Threads
-This function deallocates a TSD key. It does not check whether
+Deallocate a TSD key. Do not check whether
 non-<c>NULL</c> values are associated with that key in the currently
 executing threads, nor call the destructor function associated with
 the key.
 
 \fn int starpu_pthread_setspecific(starpu_pthread_key_t key, const void *pointer)
 \ingroup API_Threads
-This function changes the value associated with \p key in the calling
+Change the value associated with \p key in the calling
 thread, storing the given \p pointer instead.
 
 \fn void *starpu_pthread_getspecific(starpu_pthread_key_t key)
 \ingroup API_Threads
-This function returns the value associated with \p key on success, and
+Return the value associated with \p key on success, and
 <c>NULL</c> on error.
 
 \typedef STARPU_PTHREAD_COND_INITIALIZER
 \ingroup API_Threads
-This macro initializes the condition variable given in parameter.
+Initialize the condition variable given in parameter.
 
 \fn int starpu_pthread_cond_init(starpu_pthread_cond_t *cond, starpu_pthread_condattr_t *cond_attr)
 \ingroup API_Threads
-This function initializes the condition variable \p cond, using the
+Initialize the condition variable \p cond, using the
 condition attributes specified in \p cond_attr, or default attributes
 if \p cond_attr is <c>NULL</c>.
 
 \fn int starpu_pthread_cond_signal(starpu_pthread_cond_t *cond)
 \ingroup API_Threads
-This function restarts one of the threads that are waiting on the
+Restart one of the threads that are waiting on the
 condition variable \p cond. If no threads are waiting on \p cond,
 nothing happens. If several threads are waiting on \p cond, exactly
-one is restarted, but it not specified which.
+one is restarted, but it is not specified which.
 
 \fn int starpu_pthread_cond_broadcast(starpu_pthread_cond_t *cond)
 \ingroup API_Threads
-This function restarts all the threads that are waiting on the
+Restart all the threads that are waiting on the
 condition variable \p cond. Nothing happens if no threads are waiting on \p cond.
 
 \fn int starpu_pthread_cond_wait(starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex)
 \ingroup API_Threads
-This function atomically unlocks the mutex (as per
-starpu_pthread_mutex_unlock()) and waits for the condition variable \p cond
+Atomically unlock \p mutex (as per
+starpu_pthread_mutex_unlock()) and wait for the condition variable \p cond
 to be signaled. The thread execution is suspended and does not consume
 any CPU time until the condition variable is signaled. The mutex must
 be locked by the calling thread on entrance to
@@ -295,27 +276,27 @@ This function also produces trace when the configure option
 
 \fn int starpu_pthread_cond_timedwait(starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex, const struct timespec *abstime)
 \ingroup API_Threads
-This function atomically unlocks \p mutex and waits on \p cond, as
-starpu_pthread_cond_wait() does, but it also bounds the duration of
-the wait.
+Atomically unlock \p mutex and wait on \p cond, as
+starpu_pthread_cond_wait() does, but also bound the duration of
+the wait with \p abstime.
 
 \fn int starpu_pthread_cond_destroy(starpu_pthread_cond_t *cond)
 \ingroup API_Threads
-This function destroys a condition variable, freeing the resources it
+Destroy a condition variable, freeing the resources it
 might hold. No threads must be waiting on the condition variable on
 entrance to the function.
 
 \fn int starpu_pthread_rwlock_init(starpu_pthread_rwlock_t *rwlock, const starpu_pthread_rwlockattr_t *attr)
 \ingroup API_Threads
-This function is the same as starpu_pthread_mutex_init().
+Similar to starpu_pthread_mutex_init().
 
 \fn int starpu_pthread_rwlock_destroy(starpu_pthread_rwlock_t *rwlock)
 \ingroup API_Threads
-This function is the same as starpu_pthread_mutex_destroy().
+Similar to starpu_pthread_mutex_destroy().
 
 \fn int starpu_pthread_rwlock_rdlock(starpu_pthread_rwlock_t *rwlock)
 \ingroup API_Threads
-This function is the same as starpu_pthread_mutex_lock().
+Similar to starpu_pthread_mutex_lock().
 
 \fn int starpu_pthread_rwlock_tryrdlock(starpu_pthread_rwlock_t *rwlock)
 \ingroup API_Threads
@@ -323,7 +304,7 @@ todo
 
 \fn int starpu_pthread_rwlock_wrlock(starpu_pthread_rwlock_t *rwlock)
 \ingroup API_Threads
-This function is the same as starpu_pthread_mutex_lock().
+Similar to starpu_pthread_mutex_lock().
 
 \fn int starpu_pthread_rwlock_trywrlock(starpu_pthread_rwlock_t *rwlock)
 \ingroup API_Threads
@@ -331,7 +312,7 @@ todo
 
 \fn int starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock)
 \ingroup API_Threads
-This function is the same as starpu_pthread_mutex_unlock().
+Similar to starpu_pthread_mutex_unlock().
 
 \fn int starpu_pthread_barrier_init(starpu_pthread_barrier_t *barrier, const starpu_pthread_barrierattr_t *attr, unsigned count)
 \ingroup API_Threads
@@ -367,7 +348,7 @@ todo
 
 \fn void starpu_sleep(float nb_sec)
 \ingroup API_Threads
-This is the same as calling Unix' sleep function, except that it takes a float
+Similar to calling Unix' \c sleep function, except that it takes a float
 to allow sub-second sleeping, and when StarPU is compiled in simgrid mode it
 does not really sleep but just makes simgrid record that the thread has taken
 some time to sleep.

+ 19 - 22
doc/doxygen/chapters/api/toolbox.doxy

@@ -17,52 +17,51 @@ Return true (non-zero) if GCC version \p maj.\p min or later is being used (macr
 
 \def STARPU_UNLIKELY
 \ingroup API_Toolbox
-When building with a GNU C Compiler, this macro allows programmers to mark an expression as unlikely.
+When building with a GNU C Compiler, allow programmers to mark an expression as unlikely.
 
 \def STARPU_LIKELY
 \ingroup API_Toolbox
-When building with a GNU C Compiler, this macro allows programmers to mark an expression as likely.
+When building with a GNU C Compiler, allow programmers to mark an expression as likely.
 
 \def STARPU_ATTRIBUTE_UNUSED
 \ingroup API_Toolbox
-When building with a GNU C Compiler, this macro is defined to __attribute__((unused))
+When building with a GNU C Compiler, defined to __attribute__((unused))
 
 \def STARPU_ATTRIBUTE_INTERNAL
 \ingroup API_Toolbox
-When building with a GNU C Compiler, this macro is defined to __attribute__((visibility ("internal")))
+When building with a GNU C Compiler, defined to __attribute__((visibility ("internal")))
 
 \def STARPU_ATTRIBUTE_MALLOC
 \ingroup API_Toolbox
-When building with a GNU C Compiler, this macro is defined to __attribute__((malloc))
+When building with a GNU C Compiler, defined to __attribute__((malloc))
 
 \def STARPU_ATTRIBUTE_WARN_UNUSED_RESULT
 \ingroup API_Toolbox
-When building with a GNU C Compiler, this macro is defined to __attribute__((warn_unused_result))
+When building with a GNU C Compiler, defined to __attribute__((warn_unused_result))
 
 \def STARPU_ATTRIBUTE_PURE
 \ingroup API_Toolbox
-When building with a GNU C Compiler, this macro is defined to __attribute__((pure))
+When building with a GNU C Compiler, defined to __attribute__((pure))
 
 \def STARPU_ATTRIBUTE_ALIGNED
 \ingroup API_Toolbox
-When building with a GNU C Compiler, this macro is defined to__attribute__((aligned(size)))
+When building with a GNU C Compiler, defined to __attribute__((aligned(size)))
 
 \def STARPU_WARN_UNUSED_RESULT
 \ingroup API_Toolbox
-When building with a GNU C Compiler, this macro is defined to__attribute__((__warn_unused_result__))
+When building with a GNU C Compiler, defined to __attribute__((__warn_unused_result__))
 
 \def STARPU_POISON_PTR
 \ingroup API_Toolbox
-This macro defines a value which can be used to mark pointers as
-invalid values.
+Define a value which can be used to mark pointers as invalid values.
 
 \def STARPU_MIN
 \ingroup API_Toolbox
-This macro returns the min of the two parameters.
+Return the min of the two parameters.
 
 \def STARPU_MAX
 \ingroup API_Toolbox
-This macro returns the max of the two parameters.
+Return the max of the two parameters.
 
 \def STARPU_ASSERT
 \ingroup API_Toolbox
@@ -77,21 +76,19 @@ given message will be displayed.
 
 \def STARPU_ABORT
 \ingroup API_Toolbox
-This macro aborts the program.
+Abort the program.
 
 \def STARPU_ABORT_MSG
 \ingroup API_Toolbox
-This macro aborts the program, and displays the given message.
+Abort the program, and display the given message.
 
 \def STARPU_CHECK_RETURN_VALUE
 \ingroup API_Toolbox
-If \p err has a value which is not 0, the given message is displayed
-before aborting.
+Abort the program (after displaying \p message) if \p err has a value which is not 0.
 
 \def STARPU_CHECK_RETURN_VALUE_IS
 \ingroup API_Toolbox
-If \p err has a value which is not \p value, the given message is displayed
-before aborting.
+Abort the program (after displaying \p message) if \p err is different from \p value.
 
 \def STARPU_RMB
 \ingroup API_Toolbox
@@ -103,9 +100,9 @@ This macro can be used to do a synchronization.
 
 \fn int starpu_get_env_number(const char *str)
 \ingroup API_Toolbox
-If \p str is the name of a existing environment variable which is
-defined to an integer, the function returns the value of the integer.
-It returns 0 otherwise.
+Return the integer value of the environment variable named \p str.
+Return 0 otherwise (the variable does not exist or has a non-integer
+value).
 
 */
 

+ 73 - 88
doc/doxygen/chapters/api/top.doxy

@@ -12,163 +12,149 @@
 \ingroup API_StarPUTop_Interface
 StarPU-Top Data type
 \var starpu_top_data_type::STARPU_TOP_DATA_BOOLEAN
-\ingroup API_StarPUTop_Interface
-todo
+    todo
 \var starpu_top_data_type::STARPU_TOP_DATA_INTEGER
-\ingroup API_StarPUTop_Interface
-todo
+    todo
 \var starpu_top_data_type::STARPU_TOP_DATA_FLOAT
-\ingroup API_StarPUTop_Interface
-todo
+    todo
 
 \enum starpu_top_param_type
 \ingroup API_StarPUTop_Interface
 StarPU-Top Parameter type
 \var starpu_top_param_type::STARPU_TOP_PARAM_BOOLEAN
-\ingroup API_StarPUTop_Interface
-todo
+    todo
 \var starpu_top_param_type::STARPU_TOP_PARAM_INTEGER
-\ingroup API_StarPUTop_Interface
-todo
+    todo
 \var starpu_top_param_type::STARPU_TOP_PARAM_FLOAT
-\ingroup API_StarPUTop_Interface
-todo
+    todo
 \var starpu_top_param_type::STARPU_TOP_PARAM_ENUM
-\ingroup API_StarPUTop_Interface
-todo
+    todo
 
 \enum starpu_top_message_type
 \ingroup API_StarPUTop_Interface
 StarPU-Top Message type
 \var starpu_top_message_type::TOP_TYPE_GO
-\ingroup API_StarPUTop_Interface
-todo
+    todo
 \var starpu_top_message_type::TOP_TYPE_SET
-\ingroup API_StarPUTop_Interface
-todo
+    todo
 \var starpu_top_message_type::TOP_TYPE_CONTINUE
-\ingroup API_StarPUTop_Interface
-todo
+    todo
 \var starpu_top_message_type::TOP_TYPE_ENABLE
-\ingroup API_StarPUTop_Interface
-todo
+    todo
 \var starpu_top_message_type::TOP_TYPE_DISABLE
-\ingroup API_StarPUTop_Interface
-todo
+    todo
 \var starpu_top_message_type::TOP_TYPE_DEBUG
-\ingroup API_StarPUTop_Interface
-todo
+    todo
 \var starpu_top_message_type::TOP_TYPE_UNKNOW
-\ingroup API_StarPUTop_Interface
-todo
+    todo
 
 \struct starpu_top_data
 todo
 \ingroup API_StarPUTop_Interface
 \var unsigned int starpu_top_data::id
-todo
+    todo
 \var const char *starpu_top_data::name
-todo
+    todo
 \var int starpu_top_data::int_min_value
-todo
+    todo
 \var int starpu_top_data::int_max_value
-todo
+    todo
 \var double starpu_top_data::double_min_value
-todo
+    todo
 \var double starpu_top_data::double_max_value
-todo
+    todo
 \var int starpu_top_data::active
-todo
+    todo
 \var enum starpu_top_data_type starpu_top_data::type
-todo
+    todo
 \var struct starpu_top_data *starpu_top_data::next
-todo
+    todo
 
 \struct starpu_top_param
 todo
 \ingroup API_StarPUTop_Interface
 \var unsigned int starpu_top_param::id
-todo
+    todo
 \var const char *starpu_top_param::name
-todo
+    todo
 \var enum starpu_top_param_type starpu_top_param::type
-todo
+    todo
 \var void *starpu_top_param::value
-todo
+    todo
 \var char **starpu_top_param::enum_values
-only for enum type can be <c>NULL</c>
+    only for enum type, can be <c>NULL</c>
 \var int starpu_top_param::nb_values
-todo
+    todo
 \var void (*starpu_top_param::callback)(struct starpu_top_param*)
-todo
+    todo
 \var int starpu_top_param::int_min_value
-only for integer type
+    only for integer type
 \var int starpu_top_param::int_max_value
-todo
+    todo
 \var double starpu_top_param::double_min_value
-only for double type
+    only for double type
 \var double starpu_top_param::double_max_value
-todo
+    todo
 \var struct starpu_top_param *starpu_top_param::next
-todo
+    todo
 
 @name Functions to call before the initialisation
 \ingroup API_StarPUTop_Interface
 
 \fn struct starpu_top_data *starpu_top_add_data_boolean(const char *data_name, int active)
 \ingroup API_StarPUTop_Interface
-This function registers a data named \p data_name of type boolean.
-If \p active is 0, the value will NOT be displayed to user by default.
-Any other value will make the value displayed by default.
+Register a data named \p data_name of type boolean.
+If \p active is 0, the value will NOT be displayed to users.
+Any other value will make the value displayed.
 
 \fn struct starpu_top_data *starpu_top_add_data_integer(const char *data_name, int minimum_value, int maximum_value, int active)
 \ingroup API_StarPUTop_Interface
-This function registers a data named \p data_name of type integer. The
-minimum and maximum value will be used to define the scale in the UI.
-If \p active is 0, the value will NOT be displayed to user by default.
-Any other value will make the value displayed by default.
+Register a data named \p data_name of type integer. \p minimum_value
+and \p maximum_value will be used to define the scale in the UI.
+If \p active is 0, the value will NOT be displayed to users.
+Any other value will make the value displayed.
 
 \fn struct starpu_top_data *starpu_top_add_data_float(const char *data_name, double minimum_value, double maximum_value, int active)
 \ingroup API_StarPUTop_Interface
-This function registers a data named \p data_name of type float. The
-minimum and maximum value will be used to define the scale in the UI.
-If \p active is 0, the value will NOT be displayed to user by default.
-Any other value will make the value displayed by default.
+Register a data named \p data_name of type float. \p minimum_value and
+\p maximum_value will be used to define the scale in the UI.
+If \p active is 0, the value will NOT be displayed to users.
+Any other value will make the value displayed.
 
 \fn struct starpu_top_param *starpu_top_register_parameter_boolean(const char *param_name, int *parameter_field, void (*callback)(struct starpu_top_param*))
 \ingroup API_StarPUTop_Interface
-This function registers a parameter named \p parameter_name, of type
-boolean. The \p callback function will be called when the parameter is
-modified by the UI, and can be <c>NULL</c>.
+Register a parameter named \p parameter_name, of type
+boolean. If not \c NULL, the \p callback function will be called when
+the parameter is modified by the UI.
 
 \fn struct starpu_top_param *starpu_top_register_parameter_float(const char *param_name, double *parameter_field, double minimum_value, double maximum_value, void (*callback)(struct starpu_top_param*))
 \ingroup API_StarPUTop_Interface
-This function registers a parameter named \p param_name, of type
-integer. Minimum and maximum value will be used to prevent users from setting
-incorrect value. The \p callback function will be called when the
-parameter is modified by the UI, and can be <c>NULL</c>.
+Register a parameter named \p param_name, of type
+float. \p minimum_value and \p maximum_value will be used to prevent
+users from setting incorrect value. If not \c NULL, the \p callback
+function will be called when the parameter is modified by the UI.
 
 \fn struct starpu_top_param *starpu_top_register_parameter_integer(const char *param_name, int *parameter_field, int minimum_value, int maximum_value, void (*callback)(struct starpu_top_param*))
 \ingroup API_StarPUTop_Interface
-This function registers a parameter named \p param_name, of type float.
-Minimum and maximum value will be used to prevent users from setting
-incorrect value. The \p callback function will be called when the
-parameter is modified by UI, and can be <c>NULL</c>.
+Register a parameter named \p param_name, of type integer.
+\p minimum_value and \p maximum_value will be used to prevent users
+from setting incorrect value. If not \c NULL, the \p callback function
+will be called when the parameter is modified by the UI.
 
 \fn struct starpu_top_param *starpu_top_register_parameter_enum(const char *param_name, int *parameter_field, char **values, int nb_values, void (*callback)(struct starpu_top_param*))
 \ingroup API_StarPUTop_Interface
-This function registers a parameter named \p param_name, of type enum.
-Minimum and maximum value will be used to prevent users from setting
-incorrect value. The \p callback function will be called when the
-parameter is modified by the UI, and can be <c>NULL</c>.
+Register a parameter named \p param_name, of type enum.
+\p values and \p nb_values will be used to prevent users from setting
+incorrect value. If not \c NULL, the \p callback function will be
+called when the parameter is modified by the UI.
 
 @name Initialisation
 \ingroup API_StarPUTop_Interface
 
 \fn void starpu_top_init_and_wait(const char *server_name)
 \ingroup API_StarPUTop_Interface
-This function must be called when all parameters and data have been
-registered AND initialised (for parameters). This function will wait
+Must be called when all parameters and data have been
+registered AND initialised (for parameters). It will wait
 for a TOP to connect, send initialisation sentences, and wait for the
 GO message.
 
@@ -177,36 +163,35 @@ GO message.
 
 \fn void starpu_top_update_parameter(const struct starpu_top_param *param)
 \ingroup API_StarPUTop_Interface
-This function should be called after every modification of a parameter
-from something other than starpu_top. This function notices the UI that the
-configuration changed.
+Should be called after every modification of a parameter
+from something other than starpu_top. It notifies the UI that the
+configuration has changed.
 
 \fn void starpu_top_update_data_boolean(const struct starpu_top_data *data, int value)
 \ingroup API_StarPUTop_Interface
-This function updates the value of the starpu_top_data in the UI.
+Update the boolean value of \p data to \p value in the UI.
 
 \fn void starpu_top_update_data_integer(const struct starpu_top_data *data, int value)
 \ingroup API_StarPUTop_Interface
-This function updates the value of the starpu_top_data in the UI.
+Update the integer value of \p data to \p value in the UI.
 
 \fn void starpu_top_update_data_float(const struct starpu_top_data *data, double value)
 \ingroup API_StarPUTop_Interface
-This function updates the value of the starpu_top_data in the UI.
+Update the float value of \p data to \p value in the UI.
 
 \fn void starpu_top_task_prevision(struct starpu_task *task, int devid, unsigned long long start, unsigned long long end)
 \ingroup API_StarPUTop_Interface
-This function notifies the UI that \p task is planned to run from \p start to \p end, on computation-core.
+Notify the UI that \p task is planned to run from \p start to \p end, on computation-core.
 
 \fn void starpu_top_debug_log(const char *message)
 \ingroup API_StarPUTop_Interface
-When running in debug mode, the function sends \p message to be displayed by the UI.
+When running in debug mode, display \p message in the UI.
 
 \fn void starpu_top_debug_lock(const char *message)
 \ingroup API_StarPUTop_Interface
-When running in debug mode, the functions sends a message and waits for a continue message
-from the UI to return. The lock (which creates a stop-point) should be
+When running in debug mode, send \p message to the UI and wait for a
+continue message to return. The lock (which creates a stop-point) should be
 called only by the main thread. Calling it from more than one thread
 is not supported.
 
 */
-

+ 7 - 7
doc/doxygen/chapters/api/tree.doxy

@@ -1,6 +1,6 @@
 /*
  * This file is part of the StarPU Handbook.
- * Copyright (C) 2014, 2016  CNRS
+ * Copyright (C) 2014, 2016, 2017  CNRS
  * See the file version.doxy for copying conditions.
  */
 
@@ -11,17 +11,17 @@
 \struct starpu_tree
 \ingroup API_Tree
 \var struct starpu_tree **starpu_tree::nodes
-todo
+    todo
 \var struct starpu_tree *starpu_tree::father
-todo
+    todo
 \var int starpu_tree::arity
-todo
+    todo
 \var int starpu_tree::id
-todo
+    todo
 \var int starpu_tree::level
-todo
+    todo
 \var int starpu_tree::is_pu
-todo
+    todo
 
 \fn void starpu_tree_reset_visited(struct starpu_tree *tree, char *visited)
 \ingroup API_Tree

+ 77 - 99
doc/doxygen/chapters/api/workers.doxy

@@ -25,62 +25,44 @@ considerably reduce memory used by StarPU data structures.
 
 \enum starpu_node_kind
 \ingroup API_Workers_Properties
-TODO
+    TODO
 \var starpu_node_kind::STARPU_UNUSED
-\ingroup API_Workers_Properties
-TODO
-\ingroup API_Workers_Properties
+    TODO
 \var starpu_node_kind::STARPU_CPU_RAM
-\ingroup API_Workers_Properties
-TODO
+    TODO
 \var starpu_node_kind::STARPU_CUDA_RAM
-\ingroup API_Workers_Properties
-TODO
+    TODO
 \var starpu_node_kind::STARPU_OPENCL_RAM
-\ingroup API_Workers_Properties
-TODO
+    TODO
 \var starpu_node_kind::STARPU_DISK_RAM
-\ingroup API_Workers_Properties
-TODO
+    TODO
 \var starpu_node_kind::STARPU_MIC_RAM
-\ingroup API_Workers_Properties
-TODO
+    TODO
 \var starpu_node_kind::STARPU_SCC_RAM
-\ingroup API_Workers_Properties
-This node kind is not used anymore, but implementations in interfaces
-will be useful for MPI.
+    This node kind is not used anymore, but implementations in
+    interfaces will be useful for MPI.
 \var starpu_node_kind::STARPU_SCC_SHM
-\ingroup API_Workers_Properties
-TODO
+    TODO
 \var starpu_node_kind::STARPU_MPI_MS_RAM
-\ingroup API_Workers_Properties
-TODO
+    TODO
 
 \enum starpu_worker_archtype
 \ingroup API_Workers_Properties
 Worker Architecture Type
 \var starpu_worker_archtype::STARPU_ANY_WORKER
-\ingroup API_Workers_Properties
-any worker, used in the hypervisor
+    any worker, used in the hypervisor
 \var starpu_worker_archtype::STARPU_CPU_WORKER
-\ingroup API_Workers_Properties
-CPU core
+    CPU core
 \var starpu_worker_archtype::STARPU_CUDA_WORKER
-\ingroup API_Workers_Properties
-NVIDIA CUDA device
+    NVIDIA CUDA device
 \var starpu_worker_archtype::STARPU_OPENCL_WORKER
-\ingroup API_Workers_Properties
-OpenCL device
+    OpenCL device
 \var starpu_worker_archtype::STARPU_MIC_WORKER
-\ingroup API_Workers_Properties
-Intel MIC device
+    Intel MIC device
 \var starpu_worker_archtype::STARPU_SCC_WORKER
-\ingroup API_Workers_Properties
-Intel SCC device
+    Intel SCC device
 \var starpu_worker_archtype::STARPU_MPI_MS_WORKER
-\ingroup API_Workers_Properties
-MPI Slave device
-
+    MPI Slave device
 
 \struct starpu_worker_collection
 \ingroup API_Workers_Properties
@@ -112,13 +94,13 @@ structures(like tree) implementations are foreseen.
 \var enum starpu_worker_collection_type starpu_worker_collection::type
         The type of structure
 \var unsigned (*starpu_worker_collection::has_next)(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
-        Checks if there is another element in collection
+        Check if there is another element in collection
 \var int (*starpu_worker_collection::get_next)(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
-        return the next element in the collection
+        Return the next element in the collection
 \var int (*starpu_worker_collection::add)(struct starpu_worker_collection *workers, int worker)
-        add a new element in the collection
+        Add a new element in the collection
 \var int (*starpu_worker_collection::remove)(struct starpu_worker_collection *workers, int worker)
-        remove an element from the collection
+        Remove an element from the collection
 \var void (*starpu_worker_collection::init)(struct starpu_worker_collection *workers)
         Initialize the collection
 \var void (*starpu_worker_collection::deinit)(struct starpu_worker_collection *workers)
@@ -132,84 +114,74 @@ structures(like tree) implementations are foreseen.
 \ingroup API_Workers_Properties
 Types of structures the worker collection can implement
 \var starpu_worker_collection_type::STARPU_WORKER_LIST
-\ingroup API_Workers_Properties
-The collection is an array
+    The collection is an array
 \var starpu_worker_collection_type::STARPU_WORKER_TREE
-\ingroup API_Workers_Properties
-The collection is a tree
+    The collection is a tree
 
 \struct starpu_sched_ctx_iterator
 \ingroup API_Workers_Properties
 Structure needed to iterate on the collection
 \var int starpu_sched_ctx_iterator::cursor
-The index of the current worker in the collection, needed when iterating on
-the collection.
-
+    The index of the current worker in the collection, needed when
+    iterating on the collection.
 
 \fn unsigned starpu_worker_get_count(void)
 \ingroup API_Workers_Properties
-This function returns the number of workers (i.e. processing
-units executing StarPU tasks). The returned value should be at most
-\ref STARPU_NMAXWORKERS.
+Return the number of workers (i.e. processing units executing StarPU
+tasks). The returned value should be at most \ref STARPU_NMAXWORKERS.
 
 \fn int starpu_worker_get_count_by_type(enum starpu_worker_archtype type)
 \ingroup API_Workers_Properties
-Returns the number of workers of the given type. A positive (or
-<c>NULL</c>) value is returned in case of success, <c>-EINVAL</c> indicates that the
-type is not valid otherwise.
+Return the number of workers of \p type. A positive (or
+<c>NULL</c>) value is returned in case of success, <c>-EINVAL</c>
+indicates that \p type is not valid otherwise.
 
 \fn unsigned starpu_cpu_worker_get_count(void)
 \ingroup API_Workers_Properties
-This function returns the number of CPUs controlled by StarPU. The
-returned value should be at most \ref STARPU_MAXCPUS.
+Return the number of CPUs controlled by StarPU. The returned value should be at most \ref STARPU_MAXCPUS.
 
 \fn unsigned starpu_cuda_worker_get_count(void)
 \ingroup API_Workers_Properties
-This function returns the number of CUDA devices controlled by
-StarPU. The returned value should be at most \ref STARPU_MAXCUDADEVS.
+Return the number of CUDA devices controlled by StarPU. The returned value should be at most \ref STARPU_MAXCUDADEVS.
 
 \fn unsigned starpu_mic_worker_get_count(void)
 \ingroup API_Workers_Properties
-This function returns the number of MIC workers controlled by StarPU.
+Return the number of MIC workers controlled by StarPU.
 
 \fn unsigned starpu_mic_device_get_count(void)
 \ingroup API_Workers_Properties
-This function returns the number of MIC devices controlled by StarPU.
-The returned value should be at most \ref STARPU_MAXMICDEVS.
+Return the number of MIC devices controlled by StarPU. The returned value should be at most \ref STARPU_MAXMICDEVS.
 
 \fn unsigned starpu_mpi_ms_worker_get_count(void)
 \ingroup API_Workers_Properties
-This function returns the number of MPI Master Slave workers controlled by StarPU.
+Return the number of MPI Master Slave workers controlled by StarPU.
 
 \fn unsigned starpu_scc_worker_get_count(void)
 \ingroup API_Workers_Properties
-This function returns the number of SCC devices controlled by StarPU.
-The returned value should be at most \ref STARPU_MAXSCCDEVS.
+Return the number of SCC devices controlled by StarPU. The returned value should be at most \ref STARPU_MAXSCCDEVS.
 
 \fn unsigned starpu_opencl_worker_get_count(void)
 \ingroup API_Workers_Properties
-This function returns the number of OpenCL devices controlled by
-StarPU. The returned value should be at most \ref STARPU_MAXOPENCLDEVS.
+Return the number of OpenCL devices controlled by StarPU. The returned value should be at most \ref STARPU_MAXOPENCLDEVS.
 
 \fn int starpu_worker_get_id(void)
 \ingroup API_Workers_Properties
-This function returns the identifier of the current worker, i.e
-the one associated to the calling thread. The returned value is either
--1 if the current context is not a StarPU worker (i.e. when called
-from the application outside a task or a callback), or an integer
-between 0 and starpu_worker_get_count() - 1.
+Return the identifier of the current worker, i.e the one associated to
+the calling thread. The returned value is either -1 if the current
+context is not a StarPU worker (i.e. when called from the application
+outside a task or a callback), or an integer between 0 and
+starpu_worker_get_count() - 1.
 
 \fn unsigned starpu_worker_get_id_check(void)
 \ingroup API_Workers_Properties
-This is the same as starpu_worker_get_id(), but aborts when called from outside a
-worker (i.e. when starpu_worker_get_id() would return -1).
+Similar to starpu_worker_get_id(), but abort when called from outside
+a worker (i.e. when starpu_worker_get_id() would return -1).
 
 \fn unsigned starpu_worker_get_ids_by_type(enum starpu_worker_archtype type, int *workerids, unsigned maxsize)
 \ingroup API_Workers_Properties
-This function gets the list of identifiers of workers with the
-given type. It fills the array \p workerids with the identifiers of the
-workers that have the type indicated in the first argument. The
-argument \p maxsize indicates the size of the array \p workerids. The returned
+Get the list of identifiers of workers of \p type. Fill the array
+\p workerids with the identifiers of the workers of that type. The
+argument \p maxsize indicates the size of the array \p workerids. The
+returned
 value gives the number of identifiers that were put in the array.
 <c>-ERANGE</c> is returned if \p maxsize is lower than the number of workers
 with the appropriate type: in that case, the array is filled with the
@@ -220,18 +192,19 @@ equal to \ref STARPU_NMAXWORKERS.
 
 \fn int starpu_worker_get_by_type(enum starpu_worker_archtype type, int num)
 \ingroup API_Workers_Properties
-This returns the identifier of the \p num -th worker that has the
-specified type type. If there are no such worker, -1 is returned.
+Return the identifier of the \p num -th worker that has the
+specified \p type. If there is no such worker, -1 is returned.
 
 \fn int starpu_worker_get_by_devid(enum starpu_worker_archtype type, int devid)
 \ingroup API_Workers_Properties
-This returns the identifier of the worker that has the specified type
-\p type and device id \p devid (which may not be the n-th, if some
-devices are skipped for instance). If there are no such worker, -1 is returned.
+Return the identifier of the worker that has the specified \p type
+and device id \p devid (which may not be the n-th, if some
+devices are skipped for instance). If there is no such worker, -1 is
+returned.
 
 \fn int starpu_worker_get_devid(int id)
 \ingroup API_Workers_Properties
-This function returns the device id of the given worker. The
+Return the device id of the worker \p id. The
 worker should be identified with the value returned by the
 starpu_worker_get_id() function. In the case of a CUDA worker, this
 device identifier is the logical device identifier exposed by CUDA
@@ -242,32 +215,37 @@ OS or by the library <c>hwloc</c> in case it is available.
 
 \fn enum starpu_worker_archtype starpu_worker_get_type(int id)
 \ingroup API_Workers_Properties
-This function returns the type of processing unit associated to
-a worker. The worker identifier is a value returned by the function
-starpu_worker_get_id()). The returned value indicates the
-architecture of the worker: ::STARPU_CPU_WORKER for a CPU core,
-::STARPU_CUDA_WORKER for a CUDA device, and ::STARPU_OPENCL_WORKER for a
-OpenCL device. The value returned for an invalid identifier is
+Return the type of processing unit associated to the worker \p id. The
+worker identifier is a value returned by the function
+starpu_worker_get_id()). The returned value indicates the architecture
+of the worker: ::STARPU_CPU_WORKER for a CPU core,
+::STARPU_CUDA_WORKER for a CUDA device, and ::STARPU_OPENCL_WORKER for
+a OpenCL device. The value returned for an invalid identifier is
 unspecified.
 
 \fn void starpu_worker_get_name(int id, char *dst, size_t maxlen)
 \ingroup API_Workers_Properties
-This function allows to get the name of a given worker. StarPU
-associates a unique human readable string to each processing unit.
-This function copies at most the \p maxlen first bytes of the unique
-string associated to a worker identified by its identifier \p id into the
-\p dst buffer. The caller is responsible for ensuring that \p dst is a
-valid pointer to a buffer of \p maxlen bytes at least. Calling this
-function on an invalid identifier results in an unspecified behaviour.
+Allow to get the name of the worker \p id. StarPU associates a unique
+human readable string to each processing unit. This function copies at
+most the \p maxlen first bytes of the unique string associated to the
+worker \p id into the \p dst buffer. The caller is responsible for
+ensuring that \p dst is a valid pointer to a buffer of \p maxlen bytes
+at least. Calling this function on an invalid identifier results in an
+unspecified behaviour.
+
+\fn void starpu_worker_display_names(FILE *output, enum starpu_worker_archtype type)
+\ingroup API_Workers_Properties
+Display on \p output the list (if any) of all the workers of the given
+\p type.
 
 \fn unsigned starpu_worker_get_memory_node(unsigned workerid)
 \ingroup API_Workers_Properties
-This function returns the identifier of the memory node
-associated to the worker identified by \p workerid.
+Return the identifier of the memory node associated to the worker
+identified by \p workerid.
 
 \fn enum starpu_node_kind starpu_node_get_kind(unsigned node)
 \ingroup API_Workers_Properties
-Returns the type of the given node as defined by
+Return the type of \p node as defined by
 ::starpu_node_kind. For example, when defining a new data interface,
 this function should be used in the allocation function to determine
 on which device the memory needs to be allocated.
@@ -284,6 +262,6 @@ whose StarPU identifier is \p node.
 
 \fn char *starpu_worker_get_type_as_string(enum starpu_worker_archtype type)
 \ingroup API_Workers_Properties
-Returns the given worker type as a string.
+Return worker \p type as a string.
 
 */

+ 1 - 0
examples/Makefile.am

@@ -207,6 +207,7 @@ STARPU_EXAMPLES +=				\
 if !STARPU_SIMGRID
 STARPU_EXAMPLES +=				\
 	basic_examples/hello_world		\
+	basic_examples/topology			\
 	basic_examples/vector_scal		\
 	basic_examples/mult			\
 	basic_examples/block			\

+ 5 - 1
examples/audio/starpu_audio_processing.c

@@ -32,6 +32,7 @@
 #include <fftw3.h>
 #ifdef STARPU_USE_CUDA
 #include <cufft.h>
+#include <starpu_cublas_v2.h>
 #endif
 
 /* #define SAVE_RAW	1 */
@@ -215,7 +216,10 @@ static void band_filter_kernel_gpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *
 	STARPU_ASSERT(cures == CUFFT_SUCCESS);
 
 	/* FFTW does not normalize its output ! */
-	cublasSscal (nsamples, 1.0f/nsamples, localA, 1);
+	float scal = 1.0f/nsamples;
+	cublasStatus_t status = cublasSscal (starpu_cublas_local_handle(), nsamples, &scal, localA, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif
 

+ 4 - 2
examples/axpy/axpy.c

@@ -30,7 +30,7 @@
 #include <common/blas.h>
 
 #ifdef STARPU_USE_CUDA
-#include <cublas.h>
+#include <starpu_cublas_v2.h>
 #endif
 
 #include "axpy.h"
@@ -74,7 +74,9 @@ void axpy_gpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
 	TYPE *block_x = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *block_y = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
 
-	CUBLASAXPY((int)n, alpha, block_x, 1, block_y, 1);
+	cublasStatus_t status = CUBLASAXPY(starpu_cublas_get_local_handle(), (int)n, &alpha, block_x, 1, block_y, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif
 

+ 34 - 0
examples/basic_examples/topology.c

@@ -0,0 +1,34 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2017  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <starpu.h>
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+int main(int argc, char **argv)
+{
+	int ret = starpu_init(NULL);
+	if (ret == -ENODEV) return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_worker_display_names(stdout, STARPU_CPU_WORKER);
+	starpu_topology_print(stdout);
+
+	starpu_shutdown();
+	return 0;
+}

+ 0 - 1
examples/cg/cg.c

@@ -21,7 +21,6 @@
 
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
-#include <cublas.h>
 #endif
 
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)

+ 51 - 12
examples/cg/cg_kernels.c

@@ -22,6 +22,12 @@
 #include <math.h>
 #include <limits.h>
 
+#ifdef STARPU_USE_CUDA
+#include <starpu_cublas_v2.h>
+static const TYPE p1 = 1.0;
+static const TYPE m1 = -1.0;
+#endif
+
 #if 0
 static void print_vector_from_descr(unsigned nx, TYPE *v)
 {
@@ -81,7 +87,9 @@ static void accumulate_variable_cuda(void *descr[], void *cl_arg)
 	TYPE *v_dst = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	TYPE *v_src = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);
  
-	cublasaxpy(1, (TYPE)1.0, v_src, 1, v_dst, 1);
+	cublasStatus_t status = cublasaxpy(starpu_cublas_get_local_handle(), 1, &p1, v_src, 1, v_dst, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif
 
@@ -119,8 +127,10 @@ static void accumulate_vector_cuda(void *descr[], void *cl_arg)
 	TYPE *v_dst = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *v_src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
- 
-	cublasaxpy(n, (TYPE)1.0, v_src, 1, v_dst, 1);
+
+	cublasStatus_t status = cublasaxpy(starpu_cublas_get_local_handle(), n, &p1, v_src, 1, v_dst, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif
 
@@ -247,10 +257,26 @@ static void dot_kernel_cuda(void *descr[], void *cl_arg)
 
 	unsigned n = STARPU_VECTOR_GET_NX(descr[1]);
 
-	/* Contrary to cublasSdot, this function puts its result directly in
-	 * device memory, so that we don't have to transfer that value back and
-	 * forth. */
-	dot_host(v1, v2, n, dot);
+	int version;
+	cublasGetVersion(starpu_cublas_get_local_handle(), &version);
+
+	/* FIXME: check in Nvidia bug #1882017 when this gets fixed */
+	if (version < 99999)
+	{
+		/* This function puts its result directly in device memory, so
+		 * that we don't have to transfer that value back and forth. */
+		dot_host(v1, v2, n, dot);
+	}
+	else
+	{
+		/* Should be able to put result in GPU, but does not yet, see
+		 * Nvidia bug #1882017 */
+		cublasStatus_t status = cublasdot(starpu_cublas_get_local_handle(),
+			n, v1, 1, v2, 1, dot);
+		if (status != CUBLAS_STATUS_SUCCESS)
+			STARPU_CUBLAS_REPORT_ERROR(status);
+		cudaStreamSynchronize(starpu_cuda_get_local_stream());
+	}
 }
 #endif
 
@@ -335,7 +361,9 @@ static void scal_kernel_cuda(void *descr[], void *cl_arg)
  
 	/* v1 = p1 v1 */
 	TYPE alpha = p1;
-	cublasscal(n, alpha, v1, 1);
+	cublasStatus_t status = cublasscal(starpu_cublas_get_local_handle(), n, &alpha, v1, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif
 
@@ -389,7 +417,10 @@ static void gemv_kernel_cuda(void *descr[], void *cl_arg)
 	starpu_codelet_unpack_args(cl_arg, &beta, &alpha);
 
 	/* Compute v1 = alpha M v2 + beta v1 */
-	cublasgemv('N', nx, ny, alpha, M, ld, v2, 1, beta, v1, 1);
+	cublasStatus_t status = cublasgemv(starpu_cublas_get_local_handle(),
+			CUBLAS_OP_N, nx, ny, &alpha, M, ld, v2, 1, &beta, v1, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif
 
@@ -504,8 +535,13 @@ static void scal_axpy_kernel_cuda(void *descr[], void *cl_arg)
 	 *	v1 = p1 v1
 	 *	v1 = v1 + p2 v2
 	 */
-	cublasscal(n, p1, v1, 1);
-	cublasaxpy(n, p2, v2, 1, v1, 1);
+	cublasStatus_t status;
+	status = cublasscal(starpu_cublas_get_local_handle(), n, &p1, v1, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
+	status = cublasaxpy(starpu_cublas_get_local_handle(), n, &p2, v2, 1, v1, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif
 
@@ -584,7 +620,10 @@ static void axpy_kernel_cuda(void *descr[], void *cl_arg)
  
 	/* Compute v1 = v1 + p1 * v2.
 	 */
-	cublasaxpy(n, p1, v2, 1, v1, 1);
+	cublasStatus_t status = cublasaxpy(starpu_cublas_get_local_handle(),
+			n, &p1, v2, 1, v1, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif
 

+ 0 - 1
examples/cholesky/cholesky.h

@@ -24,7 +24,6 @@
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>
-#include <cublas.h>
 #endif
 
 #include <common/blas.h>

+ 9 - 7
examples/cholesky/cholesky_implicit.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2016  Université de Bordeaux
+ * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013, 2016, 2017  CNRS
  *
@@ -47,7 +47,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 	unsigned long n = starpu_matrix_get_nx(dataA);
 	unsigned long nn = n/nblocks;
 
-	int prio_level = noprio_p?STARPU_DEFAULT_PRIO:STARPU_MAX_PRIO;
+	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
 
 	if (bound_p || bound_lp_p || bound_mps_p)
 		starpu_bound_start(bound_deps_p, 0);
@@ -62,7 +62,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
                 starpu_data_handle_t sdatakk = starpu_data_get_sub_data(dataA, 2, k, k);
 
                 ret = starpu_task_insert(&cl11,
-					 STARPU_PRIORITY, prio_level,
+					 STARPU_PRIORITY, noprio_p ? STARPU_DEFAULT_PRIO : unbound_prio ? 2*nblocks - 2*k : STARPU_MAX_PRIO,
 					 STARPU_RW, sdatakk,
 					 STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
 					 STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
@@ -76,7 +76,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
                         starpu_data_handle_t sdatakj = starpu_data_get_sub_data(dataA, 2, k, j);
 
                         ret = starpu_task_insert(&cl21,
-						 STARPU_PRIORITY, (j == k+1)?prio_level:STARPU_DEFAULT_PRIO,
+						 STARPU_PRIORITY, noprio_p ? STARPU_DEFAULT_PRIO : unbound_prio ? 2*nblocks - 2*k - j : (j == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
 						 STARPU_R, sdatakk,
 						 STARPU_RW, sdatakj,
 						 STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
@@ -98,7 +98,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 					starpu_data_handle_t sdataij = starpu_data_get_sub_data(dataA, 2, i, j);
 
 					ret = starpu_task_insert(&cl22,
-								 STARPU_PRIORITY, ((i == k+1) && (j == k+1))?prio_level:STARPU_DEFAULT_PRIO,
+								 STARPU_PRIORITY, noprio_p ? STARPU_DEFAULT_PRIO : unbound_prio ? 2*nblocks - 2*k - j - i : ((i == k+1) && (j == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
 								 STARPU_R, sdataki,
 								 STARPU_R, sdatakj,
 								 cl22.modes[2], sdataij,
@@ -139,11 +139,13 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 		{
 			FILE *f = fopen("cholesky.lp", "w");
 			starpu_bound_print_lp(f);
+			fclose(f);
 		}
 		if (bound_mps_p)
 		{
 			FILE *f = fopen("cholesky.mps", "w");
 			starpu_bound_print_mps(f);
+			fclose(f);
 		}
 		if (bound_p)
 		{
@@ -291,10 +293,10 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 				if (i <= j)
 				{
 	                                float orig = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
-	                                float err = fabsf(test_mat[j +i*size] - orig);
+	                                float err = fabsf(test_mat[j +i*size] - orig) / orig;
 	                                if (err > 0.00001)
 					{
-	                                        FPRINTF(stderr, "Error[%u, %u] --> %2.2f != %2.2f (err %2.2f)\n", i, j, test_mat[j +i*size], orig, err);
+	                                        FPRINTF(stderr, "Error[%u, %u] --> %2.6f != %2.6f (err %2.6f)\n", i, j, test_mat[j +i*size], orig, err);
 	                                        assert(0);
 	                                }
 	                        }

+ 44 - 20
examples/cholesky/cholesky_kernels.c

@@ -25,15 +25,24 @@
 #include <starpu.h>
 #include "cholesky.h"
 #include "../common/blas.h"
-#if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_MAGMA)
+#if defined(STARPU_USE_CUDA)
+#include <cublas.h>
+#include <starpu_cublas_v2.h>
+#if defined(STARPU_HAVE_MAGMA)
 #include "magma.h"
 #include "magma_lapack.h"
 #endif
+#endif
 
 /*
  *   U22 
  */
 
+#if defined(STARPU_USE_CUDA)
+static const float p1 =  1.0;
+static const float m1 = -1.0;
+#endif
+
 static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args)
 {
 	/* printf("22\n"); */
@@ -78,12 +87,12 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STAR
 	{
 		/* CUDA kernel */
 #ifdef STARPU_USE_CUDA
-#ifdef STARPU_HAVE_MAGMA
-		cublasSetKernelStream(starpu_cuda_get_local_stream());
-#endif
-		cublasSgemm('n', 't', dy, dx, dz, 
-				-1.0f, left, ld21, right, ld12, 
-				 1.0f, center, ld22);
+		cublasStatus_t status = cublasSgemm(starpu_cublas_get_local_handle(),
+				CUBLAS_OP_N, CUBLAS_OP_T, dy, dx, dz, 
+				&m1, left, ld21, right, ld12, 
+				&p1, center, ld22);
+		if (status != CUBLAS_STATUS_SUCCESS)
+			STARPU_CUBLAS_REPORT_ERROR(status);
 #endif
 
 	}
@@ -120,6 +129,10 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, STARPU_A
 	unsigned nx21 = STARPU_MATRIX_GET_NY(descr[1]);
 	unsigned ny21 = STARPU_MATRIX_GET_NX(descr[1]);
 
+#ifdef STARPU_USE_CUDA
+	cublasStatus status;
+#endif
+
 	switch (s)
 	{
 		case 0:
@@ -127,10 +140,11 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, STARPU_A
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
-#ifdef STARPU_HAVE_MAGMA
-			cublasSetKernelStream(starpu_cuda_get_local_stream());
-#endif
-			cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
+			status = cublasStrsm(starpu_cublas_get_local_handle(),
+					CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT,
+					nx21, ny21, &p1, sub11, ld11, sub21, ld21);
+			if (status != CUBLAS_STATUS_SUCCESS)
+				STARPU_CUBLAS_REPORT_ERROR(status);
 			break;
 #endif
 		default:
@@ -202,9 +216,12 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_A
 			{
 			int ret;
 			int info;
+			cudaStream_t stream = starpu_cuda_get_local_stream();
 #if (MAGMA_VERSION_MAJOR > 1) || (MAGMA_VERSION_MAJOR == 1 && MAGMA_VERSION_MINOR >= 4)
-			cublasSetKernelStream(starpu_cuda_get_local_stream());
-			magmablasSetKernelStream(starpu_cuda_get_local_stream());
+			cublasSetKernelStream(stream);
+			magmablasSetKernelStream(stream);
+#else
+			starpu_cublas_set_stream();
 #endif
 			ret = magma_spotrf_gpu(MagmaLower, nx, sub11, ld, &info);
 			if (ret != MAGMA_SUCCESS)
@@ -213,7 +230,7 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_A
 				STARPU_ABORT();
 			}
 #if (MAGMA_VERSION_MAJOR > 1) || (MAGMA_VERSION_MAJOR == 1 && MAGMA_VERSION_MINOR >= 4)
-			cudaError_t cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			cudaError_t cures = cudaStreamSynchronize(stream);
 #else
 			cudaError_t cures = cudaThreadSynchronize();
 #endif
@@ -223,29 +240,36 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_A
 			{
 
 			float *lambda11;
+			cublasStatus_t status;
+			cudaStream_t stream = starpu_cuda_get_local_stream();
+			cublasHandle_t handle = starpu_cublas_get_local_handle();
 			cudaHostAlloc((void **)&lambda11, sizeof(float), 0);
 
 			for (z = 0; z < nx; z++)
 			{
 				
-				cudaMemcpyAsync(lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
-				cudaStreamSynchronize(starpu_cuda_get_local_stream());
+				cudaMemcpyAsync(lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, stream);
+				cudaStreamSynchronize(stream);
 
 				STARPU_ASSERT(*lambda11 != 0.0f);
 				
 				*lambda11 = sqrt(*lambda11);
 
 /*				cublasSetVector(1, sizeof(float), lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float)); */
-				cudaMemcpyAsync(&sub11[z+z*ld], lambda11, sizeof(float), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
+				cudaMemcpyAsync(&sub11[z+z*ld], lambda11, sizeof(float), cudaMemcpyHostToDevice, stream);
+				float scal = 1.0f/(*lambda11);
 
-				cublasSscal(nx - z - 1, 1.0f/(*lambda11), &sub11[(z+1)+z*ld], 1);
+				status = cublasSscal(handle,
+						nx - z - 1, &scal, &sub11[(z+1)+z*ld], 1);
 
-				cublasSsyr('U', nx - z - 1, -1.0f,
+				status = cublasSsyr(handle,
+							CUBLAS_FILL_MODE_UPPER,
+							nx - z - 1, &m1,
 							&sub11[(z+1)+z*ld], 1,
 							&sub11[(z+1)+(z+1)*ld], ld);
 			}
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			cudaStreamSynchronize(stream);
 			cudaFreeHost(lambda11);
 			}
 #endif

+ 3 - 3
examples/heat/dw_factolu.c

@@ -72,8 +72,8 @@ static struct starpu_codelet cl12 =
 	.cpu_funcs_name = {"dw_cpu_codelet_update_u12"},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u12},
-	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_RW},
 	.model = &model_12
@@ -85,8 +85,8 @@ static struct starpu_codelet cl21 =
 	.cpu_funcs_name = {"dw_cpu_codelet_update_u21"},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u21},
-	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_RW},
 	.model = &model_21
@@ -98,8 +98,8 @@ static struct starpu_codelet cl22 =
 	.cpu_funcs_name = {"dw_cpu_codelet_update_u22"},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u22},
-	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 3,
 	.modes = {STARPU_R, STARPU_R, STARPU_RW},
 	.model = &model_22

+ 0 - 1
examples/heat/dw_factolu.h

@@ -25,7 +25,6 @@
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>
-#include <cublas.h>
 #endif
 
 #include "../common/blas.h"

+ 3 - 0
examples/heat/dw_factolu_grain.c

@@ -99,6 +99,7 @@ static struct starpu_codelet cl12 =
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u12},
 #endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 2,
 	.model = &model_12
 };
@@ -144,6 +145,7 @@ static struct starpu_codelet cl21 =
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u21},
 #endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 2,
 	.model = &model_21
 };
@@ -186,6 +188,7 @@ static struct starpu_codelet cl22 =
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u22},
 #endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 3,
 	.model = &model_22
 };

+ 34 - 13
examples/heat/dw_factolu_kernels.c

@@ -20,6 +20,13 @@
  */
 #include "dw_factolu.h"
 
+#ifdef STARPU_USE_CUDA
+#include <cublas.h>
+#include <starpu_cublas_v2.h>
+static const float p1 =  1.0;
+static const float m1 = -1.0;
+#endif
+
 unsigned count_11_per_worker[STARPU_NMAXWORKERS] = {0};
 unsigned count_12_per_worker[STARPU_NMAXWORKERS] = {0};
 unsigned count_21_per_worker[STARPU_NMAXWORKERS] = {0};
@@ -134,9 +141,10 @@ static inline void dw_common_cpu_codelet_update_u22(void *descr[], int s, STARPU
 
 #ifdef STARPU_USE_CUDA
 		case 1:
-			cublasSgemm('n', 'n', dx, dy, dz, -1.0f, left, ld21,
-					right, ld12, 1.0f, center, ld22);
-			status = cublasGetError();
+			status = cublasSgemm(starpu_cublas_get_local_handle(),
+					CUBLAS_OP_N, CUBLAS_OP_N,
+					dx, dy, dz, &m1, left, ld21,
+					right, ld12, &p1, center, ld22);
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
@@ -197,9 +205,10 @@ static inline void dw_common_codelet_update_u12(void *descr[], int s, STARPU_ATT
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
-			cublasStrsm('L', 'L', 'N', 'N', ny12, nx12,
-					1.0f, sub11, ld11, sub12, ld12);
-			status = cublasGetError();
+			status = cublasStrsm(starpu_cublas_get_local_handle(),
+					CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
+					ny12, nx12,
+					&p1, sub11, ld11, sub12, ld12);
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
@@ -258,8 +267,9 @@ static inline void dw_common_codelet_update_u21(void *descr[], int s, STARPU_ATT
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
-			cublasStrsm('R', 'U', 'N', 'U', ny21, nx21, 1.0f, sub11, ld11, sub21, ld21);
-			status = cublasGetError();
+			status = cublasStrsm(starpu_cublas_get_local_handle(),
+					CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT,
+					ny21, nx21, &p1, sub11, ld11, sub21, ld21);
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
@@ -319,6 +329,12 @@ static inline void dw_common_codelet_update_u11(void *descr[], int s, STARPU_ATT
 
 	unsigned long z;
 
+#ifdef STARPU_USE_CUDA
+	cudaStream_t stream;
+	cublasHandle_t handle;
+	cublasStatus_t status;
+#endif
+
 	switch (s)
 	{
 		case 0:
@@ -338,23 +354,28 @@ static inline void dw_common_codelet_update_u11(void *descr[], int s, STARPU_ATT
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
+			stream = starpu_cuda_get_local_stream();
+			handle = starpu_cublas_get_local_handle();
 			for (z = 0; z < nx; z++)
 			{
 				float pivot;
-				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
-				cudaStreamSynchronize(starpu_cuda_get_local_stream());
+				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, stream);
+				cudaStreamSynchronize(stream);
 
 				STARPU_ASSERT(pivot != 0.0f);
+				float scal = 1.0f/pivot;
 
-				cublasSscal(nx - z - 1, 1.0f/pivot, &sub11[z+(z+1)*ld], ld);
+				status = cublasSscal(starpu_cublas_get_local_handle(),
+						nx - z - 1, &scal, &sub11[z+(z+1)*ld], ld);
 
-				cublasSger(nx - z - 1, nx - z - 1, -1.0f,
+				status = cublasSger(starpu_cublas_get_local_handle(),
+						nx - z - 1, nx - z - 1, &m1,
 								&sub11[z+(z+1)*ld], ld,
 								&sub11[(z+1)+z*ld], 1,
 								&sub11[(z+1) + (z+1)*ld],ld);
 			}
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			cudaStreamSynchronize(stream);
 
 			break;
 #endif

+ 3 - 0
examples/heat/dw_sparse_cg.c

@@ -241,6 +241,7 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 	struct starpu_task *task6 = create_task(maskiter | 6UL);
 #ifdef STARPU_USE_CUDA
 	task6->cl->cuda_funcs[0] = cublas_codelet_func_6;
+	task6->cl->cuda_flags[0] = STARPU_CUDA_ASYNC;
 #endif
 	task6->cl->cpu_funcs[0] = cpu_codelet_func_6;
 	task6->cl->cpu_funcs_name[0] = "cpu_codelet_func_6";
@@ -259,6 +260,7 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 	struct starpu_task *task7 = create_task(maskiter | 7UL);
 #ifdef STARPU_USE_CUDA
 	task7->cl->cuda_funcs[0] = cublas_codelet_func_7;
+	task7->cl->cuda_flags[0] = STARPU_CUDA_ASYNC;
 #endif
 	task7->cl->cpu_funcs[0] = cpu_codelet_func_7;
 	task7->cl->cpu_funcs_name[0] = "cpu_codelet_func_7";
@@ -292,6 +294,7 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 	struct starpu_task *task9 = create_task(maskiter | 9UL);
 #ifdef STARPU_USE_CUDA
 	task9->cl->cuda_funcs[0] = cublas_codelet_func_9;
+	task9->cl->cuda_flags[0] = STARPU_CUDA_ASYNC;
 #endif
 	task9->cl->cpu_funcs[0] = cpu_codelet_func_9;
 	task9->cl->cpu_funcs_name[0] = "cpu_codelet_func_9";

+ 0 - 4
examples/heat/dw_sparse_cg.h

@@ -29,10 +29,6 @@
 
 #include <starpu.h>
 
-#ifdef STARPU_USE_CUDA
-#include <cublas.h>
-#endif
-
 #include "../common/blas.h"
 
 #define MAXITER	100000

+ 30 - 7
examples/heat/dw_sparse_cg_kernels.c

@@ -17,6 +17,10 @@
 
 #include "dw_sparse_cg.h"
 
+#ifdef STARPU_USE_CUDA
+#include <starpu_cublas_v2.h>
+#endif
+
 /*
  *	Algorithm :
  *		
@@ -146,7 +150,10 @@ void cublas_codelet_func_3(void *descr[], void *arg)
 	vec = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
-	dot = cublasSdot (size, vec, 1, vec, 1);
+	cublasStatus_t status = cublasSdot (starpu_cublas_get_local_handle(), size, vec, 1, vec, 1, &dot);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 	pb->delta_new = dot;
 	pb->delta_0 = dot;
@@ -238,7 +245,10 @@ void cublas_codelet_func_5(void *descr[], void *arg)
 	STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1]));
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
-	dot = cublasSdot (size, vecd, 1, vecq, 1);
+	cublasStatus_t status = cublasSdot (starpu_cublas_get_local_handle(), size, vecd, 1, vecq, 1, &dot);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 	pb->alpha = pb->delta_new / dot;
 }
@@ -281,7 +291,9 @@ void cublas_codelet_func_6(void *descr[], void *arg)
 
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
-	cublasSaxpy (size, pb->alpha, vecd, 1, vecx, 1);
+	cublasStatus_t status = cublasSaxpy (starpu_cublas_get_local_handle(), size, &pb->alpha, vecd, 1, vecx, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif
 
@@ -320,7 +332,11 @@ void cublas_codelet_func_7(void *descr[], void *arg)
 
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
-	cublasSaxpy (size, -pb->alpha, vecq, 1, vecr, 1);
+	float scal = -pb->alpha;
+
+	cublasStatus_t status = cublasSaxpy (starpu_cublas_get_local_handle(), size, &scal, vecq, 1, vecr, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif
 
@@ -363,7 +379,8 @@ void cublas_codelet_func_8(void *descr[], void *arg)
 	vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
-	dot = cublasSdot (size, vecr, 1, vecr, 1);
+	cublasStatus_t status = cublasSdot (starpu_cublas_get_local_handle(), size, vecr, 1, vecr, 1, &dot);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 	pb->delta_old = pb->delta_new;
 	pb->delta_new = dot;
@@ -412,9 +429,15 @@ void cublas_codelet_func_9(void *descr[], void *arg)
 	size = STARPU_VECTOR_GET_NX(descr[0]);
 
 	/* d = beta d */
-	cublasSscal(size, pb->beta, vecd, 1);
+	cublasStatus_t status;
+	status = cublasSscal(starpu_cublas_get_local_handle(), size, &pb->beta, vecd, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 
 	/* d = r + d */
-	cublasSaxpy (size, 1.0f, vecr, 1, vecd, 1);
+	float scal = 1.0f;
+	status = cublasSaxpy (starpu_cublas_get_local_handle(), size, &scal, vecr, 1, vecd, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif

+ 0 - 3
examples/lu/xlu.h

@@ -20,9 +20,6 @@
 
 #include <starpu.h>
 #include <common/blas.h>
-#ifdef STARPU_USE_CUDA
-#include <cublas.h>
-#endif
 
 #define TAG11(k)	((starpu_tag_t)( (1ULL<<60) | (unsigned long long)(k)))
 #define TAG12(k,i)	((starpu_tag_t)(((2ULL<<60) | (((unsigned long long)(k))<<32)	\

+ 86 - 26
examples/lu/xlu_kernels.c

@@ -21,6 +21,11 @@
 #include <math.h>
 #include <complex.h>
 
+#ifdef STARPU_USE_CUDA
+#include <cublas.h>
+#include <starpu_cublas_v2.h>
+#endif
+
 #define str(s) #s
 #define xstr(s)        str(s)
 #define STARPU_LU_STR(name)  xstr(STARPU_LU(name))
@@ -65,11 +70,11 @@ static inline void STARPU_LU(common_u22)(void *descr[],
 #ifdef STARPU_USE_CUDA
 		case 1:
 		{
-			CUBLAS_GEMM('n', 'n', dx, dy, dz,
-				*(CUBLAS_TYPE*)&m1, (CUBLAS_TYPE *)right, ld21, (CUBLAS_TYPE *)left, ld12,
-				*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE *)center, ld22);
+			status = CUBLAS_GEMM(starpu_cublas_get_local_handle(),
+				CUBLAS_OP_N, CUBLAS_OP_N, dx, dy, dz,
+				(CUBLAS_TYPE *)&m1, (CUBLAS_TYPE *)right, ld21, (CUBLAS_TYPE *)left, ld12,
+				(CUBLAS_TYPE *)&p1, (CUBLAS_TYPE *)center, ld22);
 
-			status = cublasGetError();
 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
@@ -185,10 +190,11 @@ static inline void STARPU_LU(common_u12)(void *descr[],
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
-			CUBLAS_TRSM('L', 'L', 'N', 'N', ny12, nx12,
-					*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub12, ld12);
+			status = CUBLAS_TRSM(starpu_cublas_get_local_handle(),
+					CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
+					ny12, nx12,
+					(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub12, ld12);
 
-			status = cublasGetError();
 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
@@ -271,10 +277,11 @@ static inline void STARPU_LU(common_u21)(void *descr[],
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
-			CUBLAS_TRSM('R', 'U', 'N', 'U', ny21, nx21,
-					*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub21, ld21);
+			status = CUBLAS_TRSM(starpu_cublas_get_local_handle(),
+					CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT,
+					ny21, nx21,
+					(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub21, ld21);
 
-			status = cublasGetError();
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_CUBLAS_REPORT_ERROR(status);
 
@@ -342,6 +349,12 @@ static inline void STARPU_LU(common_u11)(void *descr[],
 
 	unsigned long z;
 
+#ifdef STARPU_USE_CUDA
+	cublasStatus status;
+	cublasHandle_t handle;
+	cudaStream_t stream;
+#endif
+
 	switch (s)
 	{
 		case 0:
@@ -366,12 +379,14 @@ static inline void STARPU_LU(common_u11)(void *descr[],
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
+			handle = starpu_cublas_get_local_handle();
+			stream = starpu_cuda_get_local_stream();
 			for (z = 0; z < nx; z++)
 			{
 				TYPE pivot;
 				TYPE inv_pivot;
-				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
-				cudaStreamSynchronize(starpu_cuda_get_local_stream());
+				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, stream);
+				cudaStreamSynchronize(stream);
 
 #ifdef COMPLEX_LU
 				STARPU_ASSERT(fpclassify(creal(pivot)) != FP_ZERO);
@@ -381,15 +396,23 @@ static inline void STARPU_LU(common_u11)(void *descr[],
 #endif
 				
 				inv_pivot = 1.0/pivot;
-				CUBLAS_SCAL(nx - z - 1, *(CUBLAS_TYPE*)&inv_pivot, (CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld);
+				status = CUBLAS_SCAL(handle,
+						nx - z - 1,
+						(CUBLAS_TYPE*)&inv_pivot, (CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld);
+				if (status != CUBLAS_STATUS_SUCCESS)
+					STARPU_CUBLAS_REPORT_ERROR(status);
 				
-				CUBLAS_GER(nx - z - 1, nx - z - 1, *(CUBLAS_TYPE*)&m1,
+				status = CUBLAS_GER(handle,
+						nx - z - 1, nx - z - 1,
+						(CUBLAS_TYPE*)&m1,
 						(CUBLAS_TYPE*)&sub11[(z+1)+z*ld], 1,
 						(CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld,
 						(CUBLAS_TYPE*)&sub11[(z+1) + (z+1)*ld],ld);
+				if (status != CUBLAS_STATUS_SUCCESS)
+					STARPU_CUBLAS_REPORT_ERROR(status);
 			}
 			
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			cudaStreamSynchronize(stream);
 
 			break;
 #endif
@@ -458,6 +481,12 @@ static inline void STARPU_LU(common_u11_pivot)(void *descr[],
 	unsigned *ipiv = piv->piv;
 	unsigned first = piv->first;
 
+#ifdef STARPU_USE_CUDA
+	cublasStatus status;
+	cublasHandle_t handle;
+	cudaStream_t stream;
+#endif
+
 	switch (s)
 	{
 		case 0:
@@ -496,43 +525,63 @@ static inline void STARPU_LU(common_u11_pivot)(void *descr[],
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
+			handle = starpu_cublas_get_local_handle();
+			stream = starpu_cuda_get_local_stream();
 			for (z = 0; z < nx; z++)
 			{
 				TYPE pivot;
 				TYPE inv_pivot;
-				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
-				cudaStreamSynchronize(starpu_cuda_get_local_stream());
+				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, stream);
+				cudaStreamSynchronize(stream);
 
 				if (fabs((double)(pivot)) < PIVOT_THRESHHOLD)
 				{
 					/* find the pivot */
-					int piv_ind = CUBLAS_IAMAX(nx - z, (CUBLAS_TYPE*)&sub11[z*(ld+1)], ld) - 1;
+					int piv_ind;
+					status = CUBLAS_IAMAX(handle,
+						nx - z, (CUBLAS_TYPE*)&sub11[z*(ld+1)], ld, &piv_ind);
+					piv_ind -= 1;
+					if (status != CUBLAS_STATUS_SUCCESS)
+						STARPU_CUBLAS_REPORT_ERROR(status);
 	
 					ipiv[z + first] = piv_ind + z + first;
 
 					/* swap if needed */
 					if (piv_ind != 0)
 					{
-						CUBLAS_SWAP(nx, (CUBLAS_TYPE*)&sub11[z*ld], 1, (CUBLAS_TYPE*)&sub11[(z+piv_ind)*ld], 1);
+						status = CUBLAS_SWAP(handle,
+							nx,
+							(CUBLAS_TYPE*)&sub11[z*ld], 1,
+							(CUBLAS_TYPE*)&sub11[(z+piv_ind)*ld], 1);
+						if (status != CUBLAS_STATUS_SUCCESS)
+							STARPU_CUBLAS_REPORT_ERROR(status);
 					}
 
-					cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
-					cudaStreamSynchronize(starpu_cuda_get_local_stream());
+					cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, stream);
+					cudaStreamSynchronize(stream);
 				}
 
 				STARPU_ASSERT(pivot != 0.0);
 				
 				inv_pivot = 1.0/pivot;
-				CUBLAS_SCAL(nx - z - 1, *(CUBLAS_TYPE*)&inv_pivot, (CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld);
+				status = CUBLAS_SCAL(handle,
+						nx - z - 1,
+						(CUBLAS_TYPE*)&inv_pivot,
+						(CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld);
+				if (status != CUBLAS_STATUS_SUCCESS)
+					STARPU_CUBLAS_REPORT_ERROR(status);
 				
-				CUBLAS_GER(nx - z - 1, nx - z - 1, *(CUBLAS_TYPE*)&m1,
+				status = CUBLAS_GER(handle,
+						nx - z - 1, nx - z - 1,
+						(CUBLAS_TYPE*)&m1,
 						(CUBLAS_TYPE*)&sub11[(z+1)+z*ld], 1,
 						(CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld,
 						(CUBLAS_TYPE*)&sub11[(z+1) + (z+1)*ld],ld);
-				
+				if (status != CUBLAS_STATUS_SUCCESS)
+						STARPU_CUBLAS_REPORT_ERROR(status);
 			}
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			cudaStreamSynchronize(stream);
 
 			break;
 #endif
@@ -600,6 +649,11 @@ static inline void STARPU_LU(common_pivot)(void *descr[],
 	unsigned *ipiv = piv->piv;
 	unsigned first = piv->first;
 
+#ifdef STARPU_USE_CUDA
+	cublasStatus status;
+	cublasHandle_t handle;
+#endif
+
 	switch (s)
 	{
 		case 0:
@@ -614,12 +668,18 @@ static inline void STARPU_LU(common_pivot)(void *descr[],
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
+			handle = starpu_cublas_get_local_handle();
 			for (row = 0; row < nx; row++)
 			{
 				unsigned rowpiv = ipiv[row+first] - first;
 				if (rowpiv != row)
 				{
-					CUBLAS_SWAP(nx, (CUBLAS_TYPE*)&matrix[row*ld], 1, (CUBLAS_TYPE*)&matrix[rowpiv*ld], 1);
+					status = CUBLAS_SWAP(handle,
+							nx,
+							(CUBLAS_TYPE*)&matrix[row*ld], 1,
+							(CUBLAS_TYPE*)&matrix[rowpiv*ld], 1);
+					if (status != CUBLAS_STATUS_SUCCESS)
+						STARPU_CUBLAS_REPORT_ERROR(status);
 				}
 			}
 

+ 11 - 3
examples/mult/xgemm.c

@@ -36,7 +36,10 @@
 
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
-#include <cublas.h>
+#include <starpu_cublas_v2.h>
+static const TYPE p1 = 1.0;
+static const TYPE m1 = -1.0;
+static const TYPE v0 = 0.0;
 #endif
 
 static unsigned niter = 10;
@@ -161,8 +164,13 @@ static void cublas_mult(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
 	unsigned ldB = STARPU_MATRIX_GET_LD(descr[1]);
 	unsigned ldC = STARPU_MATRIX_GET_LD(descr[2]);
 
-	CUBLAS_GEMM('n', 'n', nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB,
-				     (TYPE)0.0, subC, ldC);
+	cublasStatus_t status = CUBLAS_GEMM(starpu_cublas_get_local_handle(),
+			CUBLAS_OP_N, CUBLAS_OP_N,
+			nxC, nyC, nyA,
+			&p1, subA, ldA, subB, ldB,
+			&v0, subC, ldC);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif
 

+ 11 - 6
examples/pipeline/pipeline.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2012, 2013, 2014  CNRS
- * Copyright (C) 2012, 2014, 2016  Université de Bordeaux
+ * Copyright (C) 2012, 2014, 2016-2017  Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -35,7 +35,7 @@
 #include <common/blas.h>
 
 #ifdef STARPU_USE_CUDA
-#include <cublas.h>
+#include <starpu_cublas_v2.h>
 #endif
 
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
@@ -100,9 +100,11 @@ void pipeline_cublas_axpy(void *descr[], void *arg)
 	float *x = (float *) STARPU_VECTOR_GET_PTR(descr[0]);
 	float *y = (float *) STARPU_VECTOR_GET_PTR(descr[1]);
 	int n = STARPU_VECTOR_GET_NX(descr[0]);
+	float alpha = 1.;
 
-	cublasSaxpy(n, 1., x, 1, y, 1);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+	cublasStatus_t status = cublasSaxpy(starpu_cublas_get_local_handle(), n, &alpha, x, 1, y, 1);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 }
 #endif
 
@@ -118,6 +120,7 @@ static struct starpu_codelet pipeline_codelet_axpy =
 	.cpu_funcs_name = {"pipeline_cpu_axpy"},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {pipeline_cublas_axpy},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_RW},
@@ -143,9 +146,10 @@ void pipeline_cublas_sum(void *descr[], void *arg)
 	int n = STARPU_VECTOR_GET_NX(descr[0]);
 	float y;
 
-	y = cublasSasum(n, x, 1);
+	cublasStatus_t status = cublasSasum(starpu_cublas_get_local_handle(), n, x, 1, &y);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
 
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	FPRINTF(stderr,"CUBLAS finished with %f\n", y);
 }
 #endif
@@ -162,6 +166,7 @@ static struct starpu_codelet pipeline_codelet_sum =
 	.cpu_funcs_name = {"pipeline_cpu_sum"},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {pipeline_cublas_sum},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 1,
 	.modes = {STARPU_R},

+ 20 - 8
examples/reductions/dot_product.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2015  Université de Bordeaux
+ * Copyright (C) 2010-2015, 2017  Université de Bordeaux
  * Copyright (C) 2012 INRIA
- * Copyright (C) 2016  CNRS
+ * Copyright (C) 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -29,7 +29,7 @@
 
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
-#include <cublas.h>
+#include <starpu_cublas_v2.h>
 #endif
 
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
@@ -245,7 +245,7 @@ void dot_cpu_func(void *descr[], void *cl_arg)
 void dot_cuda_func(void *descr[], void *cl_arg)
 {
 	DOT_TYPE current_dot;
-	DOT_TYPE local_dot;
+	float local_dot;
 
 	float *local_x = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	float *local_y = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
@@ -256,7 +256,10 @@ void dot_cuda_func(void *descr[], void *cl_arg)
 	cudaMemcpyAsync(&current_dot, dot, sizeof(DOT_TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
-	local_dot = (DOT_TYPE)cublasSdot(n, local_x, 1, local_y, 1);
+	cublasStatus_t status = cublasSdot(starpu_cublas_get_local_handle(), n, local_x, 1, local_y, 1, &local_dot);
+	if (status != CUBLAS_STATUS_SUCCESS)
+		STARPU_CUBLAS_REPORT_ERROR(status);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 	/* FPRINTF(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot); */
 	current_dot += local_dot;
@@ -356,10 +359,16 @@ int main(int argc, char **argv)
 #endif
 
 #ifdef STARPU_USE_CUDA
-	/* cublasSdot has synchronization issues when using a non-blocking stream */
-	cublasGetVersion(&cublas_version);
+	cublasHandle_t handle;
+	cublasCreate(&handle);
+	cublasGetVersion(handle, &cublas_version);
+	cublasDestroy(handle);
 	if (cublas_version >= 7050)
 		starpu_cublas_init();
+	else
+		/* Disable the sdot cublas kernel, it is bogus with a
+		 * non-blocking stream (Nvidia bugid 1669886) */
+		dot_codelet.cuda_funcs[0] = NULL;
 #endif
 
 	unsigned long nelems = _nblocks*_entries_per_block;
@@ -446,10 +455,13 @@ int main(int argc, char **argv)
 	if (fabs(reference_dot - _dot) < reference_dot * 1e-6)
 		return EXIT_SUCCESS;
 	else
+	{
+		FPRINTF(stderr, "ERROR: fabs(%e - %e) >= %e * 1e-6\n", reference_dot, _dot, reference_dot);
 		return EXIT_FAILURE;
+	}
 
 enodev:
-	fprintf(stderr, "WARNING: No one can execute this task\n");
+	FPRINTF(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
 	return 77;

+ 0 - 4
examples/sched_ctx/gpu_partition.c

@@ -26,10 +26,6 @@
 
 #include <common/blas.h>
 
-#ifdef STARPU_USE_CUDA
-#include <cublas.h>
-#endif
-
 
 #define N	512*512
 #define NITER   100

+ 2 - 2
examples/sched_ctx/parallel_code.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2014, 2016  Université de Bordeaux
+ * Copyright (C) 2010-2014, 2016-2017  Université de Bordeaux
  * Copyright (C) 2010-2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -97,5 +97,5 @@ int main(int argc, char **argv)
 
 	free(procs1);
 
-	return (ret == -ENODEV ? 77 : 0);
+	return 0;
 }

+ 2 - 0
examples/spmv/dw_block_spmv.c

@@ -167,6 +167,7 @@ struct starpu_codelet cl =
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cublas_block_spmv},
 #endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 3,
 	.modes = {STARPU_R, STARPU_R, STARPU_RW}
 };
@@ -320,6 +321,7 @@ int main(STARPU_ATTRIBUTE_UNUSED int argc,
 	if (ret == -ENODEV)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	starpu_cublas_init();
 
 	sem_init(&sem, 0, 0U);
 

+ 0 - 4
examples/spmv/dw_block_spmv.h

@@ -28,10 +28,6 @@
 
 #include <starpu.h>
 
-#ifdef STARPU_USE_CUDA
-#include <cublas.h>
-#endif
-
 void cpu_block_spmv(void *descr[], void *_args);
 
 #ifdef STARPU_USE_CUDA

+ 10 - 1
examples/spmv/dw_block_spmv_kernels.c

@@ -24,6 +24,12 @@
  *   U22 
  */
 
+#ifdef STARPU_USE_CUDA
+#include <starpu_cublas_v2.h>
+static const float p1 =  1.0;
+static const float m1 = -1.0;
+#endif
+
 static inline void common_block_spmv(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args)
 {
 	/* printf("22\n"); */
@@ -43,7 +49,10 @@ static inline void common_block_spmv(void *descr[], int s, STARPU_ATTRIBUTE_UNUS
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
-			cublasSgemv ('t', dx, dy, 1.0f, block, ld, in, 1, 1.0f, out, 1);
+			cublasStatus_t status = cublasSgemv (starpu_cublas_get_local_handle(),
+					CUBLAS_OP_T, dx, dy, &p1, block, ld, in, 1, &p1, out, 1);
+			if (status != CUBLAS_STATUS_SUCCESS)
+				STARPU_CUBLAS_REPORT_ERROR(status);
 			break;
 #endif
 		default:

+ 1 - 0
include/starpu_cublas.h

@@ -24,6 +24,7 @@ extern "C"
 #endif
 
 void starpu_cublas_init(void);
+void starpu_cublas_set_stream(void);
 void starpu_cublas_shutdown(void);
 
 #ifdef __cplusplus

+ 34 - 0
include/starpu_cublas_v2.h

@@ -0,0 +1,34 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2012  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_CUBLAS_V2_H__
+#define __STARPU_CUBLAS_V2_H__
+
+#include <cublas_v2.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+cublasHandle_t starpu_cublas_get_local_handle(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __STARPU_CUBLAS_V2_H__ */

+ 1 - 0
include/starpu_fxt.h

@@ -43,6 +43,7 @@ struct starpu_fxt_options
 	unsigned no_counter;
 	unsigned no_bus;
 	unsigned ninputfiles;
+	unsigned no_smooth;
 	char *filenames[STARPU_FXT_MAX_FILES];
 	char *out_paje_path;
 	char *distrib_time_path;

+ 2 - 1
include/starpu_util.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2016  Université de Bordeaux
+ * Copyright (C) 2010-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -239,6 +239,7 @@ static __starpu_inline unsigned long starpu_atomic_##name##l(unsigned long *ptr,
 	return expr; \
 }
 
+/* Returns the new value */
 #ifdef STARPU_HAVE_SYNC_FETCH_AND_ADD
 #define STARPU_ATOMIC_ADD(ptr, value)  (__sync_fetch_and_add ((ptr), (value)) + (value))
 #define STARPU_ATOMIC_ADDL(ptr, value)  (__sync_fetch_and_add ((ptr), (value)) + (value))

+ 3 - 1
include/starpu_worker.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2013, 2016  Université de Bordeaux
- * Copyright (C) 2010-2014  CNRS
+ * Copyright (C) 2010-2014, 2017  CNRS
  * Copyright (C) 2016, 2017  INRIA
  * Copyright (C) 2016  Uppsala University
  *
@@ -117,6 +117,8 @@ int starpu_worker_get_by_devid(enum starpu_worker_archtype type, int devid);
 
 void starpu_worker_get_name(int id, char *dst, size_t maxlen);
 
+void starpu_worker_display_names(FILE *output, enum starpu_worker_archtype type);
+
 int starpu_worker_get_devid(int id);
 
 int starpu_worker_get_mp_nodeid(int id);

+ 23 - 22
mpi/src/starpu_mpi.c

@@ -1679,40 +1679,41 @@ void _starpu_mpi_progress_shutdown(int *value)
         STARPU_PTHREAD_COND_DESTROY(&barrier_cond);
 }
 
-void _starpu_mpi_clear_cache(starpu_data_handle_t data_handle)
+void _starpu_mpi_data_clear(starpu_data_handle_t data_handle)
 {
 	_starpu_mpi_tag_data_release(data_handle);
-	struct _starpu_mpi_node_tag *mpi_data = data_handle->mpi_data;
-	_starpu_mpi_cache_flush(mpi_data->comm, data_handle);
+	_starpu_mpi_cache_data_clear(data_handle);
 	free(data_handle->mpi_data);
 }
 
 void starpu_mpi_data_register_comm(starpu_data_handle_t data_handle, int tag, int rank, MPI_Comm comm)
 {
-	struct _starpu_mpi_node_tag *mpi_data;
+	struct _starpu_mpi_data *mpi_data;
 	if (data_handle->mpi_data)
 	{
 		mpi_data = data_handle->mpi_data;
 	}
 	else
 	{
-		_STARPU_CALLOC(mpi_data, 1, sizeof(struct _starpu_mpi_node_tag));
-		mpi_data->data_tag = -1;
-		mpi_data->rank = -1;
-		mpi_data->comm = MPI_COMM_WORLD;
+		_STARPU_CALLOC(mpi_data, 1, sizeof(struct _starpu_mpi_data));
+		mpi_data->magic = 42;
+		mpi_data->node_tag.data_tag = -1;
+		mpi_data->node_tag.rank = -1;
+		mpi_data->node_tag.comm = MPI_COMM_WORLD;
 		data_handle->mpi_data = mpi_data;
+		_starpu_mpi_cache_data_init(data_handle);
 		_starpu_mpi_tag_data_register(data_handle, tag);
-		_starpu_data_set_unregister_hook(data_handle, _starpu_mpi_clear_cache);
+		_starpu_data_set_unregister_hook(data_handle, _starpu_mpi_data_clear);
 	}
 
 	if (tag != -1)
 	{
-		mpi_data->data_tag = tag;
+		mpi_data->node_tag.data_tag = tag;
 	}
 	if (rank != -1)
 	{
-		mpi_data->rank = rank;
-		mpi_data->comm = comm;
+		mpi_data->node_tag.rank = rank;
+		mpi_data->node_tag.comm = comm;
 		_starpu_mpi_comm_register(comm);
 	}
 }
@@ -1730,13 +1731,13 @@ void starpu_mpi_data_set_tag(starpu_data_handle_t handle, int tag)
 int starpu_mpi_data_get_rank(starpu_data_handle_t data)
 {
 	STARPU_ASSERT_MSG(data->mpi_data, "starpu_mpi_data_register MUST be called for data %p\n", data);
-	return ((struct _starpu_mpi_node_tag *)(data->mpi_data))->rank;
+	return ((struct _starpu_mpi_data *)(data->mpi_data))->node_tag.rank;
 }
 
 int starpu_mpi_data_get_tag(starpu_data_handle_t data)
 {
 	STARPU_ASSERT_MSG(data->mpi_data, "starpu_mpi_data_register MUST be called for data %p\n", data);
-	return ((struct _starpu_mpi_node_tag *)(data->mpi_data))->data_tag;
+	return ((struct _starpu_mpi_data *)(data->mpi_data))->node_tag.data_tag;
 }
 
 void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg)
@@ -1760,8 +1761,8 @@ void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t da
 	if (me == node)
 	{
 		_STARPU_MPI_DEBUG(1, "Migrating data %p from %d to %d\n", data_handle, rank, node);
-		void *already_received = _starpu_mpi_cache_received_data_set(data_handle, rank);
-		if (already_received == NULL)
+		int already_received = _starpu_mpi_cache_received_data_set(data_handle);
+		if (already_received == 0)
 		{
 			_STARPU_MPI_DEBUG(1, "Receiving data %p from %d\n", data_handle, rank);
 			starpu_mpi_irecv_detached(data_handle, rank, tag, comm, callback, arg);
@@ -1770,8 +1771,8 @@ void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t da
 	else if (me == rank)
 	{
 		_STARPU_MPI_DEBUG(1, "Migrating data %p from %d to %d\n", data_handle, rank, node);
-		void *already_sent = _starpu_mpi_cache_sent_data_set(data_handle, node);
-		if (already_sent == NULL)
+		int already_sent = _starpu_mpi_cache_sent_data_set(data_handle, node);
+		if (already_sent == 0)
 		{
 			_STARPU_MPI_DEBUG(1, "Sending data %p to %d\n", data_handle, node);
 			starpu_mpi_isend_detached(data_handle, node, tag, comm, NULL, NULL);
@@ -1801,8 +1802,8 @@ void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle
 	{
 		MPI_Status status;
 		_STARPU_MPI_DEBUG(1, "Migrating data %p from %d to %d\n", data_handle, rank, node);
-		void *already_received = _starpu_mpi_cache_received_data_set(data_handle, rank);
-		if (already_received == NULL)
+		int already_received = _starpu_mpi_cache_received_data_set(data_handle);
+		if (already_received == 0)
 		{
 			_STARPU_MPI_DEBUG(1, "Receiving data %p from %d\n", data_handle, rank);
 			starpu_mpi_recv(data_handle, rank, tag, comm, &status);
@@ -1811,8 +1812,8 @@ void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle
 	else if (me == rank)
 	{
 		_STARPU_MPI_DEBUG(1, "Migrating data %p from %d to %d\n", data_handle, rank, node);
-		void *already_sent = _starpu_mpi_cache_sent_data_set(data_handle, node);
-		if (already_sent == NULL)
+		int already_sent = _starpu_mpi_cache_sent_data_set(data_handle, node);
+		if (already_sent == 0)
 		{
 			_STARPU_MPI_DEBUG(1, "Sending data %p to %d\n", data_handle, node);
 			starpu_mpi_send(data_handle, node, tag, comm);

+ 205 - 202
mpi/src/starpu_mpi_cache.c

@@ -18,6 +18,7 @@
 
 #include <starpu.h>
 #include <common/uthash.h>
+#include <datawizard/coherency.h>
 
 #include <starpu_mpi_cache.h>
 #include <starpu_mpi_cache_stats.h>
@@ -27,16 +28,14 @@
 struct _starpu_data_entry
 {
 	UT_hash_handle hh;
-	starpu_data_handle_t data;
+	starpu_data_handle_t data_handle;
 };
 
-static starpu_pthread_mutex_t *_cache_sent_mutex;
-static starpu_pthread_mutex_t *_cache_received_mutex;
-static struct _starpu_data_entry **_cache_sent_data = NULL;
-static struct _starpu_data_entry **_cache_received_data = NULL;
+static starpu_pthread_mutex_t _cache_mutex;
+static struct _starpu_data_entry *_cache_data = NULL;
 int _starpu_cache_enabled=1;
-MPI_Comm _starpu_cache_comm;
-int _starpu_cache_comm_size;
+static MPI_Comm _starpu_cache_comm;
+static int _starpu_cache_comm_size;
 
 int starpu_mpi_cache_is_enabled()
 {
@@ -55,7 +54,7 @@ int starpu_mpi_cache_set(int enabled)
 		{
 			// We need to clean the cache
 			starpu_mpi_cache_flush_all_data(_starpu_cache_comm);
-			_starpu_mpi_cache_shutdown(_starpu_cache_comm_size);
+			_starpu_mpi_cache_shutdown();
 		}
 		_starpu_cache_enabled = 0;
 	}
@@ -64,8 +63,6 @@ int starpu_mpi_cache_set(int enabled)
 
 void _starpu_mpi_cache_init(MPI_Comm comm)
 {
-	int i;
-
 	_starpu_cache_enabled = starpu_get_env_number("STARPU_MPI_CACHE");
 	if (_starpu_cache_enabled == -1)
 	{
@@ -80,295 +77,301 @@ void _starpu_mpi_cache_init(MPI_Comm comm)
 
 	_starpu_cache_comm = comm;
 	starpu_mpi_comm_size(comm, &_starpu_cache_comm_size);
-	_STARPU_MPI_DEBUG(2, "Initialising htable for cache\n");
+	_starpu_mpi_cache_stats_init();
+	STARPU_PTHREAD_MUTEX_INIT(&_cache_mutex, NULL);
+}
+
+void _starpu_mpi_cache_shutdown()
+{
+	if (_starpu_cache_enabled == 0) return;
 
-	_STARPU_MPI_MALLOC(_cache_sent_data, _starpu_cache_comm_size * sizeof(struct _starpu_data_entry *));
-	_STARPU_MPI_MALLOC(_cache_received_data, _starpu_cache_comm_size * sizeof(struct _starpu_data_entry *));
-	_STARPU_MPI_MALLOC(_cache_sent_mutex, _starpu_cache_comm_size * sizeof(starpu_pthread_mutex_t));
-	_STARPU_MPI_MALLOC(_cache_received_mutex, _starpu_cache_comm_size * sizeof(starpu_pthread_mutex_t));
+	struct _starpu_data_entry *entry, *tmp;
 
-	for(i=0 ; i<_starpu_cache_comm_size ; i++)
+	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
+	HASH_ITER(hh, _cache_data, entry, tmp)
 	{
-		_cache_sent_data[i] = NULL;
-		_cache_received_data[i] = NULL;
-		STARPU_PTHREAD_MUTEX_INIT(&_cache_sent_mutex[i], NULL);
-		STARPU_PTHREAD_MUTEX_INIT(&_cache_received_mutex[i], NULL);
+		HASH_DEL(_cache_data, entry);
+		free(entry);
 	}
-	_starpu_mpi_cache_stats_init(comm);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
+	STARPU_PTHREAD_MUTEX_DESTROY(&_cache_mutex);
+	free(_cache_data);
+	_starpu_mpi_cache_stats_shutdown();
 }
 
-static
-void _starpu_mpi_cache_empty_tables(int world_size)
+void _starpu_mpi_cache_data_clear(starpu_data_handle_t data_handle)
 {
 	int i;
+	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
 
 	if (_starpu_cache_enabled == 0) return;
 
-	_STARPU_MPI_DEBUG(2, "Clearing htable for cache\n");
-
-	for(i=0 ; i<world_size ; i++)
+	_starpu_mpi_cache_flush(data_handle);
+	for(i=0 ; i<_starpu_cache_comm_size ; i++)
 	{
-		struct _starpu_data_entry *entry, *tmp;
-
-		STARPU_PTHREAD_MUTEX_LOCK(&_cache_sent_mutex[i]);
-		HASH_ITER(hh, _cache_sent_data[i], entry, tmp)
-		{
-			HASH_DEL(_cache_sent_data[i], entry);
-			free(entry);
-		}
-		STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_sent_mutex[i]);
-
-		STARPU_PTHREAD_MUTEX_LOCK(&_cache_received_mutex[i]);
-		HASH_ITER(hh, _cache_received_data[i], entry, tmp)
-		{
-			HASH_DEL(_cache_received_data[i], entry);
-			_starpu_mpi_cache_stats_dec(i, entry->data);
-			free(entry);
-		}
-		STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_received_mutex[i]);
+		STARPU_PTHREAD_MUTEX_DESTROY(&mpi_data->cache_sent_mutex[i]);
 	}
+	free(mpi_data->cache_sent);
+	free(mpi_data->cache_sent_mutex);
 }
 
-void _starpu_mpi_cache_shutdown()
+void _starpu_mpi_cache_data_init(starpu_data_handle_t data_handle)
 {
 	int i;
+	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
 
 	if (_starpu_cache_enabled == 0) return;
 
-	_starpu_mpi_cache_empty_tables(_starpu_cache_comm_size);
-	free(_cache_sent_data);
-	free(_cache_received_data);
-
+	STARPU_PTHREAD_MUTEX_INIT(&mpi_data->cache_received_mutex, NULL);
+	mpi_data->cache_received = 0;
+	_STARPU_MALLOC(mpi_data->cache_sent_mutex, _starpu_cache_comm_size*sizeof(mpi_data->cache_sent_mutex[0]));
+	_STARPU_MALLOC(mpi_data->cache_sent, _starpu_cache_comm_size*sizeof(mpi_data->cache_sent[0]));
 	for(i=0 ; i<_starpu_cache_comm_size ; i++)
 	{
-		STARPU_PTHREAD_MUTEX_DESTROY(&_cache_sent_mutex[i]);
-		STARPU_PTHREAD_MUTEX_DESTROY(&_cache_received_mutex[i]);
+		STARPU_PTHREAD_MUTEX_INIT(&mpi_data->cache_sent_mutex[i], NULL);
+		mpi_data->cache_sent[i] = 0;
 	}
-	free(_cache_sent_mutex);
-	free(_cache_received_mutex);
-
-	_starpu_mpi_cache_stats_shutdown();
 }
 
-void _starpu_mpi_cache_sent_data_clear(MPI_Comm comm, starpu_data_handle_t data)
+static void _starpu_mpi_cache_data_add(starpu_data_handle_t data_handle)
 {
-	int n, size;
-	starpu_mpi_comm_size(comm, &size);
+	struct _starpu_data_entry *entry;
 
-	for(n=0 ; n<size ; n++)
+	if (_starpu_cache_enabled == 0) return;
+
+	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
+	HASH_FIND_PTR(_cache_data, &data_handle, entry);
+	if (entry == NULL)
 	{
-		struct _starpu_data_entry *already_sent;
+		_STARPU_MPI_MALLOC(entry, sizeof(*entry));
+		entry->data_handle = data_handle;
+		HASH_ADD_PTR(_cache_data, data_handle, entry);
+	}
+	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
+}
 
-		STARPU_PTHREAD_MUTEX_LOCK(&_cache_sent_mutex[n]);
-		HASH_FIND_PTR(_cache_sent_data[n], &data, already_sent);
-		if (already_sent)
-		{
-			_STARPU_MPI_DEBUG(2, "Clearing send cache for data %p\n", data);
-			HASH_DEL(_cache_sent_data[n], already_sent);
-			free(already_sent);
-		}
-		STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_sent_mutex[n]);
+static void _starpu_mpi_cache_data_remove(starpu_data_handle_t data_handle)
+{
+	struct _starpu_data_entry *entry;
+
+	if (_starpu_cache_enabled == 0) return;
+
+	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
+	HASH_FIND_PTR(_cache_data, &data_handle, entry);
+	if (entry)
+	{
+		HASH_DEL(_cache_data, entry);
+		free(entry);
 	}
+	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
 }
 
-void _starpu_mpi_cache_received_data_clear(starpu_data_handle_t data)
+/**************************************
+ * Received cache
+ **************************************/
+void _starpu_mpi_cache_received_data_clear(starpu_data_handle_t data_handle)
 {
-	int mpi_rank = starpu_mpi_data_get_rank(data);
-	struct _starpu_data_entry *already_received;
+	int mpi_rank = starpu_mpi_data_get_rank(data_handle);
+	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
+
+	if (_starpu_cache_enabled == 0) return;
 
+	STARPU_ASSERT(mpi_data->magic == 42);
 	STARPU_MPI_ASSERT_MSG(mpi_rank < _starpu_cache_comm_size, "Node %d invalid. Max node is %d\n", mpi_rank, _starpu_cache_comm_size);
 
-	STARPU_PTHREAD_MUTEX_LOCK(&_cache_received_mutex[mpi_rank]);
-	HASH_FIND_PTR(_cache_received_data[mpi_rank], &data, already_received);
-	if (already_received)
+	STARPU_PTHREAD_MUTEX_LOCK(&mpi_data->cache_received_mutex);
+	if (mpi_data->cache_received == 1)
 	{
 #ifdef STARPU_DEVEL
 #  warning TODO: Somebody else will write to the data, so discard our cached copy if any. starpu_mpi could just remember itself.
 #endif
-		_STARPU_MPI_DEBUG(2, "Clearing receive cache for data %p\n", data);
-		HASH_DEL(_cache_received_data[mpi_rank], already_received);
-		_starpu_mpi_cache_stats_dec(mpi_rank, data);
-		free(already_received);
-		starpu_data_invalidate_submit(data);
+		_STARPU_MPI_DEBUG(2, "Clearing receive cache for data %p\n", data_handle);
+		mpi_data->cache_received = 0;
+		starpu_data_invalidate_submit(data_handle);
+		_starpu_mpi_cache_data_remove(data_handle);
+		_starpu_mpi_cache_stats_dec(mpi_rank, data_handle);
 	}
-	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_received_mutex[mpi_rank]);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&mpi_data->cache_received_mutex);
 }
 
-void starpu_mpi_cache_flush_all_data(MPI_Comm comm)
+int _starpu_mpi_cache_received_data_set(starpu_data_handle_t data_handle)
 {
-	int nb_nodes, i;
-	int mpi_rank, my_rank;
+	int mpi_rank = starpu_mpi_data_get_rank(data_handle);
+	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
 
-	if (_starpu_cache_enabled == 0) return;
+	if (_starpu_cache_enabled == 0) return 0;
 
-	starpu_mpi_comm_size(comm, &nb_nodes);
-	starpu_mpi_comm_rank(comm, &my_rank);
+	STARPU_ASSERT(mpi_data->magic == 42);
+	STARPU_MPI_ASSERT_MSG(mpi_rank < _starpu_cache_comm_size, "Node %d invalid. Max node is %d\n", mpi_rank, _starpu_cache_comm_size);
 
-	for(i=0 ; i<nb_nodes ; i++)
+	STARPU_PTHREAD_MUTEX_LOCK(&mpi_data->cache_received_mutex);
+	int already_received = mpi_data->cache_received;
+	if (already_received == 0)
+	{
+		_STARPU_MPI_DEBUG(2, "Noting that data %p has already been received by %d\n", data_handle, mpi_rank);
+		mpi_data->cache_received = 1;
+		_starpu_mpi_cache_data_add(data_handle);
+		_starpu_mpi_cache_stats_inc(mpi_rank, data_handle);
+	}
+	else
 	{
-		struct _starpu_data_entry *entry, *tmp;
+		_STARPU_MPI_DEBUG(2, "Do not receive data %p from node %d as it is already available\n", data_handle, mpi_rank);
+	}
+	STARPU_PTHREAD_MUTEX_UNLOCK(&mpi_data->cache_received_mutex);
+	return already_received;
+}
 
-		STARPU_PTHREAD_MUTEX_LOCK(&_cache_sent_mutex[i]);
-		HASH_ITER(hh, _cache_sent_data[i], entry, tmp)
-		{
-			mpi_rank = starpu_mpi_data_get_rank(entry->data);
-			if (mpi_rank != my_rank && mpi_rank != -1)
-				starpu_data_invalidate_submit(entry->data);
-			HASH_DEL(_cache_sent_data[i], entry);
-			free(entry);
-		}
-		STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_sent_mutex[i]);
+int _starpu_mpi_cache_received_data_get(starpu_data_handle_t data_handle)
+{
+	int already_received;
+	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
 
-		STARPU_PTHREAD_MUTEX_LOCK(&_cache_received_mutex[i]);
-		HASH_ITER(hh, _cache_received_data[i], entry, tmp)
-		{
-			mpi_rank = starpu_mpi_data_get_rank(entry->data);
-			if (mpi_rank != my_rank && mpi_rank != -1)
-				starpu_data_invalidate_submit(entry->data);
-			HASH_DEL(_cache_received_data[i], entry);
-			_starpu_mpi_cache_stats_dec(i, entry->data);
-			free(entry);
-		}
-		STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_received_mutex[i]);
-	}
+	if (_starpu_cache_enabled == 0) return 0;
+
+	STARPU_ASSERT(mpi_data->magic == 42);
+
+	STARPU_PTHREAD_MUTEX_LOCK(&mpi_data->cache_received_mutex);
+	already_received = mpi_data->cache_received;
+	STARPU_PTHREAD_MUTEX_UNLOCK(&mpi_data->cache_received_mutex);
+	return already_received;
 }
 
-void _starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle)
+int starpu_mpi_cached_receive(starpu_data_handle_t data_handle)
 {
-	struct _starpu_data_entry *avail;
-	int i, nb_nodes;
+	return _starpu_mpi_cache_received_data_get(data_handle);
+}
+
+/**************************************
+ * Send cache
+ **************************************/
+void _starpu_mpi_cache_sent_data_clear(starpu_data_handle_t data_handle)
+{
+	int n, size;
+	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
 
 	if (_starpu_cache_enabled == 0) return;
 
-	starpu_mpi_comm_size(comm, &nb_nodes);
-	for(i=0 ; i<nb_nodes ; i++)
+	starpu_mpi_comm_size(mpi_data->node_tag.comm, &size);
+	for(n=0 ; n<size ; n++)
 	{
-		STARPU_PTHREAD_MUTEX_LOCK(&_cache_sent_mutex[i]);
-		HASH_FIND_PTR(_cache_sent_data[i], &data_handle, avail);
-		if (avail)
-		{
-			_STARPU_MPI_DEBUG(2, "Clearing send cache for data %p\n", data_handle);
-			HASH_DEL(_cache_sent_data[i], avail);
-			free(avail);
-		}
-		STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_sent_mutex[i]);
-
-		STARPU_PTHREAD_MUTEX_LOCK(&_cache_received_mutex[i]);
-		HASH_FIND_PTR(_cache_received_data[i], &data_handle, avail);
-		if (avail)
+		STARPU_PTHREAD_MUTEX_LOCK(&mpi_data->cache_sent_mutex[n]);
+		if (mpi_data->cache_sent[n] == 1)
 		{
 			_STARPU_MPI_DEBUG(2, "Clearing send cache for data %p\n", data_handle);
-			HASH_DEL(_cache_received_data[i], avail);
-			_starpu_mpi_cache_stats_dec(i, data_handle);
-			free(avail);
+			mpi_data->cache_sent[n] = 0;
+			_starpu_mpi_cache_data_remove(data_handle);
 		}
-		STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_received_mutex[i]);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&mpi_data->cache_sent_mutex[n]);
 	}
 }
 
-void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle)
-{
-	int my_rank, mpi_rank;
-
-	_starpu_mpi_cache_flush(comm, data_handle);
-
-	starpu_mpi_comm_rank(comm, &my_rank);
-	mpi_rank = starpu_mpi_data_get_rank(data_handle);
-	if (mpi_rank != my_rank && mpi_rank != -1)
-		starpu_data_invalidate_submit(data_handle);
-}
-
-void *_starpu_mpi_cache_received_data_set(starpu_data_handle_t data, int mpi_rank)
+int _starpu_mpi_cache_sent_data_set(starpu_data_handle_t data_handle, int dest)
 {
-	struct _starpu_data_entry *already_received;
+	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
 
-	if (_starpu_cache_enabled == 0) return NULL;
+	if (_starpu_cache_enabled == 0) return 0;
 
-	STARPU_MPI_ASSERT_MSG(mpi_rank < _starpu_cache_comm_size, "Node %d invalid. Max node is %d\n", mpi_rank, _starpu_cache_comm_size);
+	STARPU_MPI_ASSERT_MSG(dest < _starpu_cache_comm_size, "Node %d invalid. Max node is %d\n", dest, _starpu_cache_comm_size);
 
-	STARPU_PTHREAD_MUTEX_LOCK(&_cache_received_mutex[mpi_rank]);
-	HASH_FIND_PTR(_cache_received_data[mpi_rank], &data, already_received);
-	if (already_received == NULL)
+	STARPU_PTHREAD_MUTEX_LOCK(&mpi_data->cache_sent_mutex[dest]);
+	int already_sent = mpi_data->cache_sent[dest];
+	if (mpi_data->cache_sent[dest] == 0)
 	{
-		struct _starpu_data_entry *entry;
-		_STARPU_MPI_MALLOC(entry, sizeof(*entry));
-		entry->data = data;
-		HASH_ADD_PTR(_cache_received_data[mpi_rank], data, entry);
-		_STARPU_MPI_DEBUG(2, "Noting that data %p has already been received by %d\n", data, mpi_rank);
-		_starpu_mpi_cache_stats_inc(mpi_rank, data);
+		mpi_data->cache_sent[dest] = 1;
+		_starpu_mpi_cache_data_add(data_handle);
+		_STARPU_MPI_DEBUG(2, "Noting that data %p has already been sent to %d\n", data_handle, dest);
 	}
 	else
 	{
-		_STARPU_MPI_DEBUG(2, "Do not receive data %p from node %d as it is already available\n", data, mpi_rank);
+		_STARPU_MPI_DEBUG(2, "Do not send data %p to node %d as it has already been sent\n", data_handle, dest);
 	}
-	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_received_mutex[mpi_rank]);
-	return already_received;
+	STARPU_PTHREAD_MUTEX_UNLOCK(&mpi_data->cache_sent_mutex[dest]);
+	return already_sent;
 }
 
-void *_starpu_mpi_cache_received_data_get(starpu_data_handle_t data, int mpi_rank)
+int _starpu_mpi_cache_sent_data_get(starpu_data_handle_t data_handle, int dest)
 {
-	struct _starpu_data_entry *already_received;
+	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
+	int already_sent;
 
-	if (_starpu_cache_enabled == 0) return NULL;
+	if (_starpu_cache_enabled == 0) return 0;
 
-	STARPU_MPI_ASSERT_MSG(mpi_rank < _starpu_cache_comm_size, "Node %d invalid. Max node is %d\n", mpi_rank, _starpu_cache_comm_size);
+	STARPU_MPI_ASSERT_MSG(dest < _starpu_cache_comm_size, "Node %d invalid. Max node is %d\n", dest, _starpu_cache_comm_size);
 
-	STARPU_PTHREAD_MUTEX_LOCK(&_cache_received_mutex[mpi_rank]);
-	HASH_FIND_PTR(_cache_received_data[mpi_rank], &data, already_received);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_received_mutex[mpi_rank]);
-	return already_received;
+	STARPU_PTHREAD_MUTEX_LOCK(&mpi_data->cache_sent_mutex[dest]);
+	already_sent = mpi_data->cache_sent[dest];
+	STARPU_PTHREAD_MUTEX_UNLOCK(&mpi_data->cache_sent_mutex[dest]);
+	return already_sent;
 }
 
-int starpu_mpi_cached_receive(starpu_data_handle_t data_handle)
+int starpu_mpi_cached_send(starpu_data_handle_t data_handle, int dest)
 {
-	int owner = starpu_mpi_data_get_rank(data_handle);
-	void *already_received = _starpu_mpi_cache_received_data_get(data_handle, owner);
-	return already_received != NULL;
+	return _starpu_mpi_cache_sent_data_get(data_handle, dest);
 }
 
-void *_starpu_mpi_cache_sent_data_set(starpu_data_handle_t data, int dest)
+void _starpu_mpi_cache_flush(starpu_data_handle_t data_handle)
 {
-	struct _starpu_data_entry *already_sent;
-
-	if (_starpu_cache_enabled == 0) return NULL;
+	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
+	int i, nb_nodes;
 
-	STARPU_MPI_ASSERT_MSG(dest < _starpu_cache_comm_size, "Node %d invalid. Max node is %d\n", dest, _starpu_cache_comm_size);
+	if (_starpu_cache_enabled == 0) return;
 
-	STARPU_PTHREAD_MUTEX_LOCK(&_cache_sent_mutex[dest]);
-	HASH_FIND_PTR(_cache_sent_data[dest], &data, already_sent);
-	if (already_sent == NULL)
+	starpu_mpi_comm_size(mpi_data->node_tag.comm, &nb_nodes);
+	for(i=0 ; i<nb_nodes ; i++)
 	{
-		struct _starpu_data_entry *entry;
-		_STARPU_MPI_MALLOC(entry, sizeof(*entry));
-		entry->data = data;
-		HASH_ADD_PTR(_cache_sent_data[dest], data, entry);
-		_STARPU_MPI_DEBUG(2, "Noting that data %p has already been sent to %d\n", data, dest);
+		STARPU_PTHREAD_MUTEX_LOCK(&mpi_data->cache_sent_mutex[i]);
+		if (mpi_data->cache_sent[i] == 1)
+		{
+			_STARPU_MPI_DEBUG(2, "Clearing send cache for data %p\n", data_handle);
+			mpi_data->cache_sent[i] = 0;
+			_starpu_mpi_cache_stats_dec(i, data_handle);
+		}
+		STARPU_PTHREAD_MUTEX_UNLOCK(&mpi_data->cache_sent_mutex[i]);
 	}
-	else
+
+	STARPU_PTHREAD_MUTEX_LOCK(&mpi_data->cache_received_mutex);
+	if (mpi_data->cache_received == 1)
 	{
-		_STARPU_MPI_DEBUG(2, "Do not send data %p to node %d as it has already been sent\n", data, dest);
+		int mpi_rank = starpu_mpi_data_get_rank(data_handle);
+		_STARPU_MPI_DEBUG(2, "Clearing received cache for data %p\n", data_handle);
+		mpi_data->cache_received = 0;
+		_starpu_mpi_cache_stats_dec(mpi_rank, data_handle);
 	}
-	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_sent_mutex[dest]);
-	return already_sent;
+	STARPU_PTHREAD_MUTEX_UNLOCK(&mpi_data->cache_received_mutex);
 }
 
-void *_starpu_mpi_cache_sent_data_get(starpu_data_handle_t data, int dest)
+static void _starpu_mpi_cache_flush_and_invalidate(MPI_Comm comm, starpu_data_handle_t data_handle)
 {
-	struct _starpu_data_entry *already_sent;
-
-	if (_starpu_cache_enabled == 0) return NULL;
+	int my_rank, mpi_rank;
 
-	STARPU_MPI_ASSERT_MSG(dest < _starpu_cache_comm_size, "Node %d invalid. Max node is %d\n", dest, _starpu_cache_comm_size);
+	_starpu_mpi_cache_flush(data_handle);
 
-	STARPU_PTHREAD_MUTEX_LOCK(&_cache_sent_mutex[dest]);
-	HASH_FIND_PTR(_cache_sent_data[dest], &data, already_sent);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_sent_mutex[dest]);
-	return already_sent;
+	starpu_mpi_comm_rank(comm, &my_rank);
+	mpi_rank = starpu_mpi_data_get_rank(data_handle);
+	if (mpi_rank != my_rank && mpi_rank != -1)
+		starpu_data_invalidate_submit(data_handle);
 }
 
-int starpu_mpi_cached_send(starpu_data_handle_t data_handle, int dest)
+void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle)
 {
-	void *already_sent = _starpu_mpi_cache_sent_data_get(data_handle, dest);
-	return already_sent != NULL;
+	if (_starpu_cache_enabled == 0) return;
+
+	_starpu_mpi_cache_flush_and_invalidate(comm, data_handle);
+	_starpu_mpi_cache_data_remove(data_handle);
 }
 
+void starpu_mpi_cache_flush_all_data(MPI_Comm comm)
+{
+	struct _starpu_data_entry *entry, *tmp;
+
+	if (_starpu_cache_enabled == 0) return;
+
+	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
+	HASH_ITER(hh, _cache_data, entry, tmp)
+	{
+		_starpu_mpi_cache_flush_and_invalidate(comm, entry->data_handle);
+		HASH_DEL(_cache_data, entry);
+		free(entry);
+	}
+	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
+}

+ 9 - 7
mpi/src/starpu_mpi_cache.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016  CNRS
+ * Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  * Copyright (C) 2011-2014, 2017  Université de Bordeaux
  * Copyright (C) 2014 INRIA
  *
@@ -30,24 +30,26 @@ extern "C" {
 extern int _starpu_cache_enabled;
 void _starpu_mpi_cache_init(MPI_Comm comm);
 void _starpu_mpi_cache_shutdown();
+void _starpu_mpi_cache_data_init(starpu_data_handle_t data_handle);
+void _starpu_mpi_cache_data_clear(starpu_data_handle_t data_handle);
 
 /*
  * If the data is already available in the cache, return a pointer to the data
  * If the data is NOT available in the cache, add it to the cache and return NULL
  */
-void *_starpu_mpi_cache_received_data_set(starpu_data_handle_t data, int mpi_rank);
-void *_starpu_mpi_cache_received_data_get(starpu_data_handle_t data, int mpi_rank);
+int _starpu_mpi_cache_received_data_set(starpu_data_handle_t data);
+int _starpu_mpi_cache_received_data_get(starpu_data_handle_t data);
 void _starpu_mpi_cache_received_data_clear(starpu_data_handle_t data);
 
 /*
  * If the data is already available in the cache, return a pointer to the data
  * If the data is NOT available in the cache, add it to the cache and return NULL
  */
-void *_starpu_mpi_cache_sent_data_set(starpu_data_handle_t data, int dest);
-void *_starpu_mpi_cache_sent_data_get(starpu_data_handle_t data, int dest);
-void _starpu_mpi_cache_sent_data_clear(MPI_Comm comm, starpu_data_handle_t data);
+int _starpu_mpi_cache_sent_data_set(starpu_data_handle_t data, int dest);
+int _starpu_mpi_cache_sent_data_get(starpu_data_handle_t data, int dest);
+void _starpu_mpi_cache_sent_data_clear(starpu_data_handle_t data);
 
-void _starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle);
+void _starpu_mpi_cache_flush(starpu_data_handle_t data_handle);
 
 #ifdef __cplusplus
 }

+ 1 - 11
mpi/src/starpu_mpi_cache_stats.c

@@ -19,12 +19,9 @@
 #include <stdio.h>
 #include <starpu_mpi_private.h>
 
-/* measure the amount of data transfers between each pair of MPI nodes */
-static size_t *comm_cache_amount;
-static int world_size;
 static int stats_enabled=0;
 
-void _starpu_mpi_cache_stats_init(MPI_Comm comm)
+void _starpu_mpi_cache_stats_init()
 {
 	stats_enabled = starpu_get_env_number("STARPU_MPI_CACHE_STATS");
 	if (stats_enabled == -1)
@@ -35,16 +32,11 @@ void _starpu_mpi_cache_stats_init(MPI_Comm comm)
 
 	_STARPU_DISP("Warning: StarPU is executed with STARPU_MPI_CACHE_STATS=1, which slows down a bit\n");
 
-	starpu_mpi_comm_size(comm, &world_size);
-	_STARPU_MPI_DEBUG(1, "allocating for %d nodes\n", world_size);
-
-	_STARPU_MPI_CALLOC(comm_cache_amount, world_size, sizeof(size_t));
 }
 
 void _starpu_mpi_cache_stats_shutdown()
 {
 	if (stats_enabled == 0) return;
-	free(comm_cache_amount);
 }
 
 void _starpu_mpi_cache_stats_update(unsigned dst, starpu_data_handle_t data_handle, int count)
@@ -63,7 +55,5 @@ void _starpu_mpi_cache_stats_update(unsigned dst, starpu_data_handle_t data_hand
 	{
 		_STARPU_MPI_MSG("[communication cache] - %10ld from %d\n", (long)size, dst);
 	}
-
-	comm_cache_amount[dst] += count * size;
 }
 

+ 2 - 2
mpi/src/starpu_mpi_cache_stats.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2014, 2015, 2016  CNRS
+ * Copyright (C) 2014, 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -25,7 +25,7 @@
 extern "C" {
 #endif
 
-void _starpu_mpi_cache_stats_init(MPI_Comm comm);
+void _starpu_mpi_cache_stats_init();
 void _starpu_mpi_cache_stats_shutdown();
 
 void _starpu_mpi_cache_stats_update(unsigned dst, starpu_data_handle_t data_handle, int count);

+ 10 - 0
mpi/src/starpu_mpi_private.h

@@ -186,6 +186,16 @@ struct _starpu_mpi_node_tag
 	int data_tag;
 };
 
+struct _starpu_mpi_data
+{
+	int magic;
+	struct _starpu_mpi_node_tag node_tag;
+	starpu_pthread_mutex_t *cache_sent_mutex;
+	int *cache_sent;
+	starpu_pthread_mutex_t cache_received_mutex;
+	int cache_received;
+};
+
 LIST_TYPE(_starpu_mpi_req,
 	/* description of the data at StarPU level */
 	starpu_data_handle_t data_handle;

+ 2 - 2
mpi/src/starpu_mpi_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016  CNRS
+ * Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  * Copyright (C) 2011-2015  Université de Bordeaux
  * Copyright (C) 2014 INRIA
  *
@@ -102,7 +102,7 @@ int _starpu_mpi_tag_data_release(starpu_data_handle_t handle)
 		struct handle_tag_entry *tag_entry;
 
 		_starpu_spin_lock(&registered_tag_handles_lock);
-		HASH_FIND_INT(registered_tag_handles, &(((struct _starpu_mpi_node_tag *)(handle->mpi_data))->data_tag), tag_entry);
+		HASH_FIND_INT(registered_tag_handles, &(((struct _starpu_mpi_data *)(handle->mpi_data))->node_tag.data_tag), tag_entry);
 		STARPU_ASSERT_MSG((tag_entry != NULL),"Data handle %p with tag %d isn't in the hashmap !",handle,tag);
 
 		HASH_DEL(registered_tag_handles, tag_entry);

+ 7 - 7
mpi/src/starpu_mpi_task_insert.c

@@ -110,8 +110,8 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
 		if (do_execute && mpi_rank != me)
 		{
 			/* The node is going to execute the codelet, but it does not own the data, it needs to receive the data from the owner node */
-			void *already_received = _starpu_mpi_cache_received_data_set(data, mpi_rank);
-			if (already_received == NULL)
+			int already_received = _starpu_mpi_cache_received_data_set(data);
+			if (already_received == 0)
 			{
 				_STARPU_MPI_DEBUG(1, "Receiving data %p from %d\n", data, mpi_rank);
 				starpu_mpi_irecv_detached(data, mpi_rank, data_tag, comm, NULL, NULL);
@@ -122,8 +122,8 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
 		if (!do_execute && mpi_rank == me)
 		{
 			/* The node owns the data, but another node is going to execute the codelet, the node needs to send the data to the executee node. */
-			void *already_sent = _starpu_mpi_cache_sent_data_set(data, xrank);
-			if (already_sent == NULL)
+			int already_sent = _starpu_mpi_cache_sent_data_set(data, xrank);
+			if (already_sent == 0)
 			{
 				_STARPU_MPI_DEBUG(1, "Sending data %p to %d\n", data, xrank);
 				_SEND_DATA(data, mode, xrank, data_tag, comm, NULL, NULL);
@@ -165,14 +165,14 @@ void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum s
 }
 
 static
-void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int do_execute, MPI_Comm comm)
+void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int do_execute)
 {
 	if (_starpu_cache_enabled)
 	{
 		if (mode & STARPU_W || mode & STARPU_REDUX)
 		{
 			/* The data has been modified, it MUST be removed from the cache */
-			_starpu_mpi_cache_sent_data_clear(comm, data);
+			_starpu_mpi_cache_sent_data_clear(data);
 			_starpu_mpi_cache_received_data_clear(data);
 		}
 	}
@@ -503,7 +503,7 @@ int _starpu_mpi_task_postbuild_v(MPI_Comm comm, int xrank, int do_execute, struc
 	for(i=0 ; i<nb_data ; i++)
 	{
 		_starpu_mpi_exchange_data_after_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, comm);
-		_starpu_mpi_clear_data_after_execution(descrs[i].handle, descrs[i].mode, me, do_execute, comm);
+		_starpu_mpi_clear_data_after_execution(descrs[i].handle, descrs[i].mode, me, do_execute);
 	}
 
 	free(descrs);

+ 5 - 5
mpi/tests/cache.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2015, 2016  CNRS
+ * Copyright (C) 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -60,23 +60,23 @@ struct starpu_codelet mycodelet_rw =
 
 void test(struct starpu_codelet *codelet, enum starpu_data_access_mode mode, starpu_data_handle_t data, int rank, int in_cache)
 {
-	void *ptr;
+	int cache;
 	int ret;
 
 	ret = starpu_mpi_task_insert(MPI_COMM_WORLD, codelet, mode, data, STARPU_EXECUTE_ON_NODE, 1, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_task_insert");
 
-	ptr = _starpu_mpi_cache_received_data_get(data, 0);
+	cache = _starpu_mpi_cache_received_data_get(data);
 
 	if (rank == 1)
 	{
 	     if (in_cache)
 	     {
-		     STARPU_ASSERT_MSG(ptr != NULL, "Data should be in cache\n");
+		     STARPU_ASSERT_MSG(cache == 1, "Data should be in cache\n");
 	     }
 	     else
 	     {
-		     STARPU_ASSERT_MSG(ptr == NULL, "Data should NOT be in cache\n");
+		     STARPU_ASSERT_MSG(cache == 0, "Data should NOT be in cache\n");
 	     }
 	}
 }

+ 8 - 8
mpi/tests/cache_disable.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2015, 2016  CNRS
+ * Copyright (C) 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -48,7 +48,7 @@ int main(int argc, char **argv)
 	int ret;
 	unsigned *val;
 	starpu_data_handle_t data;
-	void *ptr = NULL;
+	int in_cache;
 	int cache;
 
 	ret = starpu_init(NULL);
@@ -73,28 +73,28 @@ int main(int argc, char **argv)
 	ret = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet_r, STARPU_R, data, STARPU_EXECUTE_ON_NODE, 1, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_task_insert");
 
-	ptr = _starpu_mpi_cache_received_data_get(data, 0);
+	in_cache = _starpu_mpi_cache_received_data_get(data);
 	if (rank == 1)
 	{
-		STARPU_ASSERT_MSG(ptr != NULL, "Data should be in cache\n");
+		STARPU_ASSERT_MSG(in_cache == 1, "Data should be in cache\n");
 	}
 
 	// We clean the cache
 	starpu_mpi_cache_set(0);
 
 	// We check the data is no longer in the cache
-	ptr = _starpu_mpi_cache_received_data_get(data, 0);
+	in_cache = _starpu_mpi_cache_received_data_get(data);
 	if (rank == 1)
 	{
-		STARPU_ASSERT_MSG(ptr == NULL, "Data should NOT be in cache\n");
+		STARPU_ASSERT_MSG(in_cache == 0, "Data should NOT be in cache\n");
 	}
 
 	ret = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet_r, STARPU_R, data, STARPU_EXECUTE_ON_NODE, 1, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_task_insert");
-	ptr = _starpu_mpi_cache_received_data_get(data, 0);
+	in_cache = _starpu_mpi_cache_received_data_get(data);
 	if (rank == 1)
 	{
-		STARPU_ASSERT_MSG(ptr == NULL, "Data should NOT be in cache\n");
+		STARPU_ASSERT_MSG(in_cache == 0, "Data should NOT be in cache\n");
 	}
 
 	FPRINTF(stderr, "Waiting ...\n");

+ 8 - 3
mpi/tests/insert_task_node_choice.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011, 2012, 2013, 2014, 2015  CNRS
+ * Copyright (C) 2011, 2012, 2013, 2014, 2015, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -54,7 +54,7 @@ int main(int argc, char **argv)
 {
 	int ret, rank, size, err, node;
 	int x0=32;
-	long x1=23;
+	long long x1=23;
 	starpu_data_handle_t data_handlesx0;
 	starpu_data_handle_t data_handlesx1;
 
@@ -96,7 +96,12 @@ int main(int argc, char **argv)
 				     0);
 	assert(err == 0);
 
-	node = 1; // Node 1 has a long which is bigger than a int
+	// Node 1 has a long long data which has a bigger size than a
+	// int, so it is going to be selected by the node selection
+	// policy to execute the codelet
+	err = starpu_mpi_node_selection_set_current_policy(STARPU_MPI_NODE_SELECTION_MOST_R_DATA);
+	assert(err == 0);
+	node = 1;
 	err = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet,
 				     STARPU_VALUE, &node, sizeof(node),
 				     STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1,

+ 2 - 0
mpi/tests/policy_register_toomany.c

@@ -32,6 +32,8 @@ int main(int argc, char **argv)
 	int ret;
 	int i;
 
+	disable_coredump();
+
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	ret = starpu_mpi_init(&argc, &argv, 1);

+ 2 - 0
mpi/tests/policy_unregister.c

@@ -21,6 +21,8 @@ int main(int argc, char **argv)
 {
 	int ret;
 
+	disable_coredump();
+
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	ret = starpu_mpi_init(&argc, &argv, 1);

+ 3 - 0
mpi/tests/starpu_redefine.c

@@ -15,12 +15,15 @@
  */
 
 #include <starpu_mpi.h>
+#include "helper.h"
 
 int main(int argc, char **argv)
 {
 	int ret;
 	starpu_data_handle_t handle;
 
+	disable_coredump();
+
 	MPI_Init(&argc, &argv);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

+ 3 - 1
socl/src/init.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2012, 2016 University of Bordeaux
- * Copyright (C) 2012,2014,2016 CNRS
+ * Copyright (C) 2012,2014,2016,2017 CNRS
  * Copyright (C) 2012 Vincent Danjean <Vincent.Danjean@ens-lyon.org>
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -57,6 +57,8 @@ int socl_init_starpu(void) {
   STARPU_PTHREAD_MUTEX_UNLOCK(&_socl_mutex);
 
   starpu_conf_init(&conf);
+  unsetenv("STARPU_NCPU");
+  unsetenv("STARPU_NCUDA");
   conf.ncuda = 0;
   conf.ncpus = 0;
 

+ 2 - 0
src/Makefile.am

@@ -81,6 +81,7 @@ noinst_HEADERS = 						\
 	core/perfmodel/multiple_regression.h			\
 	core/jobs.h						\
 	core/task.h						\
+	core/drivers.h						\
 	core/workers.h						\
 	core/topology.h						\
 	core/debug.h						\
@@ -171,6 +172,7 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 		\
 	core/task.c						\
 	core/task_bundle.c					\
 	core/tree.c						\
+	core/drivers.c						\
 	core/workers.c						\
 	core/combined_workers.c					\
 	core/topology.c						\

+ 137 - 139
src/common/fxt.h

@@ -81,10 +81,10 @@
 #define _STARPU_FUT_USER_DEFINED_START	0x5120
 #define _STARPU_FUT_USER_DEFINED_END	0x5121
 
-#define	_STARPU_FUT_NEW_MEM_NODE		0x5122
+#define	_STARPU_FUT_NEW_MEM_NODE	0x5122
 
 #define	_STARPU_FUT_START_CALLBACK	0x5123
-#define	_STARPU_FUT_END_CALLBACK		0x5124
+#define	_STARPU_FUT_END_CALLBACK	0x5124
 
 #define	_STARPU_FUT_TASK_DONE		0x5125
 #define	_STARPU_FUT_TAG_DONE		0x5126
@@ -115,43 +115,43 @@
 
 #define _STARPU_FUT_TASK_WAIT_FOR_ALL	0x513b
 
-#define _STARPU_FUT_EVENT	0x513c
+#define _STARPU_FUT_EVENT		0x513c
 #define _STARPU_FUT_THREAD_EVENT	0x513d
 
 #define	_STARPU_FUT_CODELET_DETAILS	0x513e
 #define	_STARPU_FUT_CODELET_DATA	0x513f
 
-#define _STARPU_FUT_LOCKING_MUTEX	0x5140	
-#define _STARPU_FUT_MUTEX_LOCKED	0x5141	
+#define _STARPU_FUT_LOCKING_MUTEX	0x5140
+#define _STARPU_FUT_MUTEX_LOCKED	0x5141
 
-#define _STARPU_FUT_UNLOCKING_MUTEX		0x5142	
-#define _STARPU_FUT_MUTEX_UNLOCKED		0x5143	
+#define _STARPU_FUT_UNLOCKING_MUTEX	0x5142
+#define _STARPU_FUT_MUTEX_UNLOCKED	0x5143
 
-#define _STARPU_FUT_TRYLOCK_MUTEX		0x5144	
+#define _STARPU_FUT_TRYLOCK_MUTEX	0x5144
 
-#define _STARPU_FUT_RDLOCKING_RWLOCK	0x5145	
-#define _STARPU_FUT_RWLOCK_RDLOCKED		0x5146	
+#define _STARPU_FUT_RDLOCKING_RWLOCK	0x5145
+#define _STARPU_FUT_RWLOCK_RDLOCKED	0x5146
 
-#define _STARPU_FUT_WRLOCKING_RWLOCK	0x5147	
-#define _STARPU_FUT_RWLOCK_WRLOCKED		0x5148	
+#define _STARPU_FUT_WRLOCKING_RWLOCK	0x5147
+#define _STARPU_FUT_RWLOCK_WRLOCKED	0x5148
 
-#define _STARPU_FUT_UNLOCKING_RWLOCK	0x5149	
-#define _STARPU_FUT_RWLOCK_UNLOCKED		0x514a	
+#define _STARPU_FUT_UNLOCKING_RWLOCK	0x5149
+#define _STARPU_FUT_RWLOCK_UNLOCKED	0x514a
 
-#define _STARPU_FUT_LOCKING_SPINLOCK	0x514b	
-#define _STARPU_FUT_SPINLOCK_LOCKED		0x514c	
+#define _STARPU_FUT_LOCKING_SPINLOCK	0x514b
+#define _STARPU_FUT_SPINLOCK_LOCKED	0x514c
 
-#define _STARPU_FUT_UNLOCKING_SPINLOCK		0x514d	
-#define _STARPU_FUT_SPINLOCK_UNLOCKED		0x514e	
+#define _STARPU_FUT_UNLOCKING_SPINLOCK	0x514d
+#define _STARPU_FUT_SPINLOCK_UNLOCKED	0x514e
 
-#define _STARPU_FUT_TRYLOCK_SPINLOCK		0x514f	
+#define _STARPU_FUT_TRYLOCK_SPINLOCK	0x514f
 
-#define _STARPU_FUT_COND_WAIT_BEGIN		0x5150
-#define _STARPU_FUT_COND_WAIT_END		0x5151
+#define _STARPU_FUT_COND_WAIT_BEGIN	0x5150
+#define _STARPU_FUT_COND_WAIT_END	0x5151
 
-#define _STARPU_FUT_MEMORY_FULL			0x5152
+#define _STARPU_FUT_MEMORY_FULL		0x5152
 
-#define _STARPU_FUT_DATA_LOAD 0x5153
+#define _STARPU_FUT_DATA_LOAD 		0x5153
 
 #define _STARPU_FUT_START_UNPARTITION_ON_TID 0x5154
 #define _STARPU_FUT_END_UNPARTITION_ON_TID 0x5155
@@ -166,7 +166,7 @@
 #define _STARPU_FUT_SCHED_COMPONENT_POP_PRIO 	0x515b
 
 #define	_STARPU_FUT_START_WRITEBACK_ASYNC	0x515c
-#define	_STARPU_FUT_END_WRITEBACK_ASYNC	0x515d
+#define	_STARPU_FUT_END_WRITEBACK_ASYNC		0x515d
 
 #define	_STARPU_FUT_HYPERVISOR_BEGIN    0x5160
 #define	_STARPU_FUT_HYPERVISOR_END	0x5161
@@ -194,7 +194,7 @@
 #define _STARPU_FUT_TASK_BUILD_END	0x5171
 
 #define _STARPU_FUT_TASK_MPI_DECODE_START	0x5172
-#define _STARPU_FUT_TASK_MPI_DECODE_END	0x5173
+#define _STARPU_FUT_TASK_MPI_DECODE_END		0x5173
 
 #define _STARPU_FUT_TASK_MPI_PRE_START	0x5174
 #define _STARPU_FUT_TASK_MPI_PRE_END	0x5175
@@ -208,8 +208,8 @@
 #define _STARPU_FUT_TASK_WAIT_FOR_ALL_START	0x517a
 #define _STARPU_FUT_TASK_WAIT_FOR_ALL_END	0x517b
 
-#define _STARPU_FUT_HANDLE_DATA_REGISTER 0x517c
-#define _STARPU_FUT_DATA_INVALIDATE 0x517d
+#define _STARPU_FUT_HANDLE_DATA_REGISTER	0x517c
+#define _STARPU_FUT_DATA_INVALIDATE		0x517d
 
 #define _STARPU_FUT_START_FETCH_INPUT	0x517e
 #define _STARPU_FUT_END_FETCH_INPUT	0x517f
@@ -688,53 +688,51 @@ do {										\
 
 #define _STARPU_TRACE_START_ALLOC(memnode, size)		\
 	FUT_DO_PROBE3(_STARPU_FUT_START_ALLOC, memnode, _starpu_gettid(), size);
-	
+
 #define _STARPU_TRACE_END_ALLOC(memnode)		\
 	FUT_DO_PROBE2(_STARPU_FUT_END_ALLOC, memnode, _starpu_gettid());
 
 #define _STARPU_TRACE_START_ALLOC_REUSE(memnode, size)		\
 	FUT_DO_PROBE3(_STARPU_FUT_START_ALLOC_REUSE, memnode, _starpu_gettid(), size);
-	
+
 #define _STARPU_TRACE_END_ALLOC_REUSE(memnode)		\
 	FUT_DO_PROBE2(_STARPU_FUT_END_ALLOC_REUSE, memnode, _starpu_gettid());
-	
+
 #define _STARPU_TRACE_START_FREE(memnode, size)		\
 	FUT_DO_PROBE3(_STARPU_FUT_START_FREE, memnode, _starpu_gettid(), size);
-	
+
 #define _STARPU_TRACE_END_FREE(memnode)		\
 	FUT_DO_PROBE2(_STARPU_FUT_END_FREE, memnode, _starpu_gettid());
 
 #define _STARPU_TRACE_START_WRITEBACK(memnode)		\
 	FUT_DO_PROBE2(_STARPU_FUT_START_WRITEBACK, memnode, _starpu_gettid());
-	
+
 #define _STARPU_TRACE_END_WRITEBACK(memnode)		\
 	FUT_DO_PROBE2(_STARPU_FUT_END_WRITEBACK, memnode, _starpu_gettid());
 
 #define _STARPU_TRACE_USED_MEM(memnode,used)		\
 	FUT_DO_PROBE3(_STARPU_FUT_USED_MEM, memnode, used, _starpu_gettid());
-	
+
 #define _STARPU_TRACE_START_MEMRECLAIM(memnode,is_prefetch)		\
 	FUT_DO_PROBE3(_STARPU_FUT_START_MEMRECLAIM, memnode, is_prefetch, _starpu_gettid());
-	
+
 #define _STARPU_TRACE_END_MEMRECLAIM(memnode, is_prefetch)		\
 	FUT_DO_PROBE3(_STARPU_FUT_END_MEMRECLAIM, memnode, is_prefetch, _starpu_gettid());
-	
+
 #define _STARPU_TRACE_START_WRITEBACK_ASYNC(memnode)		\
 	FUT_DO_PROBE2(_STARPU_FUT_START_WRITEBACK_ASYNC, memnode, _starpu_gettid());
-	
+
 #define _STARPU_TRACE_END_WRITEBACK_ASYNC(memnode)		\
 	FUT_DO_PROBE2(_STARPU_FUT_END_WRITEBACK_ASYNC, memnode, _starpu_gettid());
-	
+
 /* We skip these events becasue they are called so often that they cause FxT to
  * fail and make the overall trace unreadable anyway. */
 #define _STARPU_TRACE_START_PROGRESS(memnode)		\
-	do {} while (0)
-//	FUT_DO_PROBE2(_STARPU_FUT_START_PROGRESS, memnode, _starpu_gettid());
+	FUT_DO_PROBE2(_STARPU_FUT_START_PROGRESS_ON_TID, memnode, _starpu_gettid());
 
 #define _STARPU_TRACE_END_PROGRESS(memnode)		\
-	do {} while (0)
-	//FUT_DO_PROBE2(_STARPU_FUT_END_PROGRESS, memnode, _starpu_gettid());
-	
+	FUT_DO_PROBE2(_STARPU_FUT_END_PROGRESS_ON_TID, memnode, _starpu_gettid());
+
 #define _STARPU_TRACE_USER_EVENT(code)			\
 	FUT_DO_PROBE2(_STARPU_FUT_USER_EVENT, code, _starpu_gettid());
 
@@ -756,7 +754,7 @@ do {										\
 #define _STARPU_TRACE_HYPERVISOR_END() \
 	FUT_DO_PROBE1(_STARPU_FUT_HYPERVISOR_END, _starpu_gettid());
 
-#ifdef STARPU_FXT_LOCK_TRACES 
+#ifdef STARPU_FXT_LOCK_TRACES
 
 #define _STARPU_TRACE_LOCKING_MUTEX()	do { \
 	const char *file; \
@@ -894,24 +892,24 @@ do {										\
 
 #define _STARPU_TRACE_LOCKING_MUTEX()			do {} while(0)
 #define _STARPU_TRACE_MUTEX_LOCKED()			do {} while(0)
-#define _STARPU_TRACE_UNLOCKING_MUTEX()		do {} while(0)
-#define _STARPU_TRACE_MUTEX_UNLOCKED()		do {} while(0)
+#define _STARPU_TRACE_UNLOCKING_MUTEX()			do {} while(0)
+#define _STARPU_TRACE_MUTEX_UNLOCKED()			do {} while(0)
 #define _STARPU_TRACE_TRYLOCK_MUTEX()			do {} while(0)
 #define _STARPU_TRACE_RDLOCKING_RWLOCK()		do {} while(0)
-#define _STARPU_TRACE_RWLOCK_RDLOCKED()		do {} while(0)
+#define _STARPU_TRACE_RWLOCK_RDLOCKED()			do {} while(0)
 #define _STARPU_TRACE_WRLOCKING_RWLOCK()		do {} while(0)
-#define _STARPU_TRACE_RWLOCK_WRLOCKED()		do {} while(0)
+#define _STARPU_TRACE_RWLOCK_WRLOCKED()			do {} while(0)
 #define _STARPU_TRACE_UNLOCKING_RWLOCK()		do {} while(0)
-#define _STARPU_TRACE_RWLOCK_UNLOCKED()		do {} while(0)
-#define _STARPU_TRACE_LOCKING_SPINLOCK(file, line)		do {} while(0)
-#define _STARPU_TRACE_SPINLOCK_LOCKED(file, line)		do {} while(0)
-#define _STARPU_TRACE_UNLOCKING_SPINLOCK(file, line)	do {} while(0)
-#define _STARPU_TRACE_SPINLOCK_UNLOCKED(file, line)		do {} while(0)
-#define _STARPU_TRACE_TRYLOCK_SPINLOCK(file, line)		do {} while(0)
-#define _STARPU_TRACE_COND_WAIT_BEGIN()		do {} while(0)
+#define _STARPU_TRACE_RWLOCK_UNLOCKED()			do {} while(0)
+#define _STARPU_TRACE_LOCKING_SPINLOCK(file, line)	do {(void) file; (void)line;} while(0)
+#define _STARPU_TRACE_SPINLOCK_LOCKED(file, line)	do {(void) file; (void)line;} while(0)
+#define _STARPU_TRACE_UNLOCKING_SPINLOCK(file, line)	do {(void) file; (void)line;} while(0)
+#define _STARPU_TRACE_SPINLOCK_UNLOCKED(file, line)	do {(void) file; (void)line;} while(0)
+#define _STARPU_TRACE_TRYLOCK_SPINLOCK(file, line)	do {(void) file; (void)line;} while(0)
+#define _STARPU_TRACE_COND_WAIT_BEGIN()			do {} while(0)
 #define _STARPU_TRACE_COND_WAIT_END()			do {} while(0)
 #define _STARPU_TRACE_BARRIER_WAIT_BEGIN()		do {} while(0)
-#define _STARPU_TRACE_BARRIER_WAIT_END()			do {} while(0)
+#define _STARPU_TRACE_BARRIER_WAIT_END()		do {} while(0)
 
 #endif // STARPU_FXT_LOCK_TRACES
 
@@ -923,7 +921,7 @@ do {										\
 
 #define _STARPU_TRACE_START_UNPARTITION(handle, memnode)		\
 	FUT_DO_PROBE3(_STARPU_FUT_START_UNPARTITION_ON_TID, memnode, _starpu_gettid(), handle);
-	
+
 #define _STARPU_TRACE_END_UNPARTITION(handle, memnode)		\
 	FUT_DO_PROBE3(_STARPU_FUT_END_UNPARTITION_ON_TID, memnode, _starpu_gettid(), handle);
 
@@ -952,49 +950,49 @@ do {										\
 #define _STARPU_TRACE_DATA_INVALIDATE(handle, node)		\
 	FUT_DO_PROBE2(_STARPU_FUT_DATA_INVALIDATE, handle, node)
 #else
-#define _STARPU_TRACE_DATA_INVALIDATE(handle, node)	do {} while (0)
+#define _STARPU_TRACE_DATA_INVALIDATE(handle, node)	do {(void) handle; (void) node;} while (0)
 #endif
 
 #else // !STARPU_USE_FXT
 
 /* Dummy macros in case FxT is disabled */
-#define _STARPU_TRACE_NEW_MEM_NODE(nodeid)	do {} while(0)
-#define _STARPU_TRACE_WORKER_INIT_START(a,b,c,d,e,f)	do {} while(0)
-#define _STARPU_TRACE_WORKER_INIT_END(workerid)	do {} while(0)
-#define _STARPU_TRACE_START_CODELET_BODY(job, nimpl, perf_arch, workerid)	do {} while(0)
-#define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, perf_arch, workerid)	do {} while(0)
-#define _STARPU_TRACE_START_EXECUTING()	do {} while(0)
-#define _STARPU_TRACE_END_EXECUTING()	do {} while(0)
-#define _STARPU_TRACE_START_CALLBACK(job)	do {} while(0)
-#define _STARPU_TRACE_END_CALLBACK(job)		do {} while(0)
-#define _STARPU_TRACE_JOB_PUSH(task, prio)	do {} while(0)
-#define _STARPU_TRACE_JOB_POP(task, prio)	do {} while(0)
-#define _STARPU_TRACE_UPDATE_TASK_CNT(counter)	do {} while(0)
-#define _STARPU_TRACE_START_FETCH_INPUT(job)	do {} while(0)
-#define _STARPU_TRACE_END_FETCH_INPUT(job)	do {} while(0)
-#define _STARPU_TRACE_START_PUSH_OUTPUT(job)	do {} while(0)
-#define _STARPU_TRACE_END_PUSH_OUTPUT(job)	do {} while(0)
-#define _STARPU_TRACE_TAG(tag, job)	do {} while(0)
-#define _STARPU_TRACE_TAG_DEPS(a, b)	do {} while(0)
-#define _STARPU_TRACE_TASK_DEPS(a, b)		do {} while(0)
-#define _STARPU_TRACE_GHOST_TASK_DEPS(a, b)	do {} while(0)
-#define _STARPU_TRACE_TASK_DONE(a)		do {} while(0)
-#define _STARPU_TRACE_TAG_DONE(a)		do {} while(0)
-#define _STARPU_TRACE_DATA_COPY(a, b, c)		do {} while(0)
-#define _STARPU_TRACE_START_DRIVER_COPY(a,b,c,d,e,f)	do {} while(0)
-#define _STARPU_TRACE_END_DRIVER_COPY(a,b,c,d,e)	do {} while(0)
-#define _STARPU_TRACE_START_DRIVER_COPY_ASYNC(a,b)	do {} while(0)
-#define _STARPU_TRACE_END_DRIVER_COPY_ASYNC(a,b)	do {} while(0)
-#define _STARPU_TRACE_WORK_STEALING(a, b)	do {} while(0)
-#define _STARPU_TRACE_WORKER_DEINIT_START	do {} while(0)
-#define _STARPU_TRACE_WORKER_DEINIT_END(a)	do {} while(0)
+#define _STARPU_TRACE_NEW_MEM_NODE(nodeid)		do {(void)(nodeid);} while(0)
+#define _STARPU_TRACE_WORKER_INIT_START(a,b,c,d,e,f)	do {(void)(a); (void)(b); (void)(c); (void)(d); (void)(e); (void)(f);} while(0)
+#define _STARPU_TRACE_WORKER_INIT_END(workerid)		do {(void)(workerid);} while(0)
+#define _STARPU_TRACE_START_CODELET_BODY(job, nimpl, perf_arch, workerid) 	do {(void)(job); (void)(nimpl); (void)(perf_arch); (void)(workerid);} while(0)
+#define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, perf_arch, workerid)		do {(void)(job); (void)(nimpl); (void)(perf_arch); (void)(workerid);} while(0)
+#define _STARPU_TRACE_START_EXECUTING()		do {} while(0)
+#define _STARPU_TRACE_END_EXECUTING()		do {} while(0)
+#define _STARPU_TRACE_START_CALLBACK(job)	do {(void)(job);} while(0)
+#define _STARPU_TRACE_END_CALLBACK(job)		do {(void)(job);} while(0)
+#define _STARPU_TRACE_JOB_PUSH(task, prio)	do {(void)(task); (void)(prio);} while(0)
+#define _STARPU_TRACE_JOB_POP(task, prio)	do {(void)(task); (void)(prio);} while(0)
+#define _STARPU_TRACE_UPDATE_TASK_CNT(counter)	do {(void)(counter);} while(0)
+#define _STARPU_TRACE_START_FETCH_INPUT(job)	do {(void)(job);} while(0)
+#define _STARPU_TRACE_END_FETCH_INPUT(job)	do {(void)(job);} while(0)
+#define _STARPU_TRACE_START_PUSH_OUTPUT(job)	do {(void)(job);} while(0)
+#define _STARPU_TRACE_END_PUSH_OUTPUT(job)	do {(void)(job);} while(0)
+#define _STARPU_TRACE_TAG(tag, job)		do {(void)(tag); (void)(job);} while(0)
+#define _STARPU_TRACE_TAG_DEPS(a, b)		do {(void)(a); (void)(b);} while(0)
+#define _STARPU_TRACE_TASK_DEPS(a, b)		do {(void)(a); (void)(b);} while(0)
+#define _STARPU_TRACE_GHOST_TASK_DEPS(a, b)	do {(void)(a); (void)(b);} while(0)
+#define _STARPU_TRACE_TASK_DONE(a)		do {(void)(a);} while(0)
+#define _STARPU_TRACE_TAG_DONE(a)		do {(void)(a);} while(0)
+#define _STARPU_TRACE_DATA_COPY(a, b, c)		do {(void)(a); (void)(b); (void)(c);} while(0)
+#define _STARPU_TRACE_START_DRIVER_COPY(a,b,c,d,e,f)	do {(void)(a); (void)(b); (void)(c); (void)(d); (void)(e); (void)(f);} while(0)
+#define _STARPU_TRACE_END_DRIVER_COPY(a,b,c,d,e)	do {(void)(a); (void)(b); (void)(c); (void)(d); (void)(e);} while(0)
+#define _STARPU_TRACE_START_DRIVER_COPY_ASYNC(a,b)	do {(void)(a); (void)(b);} while(0)
+#define _STARPU_TRACE_END_DRIVER_COPY_ASYNC(a,b)	do {(void)(a); (void)(b);} while(0)
+#define _STARPU_TRACE_WORK_STEALING(a, b)		do {(void)(a); (void)(b);} while(0)
+#define _STARPU_TRACE_WORKER_DEINIT_START		do {} while(0)
+#define _STARPU_TRACE_WORKER_DEINIT_END(a)		do {(void)(a);} while(0)
 #define _STARPU_TRACE_WORKER_SCHEDULING_START		do {} while(0)
 #define _STARPU_TRACE_WORKER_SCHEDULING_END		do {} while(0)
 #define _STARPU_TRACE_WORKER_SCHEDULING_PUSH		do {} while(0)
 #define _STARPU_TRACE_WORKER_SCHEDULING_POP		do {} while(0)
 #define _STARPU_TRACE_WORKER_SLEEP_START		do {} while(0)
-#define _STARPU_TRACE_WORKER_SLEEP_END		do {} while(0)
-#define _STARPU_TRACE_TASK_SUBMIT(job)		do {} while(0)
+#define _STARPU_TRACE_WORKER_SLEEP_END			do {} while(0)
+#define _STARPU_TRACE_TASK_SUBMIT(job)			do {(void)(job);} while(0)
 #define _STARPU_TRACE_TASK_SUBMIT_START()		do {} while(0)
 #define _STARPU_TRACE_TASK_SUBMIT_END()			do {} while(0)
 #define _STARPU_TRACE_TASK_BUILD_START()		do {} while(0)
@@ -1005,68 +1003,68 @@ do {										\
 #define _STARPU_TRACE_TASK_MPI_PRE_END()		do {} while(0)
 #define _STARPU_TRACE_TASK_MPI_POST_START()		do {} while(0)
 #define _STARPU_TRACE_TASK_MPI_POST_END()		do {} while(0)
-#define _STARPU_TRACE_TASK_WAIT_START(job)		do {} while(0)
+#define _STARPU_TRACE_TASK_WAIT_START(job)		do {(void)(job);} while(0)
 #define _STARPU_TRACE_TASK_WAIT_END()			do {} while(0)
 #define _STARPU_TRACE_TASK_WAIT_FOR_ALL_START()		do {} while(0)
 #define _STARPU_TRACE_TASK_WAIT_FOR_ALL_END()		do {} while(0)
-#define _STARPU_TRACE_USER_DEFINED_START		do {} while(0)
-#define _STARPU_TRACE_USER_DEFINED_END		do {} while(0)
-#define _STARPU_TRACE_START_ALLOC(memnode, size)	do {} while(0)
-#define _STARPU_TRACE_END_ALLOC(memnode)		do {} while(0)
-#define _STARPU_TRACE_START_ALLOC_REUSE(a, size)	do {} while(0)
-#define _STARPU_TRACE_END_ALLOC_REUSE(a)		do {} while(0)
-#define _STARPU_TRACE_START_FREE(memnode, size)	do {} while(0)
-#define _STARPU_TRACE_END_FREE(memnode)		do {} while(0)
-#define _STARPU_TRACE_START_WRITEBACK(memnode)	do {} while(0)
-#define _STARPU_TRACE_END_WRITEBACK(memnode)		do {} while(0)
-#define _STARPU_TRACE_USED_MEM(memnode,used)		do {} while (0)
-#define _STARPU_TRACE_START_MEMRECLAIM(memnode,is_prefetch)	do {} while(0)
-#define _STARPU_TRACE_END_MEMRECLAIM(memnode,is_prefetch)	do {} while(0)
-#define _STARPU_TRACE_START_WRITEBACK_ASYNC(memnode)	do {} while(0)
-#define _STARPU_TRACE_END_WRITEBACK_ASYNC(memnode)	do {} while(0)
-#define _STARPU_TRACE_START_PROGRESS(memnode)	do {} while(0)
-#define _STARPU_TRACE_END_PROGRESS(memnode)	do {} while(0)
-#define _STARPU_TRACE_USER_EVENT(code)		do {} while(0)
-#define _STARPU_TRACE_SET_PROFILING(status)	do {} while(0)
-#define _STARPU_TRACE_TASK_WAIT_FOR_ALL		do {} while(0)
-#define _STARPU_TRACE_EVENT(S)		do {} while(0)
-#define _STARPU_TRACE_THREAD_EVENT(S)		do {} while(0)
+#define _STARPU_TRACE_USER_DEFINED_START()		do {} while(0)
+#define _STARPU_TRACE_USER_DEFINED_END()		do {} while(0)
+#define _STARPU_TRACE_START_ALLOC(memnode, size)	do {(void)(memnode); (void)(size);} while(0)
+#define _STARPU_TRACE_END_ALLOC(memnode)		do {(void)(memnode);} while(0)
+#define _STARPU_TRACE_START_ALLOC_REUSE(a, size)	do {(void)(a); (void)(size);} while(0)
+#define _STARPU_TRACE_END_ALLOC_REUSE(a)		do {(void)(a);} while(0)
+#define _STARPU_TRACE_START_FREE(memnode, size)		do {(void)(memnode); (void)(size);} while(0)
+#define _STARPU_TRACE_END_FREE(memnode)			do {(void)(memnode);} while(0)
+#define _STARPU_TRACE_START_WRITEBACK(memnode)		do {(void)(memnode);} while(0)
+#define _STARPU_TRACE_END_WRITEBACK(memnode)		do {(void)(memnode);} while(0)
+#define _STARPU_TRACE_USED_MEM(memnode,used)		do {(void)(memnode); (void)(used);} while (0)
+#define _STARPU_TRACE_START_MEMRECLAIM(memnode,is_prefetch)	do {(void)(memnode); (void)(is_prefetch);} while(0)
+#define _STARPU_TRACE_END_MEMRECLAIM(memnode,is_prefetch)	do {(void)(memnode); (void)(is_prefetch);} while(0)
+#define _STARPU_TRACE_START_WRITEBACK_ASYNC(memnode)	do {(void)(memnode);} while(0)
+#define _STARPU_TRACE_END_WRITEBACK_ASYNC(memnode)	do {(void)(memnode);} while(0)
+#define _STARPU_TRACE_START_PROGRESS(memnode)		do {(void)( memnode);} while(0)
+#define _STARPU_TRACE_END_PROGRESS(memnode)		do {(void)( memnode);} while(0)
+#define _STARPU_TRACE_USER_EVENT(code)			do {(void)(code);} while(0)
+#define _STARPU_TRACE_SET_PROFILING(status)		do {(void)(status);} while(0)
+#define _STARPU_TRACE_TASK_WAIT_FOR_ALL()		do {} while(0)
+#define _STARPU_TRACE_EVENT(S)				do {(void)(S);} while(0)
+#define _STARPU_TRACE_THREAD_EVENT(S)			do {(void)(S);} while(0)
 #define _STARPU_TRACE_LOCKING_MUTEX()			do {} while(0)
 #define _STARPU_TRACE_MUTEX_LOCKED()			do {} while(0)
-#define _STARPU_TRACE_UNLOCKING_MUTEX()		do {} while(0)
-#define _STARPU_TRACE_MUTEX_UNLOCKED()		do {} while(0)
+#define _STARPU_TRACE_UNLOCKING_MUTEX()			do {} while(0)
+#define _STARPU_TRACE_MUTEX_UNLOCKED()			do {} while(0)
 #define _STARPU_TRACE_TRYLOCK_MUTEX()			do {} while(0)
 #define _STARPU_TRACE_RDLOCKING_RWLOCK()		do {} while(0)
-#define _STARPU_TRACE_RWLOCK_RDLOCKED()		do {} while(0)
+#define _STARPU_TRACE_RWLOCK_RDLOCKED()			do {} while(0)
 #define _STARPU_TRACE_WRLOCKING_RWLOCK()		do {} while(0)
-#define _STARPU_TRACE_RWLOCK_WRLOCKED()		do {} while(0)
+#define _STARPU_TRACE_RWLOCK_WRLOCKED()			do {} while(0)
 #define _STARPU_TRACE_UNLOCKING_RWLOCK()		do {} while(0)
-#define _STARPU_TRACE_RWLOCK_UNLOCKED()		do {} while(0)
-#define _STARPU_TRACE_LOCKING_SPINLOCK(file, line)		do {} while(0)
-#define _STARPU_TRACE_SPINLOCK_LOCKED(file, line)		do {} while(0)
-#define _STARPU_TRACE_UNLOCKING_SPINLOCK(file, line)	do {} while(0)
-#define _STARPU_TRACE_SPINLOCK_UNLOCKED(file, line)		do {} while(0)
-#define _STARPU_TRACE_TRYLOCK_SPINLOCK(file, line)		do {} while(0)
-#define _STARPU_TRACE_COND_WAIT_BEGIN()		do {} while(0)
+#define _STARPU_TRACE_RWLOCK_UNLOCKED()			do {} while(0)
+#define _STARPU_TRACE_LOCKING_SPINLOCK(file, line)	do {(void)(file); (void)(line);} while(0)
+#define _STARPU_TRACE_SPINLOCK_LOCKED(file, line)	do {(void)(file); (void)(line);} while(0)
+#define _STARPU_TRACE_UNLOCKING_SPINLOCK(file, line)	do {(void)(file); (void)(line);} while(0)
+#define _STARPU_TRACE_SPINLOCK_UNLOCKED(file, line)	do {(void)(file); (void)(line);} while(0)
+#define _STARPU_TRACE_TRYLOCK_SPINLOCK(file, line)	do {(void)(file); (void)(line);} while(0)
+#define _STARPU_TRACE_COND_WAIT_BEGIN()			do {} while(0)
 #define _STARPU_TRACE_COND_WAIT_END()			do {} while(0)
 #define _STARPU_TRACE_BARRIER_WAIT_BEGIN()		do {} while(0)
-#define _STARPU_TRACE_BARRIER_WAIT_END()			do {} while(0)
-#define _STARPU_TRACE_MEMORY_FULL(size)				do {} while(0)
-#define _STARPU_TRACE_DATA_LOAD(workerid,size)		do {} while(0)
-#define _STARPU_TRACE_START_UNPARTITION(handle, memnode)	do {} while(0)
-#define _STARPU_TRACE_END_UNPARTITION(handle, memnode)		do {} while(0)
-#define _STARPU_TRACE_SCHED_COMPONENT_PUSH_PRIO(workerid, ntasks, exp_len)	do {} while(0)
-#define _STARPU_TRACE_SCHED_COMPONENT_POP_PRIO(workerid, ntasks, exp_len)	do {} while(0)
-#define _STARPU_TRACE_HYPERVISOR_BEGIN()        do {} while(0)
+#define _STARPU_TRACE_BARRIER_WAIT_END()		do {} while(0)
+#define _STARPU_TRACE_MEMORY_FULL(size)			do {(void)(size);} while(0)
+#define _STARPU_TRACE_DATA_LOAD(workerid,size)		do {(void)(workerid); (void)(size);} while(0)
+#define _STARPU_TRACE_START_UNPARTITION(handle, memnode) 	do {(void)(handle); (void)(memnode);} while(0)
+#define _STARPU_TRACE_END_UNPARTITION(handle, memnode)		do {(void)(handle); (void)(memnode);} while(0)
+#define _STARPU_TRACE_SCHED_COMPONENT_PUSH_PRIO(workerid, ntasks, exp_len)	do {(void)(workerid); (void)(ntasks); (void)(exp_len);} while(0)
+#define _STARPU_TRACE_SCHED_COMPONENT_POP_PRIO(workerid, ntasks, exp_len)	do {(void)(workerid); (void)(ntasks); (void)(exp_len);} while(0)
+#define _STARPU_TRACE_HYPERVISOR_BEGIN()        	do {} while(0)
 #define _STARPU_TRACE_HYPERVISOR_END()                  do {} while(0)
-#define _STARPU_TRACE_SCHED_COMPONENT_NEW(component)	do {} while (0)
-#define _STARPU_TRACE_SCHED_COMPONENT_CONNECT(parent, child)	do {} while (0)
-#define _STARPU_TRACE_SCHED_COMPONENT_PUSH(from, to, task)	do {} while (0)
-#define _STARPU_TRACE_SCHED_COMPONENT_PULL(from, to, task)	do {} while (0)
-#define _STARPU_TRACE_HANDLE_DATA_REGISTER(handle)	do {} while (0)
-#define _STARPU_TRACE_DATA_INVALIDATE(handle, node)	do {} while (0)
-#define _STARPU_TRACE_WORKER_START_FETCH_INPUT(job, id)	do {} while(0)
-#define _STARPU_TRACE_WORKER_END_FETCH_INPUT(job, id)	do {} while(0)
+#define _STARPU_TRACE_SCHED_COMPONENT_NEW(component)	do {(void)(component);} while (0)
+#define _STARPU_TRACE_SCHED_COMPONENT_CONNECT(parent, child)	do {(void)(parent); (void)(child);} while (0)
+#define _STARPU_TRACE_SCHED_COMPONENT_PUSH(from, to, task)	do {(void)(from); (void)(to); (void)(task);} while (0)
+#define _STARPU_TRACE_SCHED_COMPONENT_PULL(from, to, task)	do {(void)(from); (void)(to); (void)(task);} while (0)
+#define _STARPU_TRACE_HANDLE_DATA_REGISTER(handle)	do {(void)(handle);} while (0)
+#define _STARPU_TRACE_DATA_INVALIDATE(handle, node)	do {(void)(handle); (void)(node);} while (0)
+#define _STARPU_TRACE_WORKER_START_FETCH_INPUT(job, id)	do {(void)(job); (void)(id);} while(0)
+#define _STARPU_TRACE_WORKER_END_FETCH_INPUT(job, id)	do {(void)(job); (void)(id);} while(0)
 
 #endif // STARPU_USE_FXT
 

+ 5 - 1
src/common/graph.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2016  Université de Bordeaux
+ * Copyright (C) 2016-2017  Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -172,7 +172,11 @@ void _starpu_graph_add_job_dep(struct _starpu_job *job, struct _starpu_job *prev
 	struct _starpu_graph_node *node = job->graph_node;
 	struct _starpu_graph_node *prev_node = prev_job->graph_node;
 	if (!node || !prev_node)
+	{
+		/* Already gone */
+		_starpu_graph_wrunlock();
 		return;
+	}
 
 	if (_starpu_graph_node_multilist_queued_bottom(prev_node))
 		/* Previous node is not at bottom any more */

+ 65 - 20
src/core/dependencies/data_arbiter_concurrency.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2015-2016  Université de Bordeaux
+ * Copyright (C) 2015-2017  Université de Bordeaux
  * Copyright (C) 2015  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -25,8 +25,6 @@
 
 /* TODO factorize with data_concurrency.c and btw support redux */
 
-/* TODO: fine-grain R/W access */
-
 //#define LOCK_OR_DELEGATE
 
 /*
@@ -44,6 +42,7 @@
  * - for each handle h of T:
  *   - mutex_lock(&arbiter)
  *   - release reference on h
+ *   - call _starpu_notify_arbitered_dependencies which does the following
  *   - for each task Tc waiting for h:
  *     - for each data Tc_h it is waiting for:
  *       - if Tc_h is busy, goto fail
@@ -64,7 +63,7 @@
  *   - mutex_unlock(&arbiter)
  *
  *
- * at submission of task T:
+ * at submission of task T (_starpu_submit_job_enforce_arbitered_deps):
  *
  * - mutex_lock(&arbiter)
  * - for each handle h of T:
@@ -85,8 +84,26 @@
  *   - unlock(h)
  * - mutex_unlock(&arbiter)
  * - return 1;
+ *
+ * at acquire (_starpu_attempt_to_submit_arbitered_data_request):
+ * - mutex_lock(&arbiter)
+ * - try to take a reference on h
+ *   - on failure, record as waiting on h
+ * - mutex_unlock(&arbiter);
+ * - return 0 if succeeded, 1 if failed;
  */
 
+static int _starpu_arbiter_filter_modes(int mode)
+{
+	/* Do not care about some flags */
+	mode &= ~STARPU_COMMUTE;
+	mode &= ~STARPU_SSEND;
+	mode &= ~STARPU_LOCALITY;
+	if (mode == STARPU_RW)
+		mode = STARPU_W;
+	return mode;
+}
+
 struct starpu_arbiter
 {
 #ifdef LOCK_OR_DELEGATE
@@ -186,7 +203,7 @@ static int _starpu_LockOrDelegatePostOrPerform(starpu_arbiter_t arbiter, void (*
 
 #endif
 
-/* Try to submit a data request, in case the request can be processed
+/* Try to submit just one data request, in case the request can be processed
  * immediatly, return 0, if there is still a dependency that is not compatible
  * with the current mode, the request is put in the per-handle list of
  * "requesters", and this function returns 1. */
@@ -257,8 +274,7 @@ unsigned _starpu_attempt_to_submit_arbitered_data_request(unsigned request_from_
 	STARPU_PTHREAD_MUTEX_LOCK(&arbiter->mutex);
 #endif // LOCK_OR_DELEGATE
 
-	if (mode == STARPU_RW)
-		mode = STARPU_W;
+	mode = _starpu_arbiter_filter_modes(mode);
 
 	STARPU_ASSERT_MSG(!(mode & STARPU_REDUX), "REDUX with arbiter is not implemented\n");
 
@@ -284,9 +300,21 @@ unsigned _starpu_attempt_to_submit_arbitered_data_request(unsigned request_from_
 	/* If there is currently nobody accessing the piece of data, or it's
 	 * not another writter and if this is the same type of access as the
 	 * current one, we can proceed. */
-	unsigned put_in_list;
+	unsigned put_in_list = 1;
 
-	if (handle->refcnt)
+	if (((handle->refcnt == 0) || (!(mode == STARPU_W) && (handle->current_mode == mode))))
+	{
+		/* TODO: Detect whether this is the end of a reduction phase etc. like in data_concurrency.c */
+		if (0)
+		{
+		}
+		else
+		{
+			put_in_list = 0;
+		}
+	}
+
+	if (put_in_list)
 	{
 		/* there cannot be multiple writers or a new writer
 		 * while the data is in read mode */
@@ -394,7 +422,11 @@ void _starpu_submit_job_enforce_arbitered_deps(struct _starpu_job *j, unsigned b
 	for (idx_buf_arbiter = start_buf_arbiter; idx_buf_arbiter < nbuffers; idx_buf_arbiter++)
 	{
 		handle = _STARPU_JOB_GET_ORDERED_BUFFER_HANDLE(j, idx_buf_arbiter);
-		mode = _STARPU_JOB_GET_ORDERED_BUFFER_MODE(j, idx_buf_arbiter);
+		mode = _STARPU_JOB_GET_ORDERED_BUFFER_MODE(j, idx_buf_arbiter) & ~STARPU_COMMUTE;
+
+		mode = _starpu_arbiter_filter_modes(mode);
+
+		STARPU_ASSERT_MSG(!(mode & STARPU_REDUX), "REDUX with arbiter is not implemented\n");
 
 		if (idx_buf_arbiter && (_STARPU_JOB_GET_ORDERED_BUFFER_HANDLE(j, idx_buf_arbiter-1)==handle))
 			/* We have already requested this data, skip it. This
@@ -410,12 +442,13 @@ void _starpu_submit_job_enforce_arbitered_deps(struct _starpu_job *j, unsigned b
 
 		/* Try to take handle */
 		_starpu_spin_lock(&handle->header_lock);
-		if (handle->refcnt == 0)
+		if (((handle->refcnt == 0) || (!(mode == STARPU_W) && (handle->current_mode == mode))))
 		{
 			/* Got it */
 			handle->refcnt++;
 			handle->busy_count++;
-			handle->current_mode = mode;
+			if (mode != STARPU_R || handle->current_mode != mode)
+				handle->current_mode = mode;
 			_starpu_spin_unlock(&handle->header_lock);
 		}
 		else
@@ -528,22 +561,33 @@ void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle)
 		{
 			/* data_acquire_cb, process it */
 			enum starpu_data_access_mode r_mode = r->mode;
-			if (r_mode == STARPU_RW)
-				r_mode = STARPU_W;
+			int put_in_list = 1;
+
+			r_mode = _starpu_arbiter_filter_modes(r_mode);
 
 			_starpu_spin_lock(&handle->header_lock);
-			handle->refcnt++;
-			handle->busy_count++;
-			handle->current_mode = r_mode;
+			if (((handle->refcnt == 0) || (!(r_mode == STARPU_W) && (handle->current_mode == r_mode))))
+			{
+				handle->refcnt++;
+				handle->busy_count++;
+				handle->current_mode = r_mode;
+				put_in_list = 0;
+			}
 			_starpu_spin_unlock(&handle->header_lock);
 
+			if (put_in_list)
+				_starpu_data_requester_list_push_front(&l, r);
+
 			/* Put back remaining requests */
 			_starpu_data_requester_list_push_list_back(&handle->arbitered_req_list, &l);
 #ifndef LOCK_OR_DELEGATE
 			STARPU_PTHREAD_MUTEX_UNLOCK(&arbiter->mutex);
 #endif
-			r->ready_data_callback(r->argcb);
-			_starpu_data_requester_delete(r);
+			if (!put_in_list)
+			{
+				r->ready_data_callback(r->argcb);
+				_starpu_data_requester_delete(r);
+			}
 
 			_starpu_spin_lock(&handle->header_lock);
 			STARPU_ASSERT(handle->busy_count > 0);
@@ -578,10 +622,11 @@ void _starpu_notify_arbitered_dependencies(starpu_data_handle_t handle)
 				break;
 
 			mode = _STARPU_JOB_GET_ORDERED_BUFFER_MODE(j, idx_buf_arbiter);
+			mode = _starpu_arbiter_filter_modes(mode);
 
 			/* we post all arbiter  */
 			_starpu_spin_lock(&handle_arbiter->header_lock);
-			if (handle_arbiter->refcnt != 0)
+			if (!((handle_arbiter->refcnt == 0) || (!(mode == STARPU_W) && (handle_arbiter->current_mode == mode))))
 			{
 				/* handle is not available, record ourself */
 				_starpu_spin_unlock(&handle_arbiter->header_lock);

+ 5 - 1
src/core/dependencies/data_concurrency.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2015  Université de Bordeaux
+ * Copyright (C) 2010-2015, 2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013  CNRS
  * Copyright (C) 2015  Inria
  *
@@ -118,6 +118,10 @@ static unsigned _starpu_attempt_to_submit_data_request(unsigned request_from_cod
 	if (handle->arbiter)
 		return _starpu_attempt_to_submit_arbitered_data_request(request_from_codelet, handle, mode, callback, argcb, j, buffer_index);
 
+	/* Do not care about some flags */
+	mode &= ~STARPU_COMMUTE;
+	mode &= ~STARPU_SSEND;
+	mode &= ~STARPU_LOCALITY;
 	if (mode == STARPU_RW)
 		mode = STARPU_W;
 

+ 74 - 0
src/core/drivers.c

@@ -0,0 +1,74 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2017  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
+ * Copyright (C) 2010, 2011  INRIA
+ * Copyright (C) 2011  Télécom-SudParis
+ * Copyright (C) 2011-2012, 2016, 2017  INRIA
+ * Copyright (C) 2016  Uppsala University
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <common/config.h>
+#include <core/debug.h>
+
+int starpu_driver_init(struct starpu_driver *d)
+{
+	STARPU_ASSERT(d);
+	struct _starpu_worker *worker = _starpu_get_worker_from_driver(d);
+
+	if (worker->driver_ops == NULL)
+		return -EINVAL;
+	else
+		return worker->driver_ops->init(worker);
+}
+
+int starpu_driver_run(struct starpu_driver *d)
+{
+	if (!d)
+	{
+		_STARPU_DEBUG("Invalid argument\n");
+		return -EINVAL;
+	}
+
+	struct _starpu_worker *worker = _starpu_get_worker_from_driver(d);
+	if (worker->driver_ops == NULL)
+		return -EINVAL;
+	else
+		return worker->driver_ops->run(worker);
+}
+
+int starpu_driver_run_once(struct starpu_driver *d)
+{
+	STARPU_ASSERT(d);
+	struct _starpu_worker *worker = _starpu_get_worker_from_driver(d);
+
+	if (worker->driver_ops == NULL)
+		return -EINVAL;
+	else
+		return worker->driver_ops->run_once(worker);
+}
+
+int starpu_driver_deinit(struct starpu_driver *d)
+{
+	STARPU_ASSERT(d);
+	struct _starpu_worker *worker = _starpu_get_worker_from_driver(d);
+
+	if (worker->driver_ops == NULL)
+		return -EINVAL;
+	else
+		return worker->driver_ops->deinit(worker);
+}
+

+ 31 - 0
src/core/drivers.h

@@ -0,0 +1,31 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2016  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
+ * Copyright (C) 2011, 2016  INRIA
+ * Copyright (C) 2016  Uppsala University
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DRIVERS_H__
+#define __DRIVERS_H__
+
+struct _starpu_driver_ops
+{
+	int (*init)(struct _starpu_worker *worker);
+	int (*run)(struct _starpu_worker *worker);
+	int (*run_once)(struct _starpu_worker *worker);
+	int (*deinit)(struct _starpu_worker *worker);
+};
+
+#endif // __DRIVERS_H__

+ 2 - 1
src/core/perfmodel/perfmodel.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2016  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2016, 2017  Inria
  * Copyright (C) 2016  Uppsala University
@@ -30,6 +30,7 @@
 #include <core/jobs.h>
 #include <core/workers.h>
 #include <datawizard/datawizard.h>
+#include <core/task.h>
 
 #ifdef STARPU_HAVE_WINDOWS
 #include <windows.h>

+ 1 - 0
src/core/perfmodel/perfmodel_history.c

@@ -37,6 +37,7 @@
 #include <starpu_parameters.h>
 #include <common/uthash.h>
 #include <limits.h>
+#include <core/task.h>
 
 #ifdef STARPU_HAVE_WINDOWS
 #include <windows.h>

+ 1 - 0
src/core/sched_ctx.c

@@ -19,6 +19,7 @@
 #include <core/sched_ctx.h>
 #include <common/utils.h>
 #include <stdarg.h>
+#include <core/task.h>
 
 starpu_pthread_rwlock_t changing_ctx_mutex[STARPU_NMAX_SCHED_CTXS];
 

+ 1 - 0
src/core/sched_policy.c

@@ -24,6 +24,7 @@
 #include <profiling/profiling.h>
 #include <common/barrier.h>
 #include <core/debug.h>
+#include <core/task.h>
 
 static int use_prefetch = 0;
 static double idle[STARPU_NMAXWORKERS];

+ 39 - 4
src/core/simgrid.c

@@ -281,10 +281,12 @@ int main(int argc, char **argv)
 	return main_ret;
 }
 
+#ifdef HAVE_MSG_PROCESS_ATTACH
 static void maestro(void *data STARPU_ATTRIBUTE_UNUSED)
 {
 	MSG_main();
 }
+#endif
 
 void _starpu_simgrid_init_early(int *argc STARPU_ATTRIBUTE_UNUSED, char ***argv STARPU_ATTRIBUTE_UNUSED)
 {
@@ -876,7 +878,12 @@ void _starpu_simgrid_count_ngpus(void)
 		{
 			int busid;
 			msg_host_t srchost, dsthost;
+#ifdef HAVE_SG_HOST_ROUTE
+			xbt_dynar_t route_dynar = xbt_dynar_new(sizeof(SD_link_t), NULL);
+			SD_link_t *route;
+#else
 			const SD_link_t *route;
+#endif
 			int i, routesize;
 			int through;
 			unsigned src2;
@@ -891,8 +898,14 @@ void _starpu_simgrid_count_ngpus(void)
 
 			srchost = _starpu_simgrid_get_memnode_host(src);
 			dsthost = _starpu_simgrid_get_memnode_host(dst);
+#ifdef HAVE_SG_HOST_ROUTE
+			sg_host_route(srchost, dsthost, route_dynar);
+			routesize = xbt_dynar_length(route_dynar);
+			route = xbt_dynar_to_array(route_dynar);
+#else
 			routesize = SD_route_get_size(srchost, dsthost);
 			route = SD_route_get_list(srchost, dsthost);
+#endif
 
 			/* If it goes through "Host", do not care, there is no
 			 * direct transfer support */
@@ -932,8 +945,17 @@ void _starpu_simgrid_count_ngpus(void)
 				if (starpu_bus_get_id(src2, STARPU_MAIN_RAM) == -1)
 					continue;
 				msg_host_t srchost2 = _starpu_simgrid_get_memnode_host(src2);
-				int routesize2 = SD_route_get_size(srchost2, ramhost);
+				int routesize2;
+#ifdef HAVE_SG_HOST_ROUTE
+				xbt_dynar_t route_dynar2 = xbt_dynar_new(sizeof(SD_link_t), NULL);
+				SD_link_t *route2;
+				sg_host_route(srchost2, ramhost, route_dynar2);
+				routesize2 = xbt_dynar_length(route_dynar2);
+				route2 = xbt_dynar_to_array(route_dynar2);
+#else
 				const SD_link_t *route2 = SD_route_get_list(srchost2, ramhost);
+				routesize2 = SD_route_get_size(srchost2, ramhost);
+#endif
 
 				for (i = 0; i < routesize2; i++)
 					if (!strcmp(name, sg_link_name(route2[i])))
@@ -942,9 +964,15 @@ void _starpu_simgrid_count_ngpus(void)
 						ngpus++;
 						break;
 					}
+#ifdef HAVE_SG_HOST_ROUTE
+				free(route2);
+#endif
 			}
 			_STARPU_DEBUG("%d->%d through %s, %u GPUs\n", src, dst, name, ngpus);
 			starpu_bus_set_ngpus(busid, ngpus);
+#ifdef HAVE_SG_HOST_ROUTE
+			free(route);
+#endif
 		}
 #endif
 }
@@ -980,7 +1008,7 @@ void _starpu_simgrid_xbt_thread_create(const char *name, void_f_pvoid_t code, vo
 #ifdef HAVE_SMX_ACTOR_T
 	smx_actor_t process STARPU_ATTRIBUTE_UNUSED;
 #else
-	smx_process_t process;
+	smx_process_t process STARPU_ATTRIBUTE_UNUSED;
 #endif
 	thread_data_t *res;
 	_STARPU_MALLOC(res, sizeof(thread_data_t));
@@ -1000,7 +1028,14 @@ void _starpu_simgrid_xbt_thread_create(const char *name, void_f_pvoid_t code, vo
 #else
 	                         SIMIX_host_self(),
 #endif
-				 -1.0, 0, NULL,
-	                         /*props */ NULL,0);
+#if SIMGRID_VERSION_MAJOR < 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR < 15)
+				 -1.0,
+#endif
+				 0, NULL,
+	                         /*props */ NULL
+#if SIMGRID_VERSION_MAJOR < 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR < 15)
+				 , 0
+#endif
+				 );
 }
 #endif

+ 68 - 20
src/core/topology.c

@@ -97,8 +97,7 @@ static struct _starpu_worker_set mic_worker_set[STARPU_MAXMICDEVS];
 struct _starpu_worker_set mpi_worker_set[STARPU_MAXMPIDEVS];
 #endif
 
-void *
-_starpu_get_worker_from_driver(struct starpu_driver *d)
+struct _starpu_worker *_starpu_get_worker_from_driver(struct starpu_driver *d)
 {
 	unsigned nworkers = starpu_worker_get_count();
 	unsigned workerid;
@@ -131,7 +130,7 @@ _starpu_get_worker_from_driver(struct starpu_driver *d)
 			case STARPU_CUDA_WORKER:
 			{
 				if (worker->devid == d->id.cuda_id)
-					return worker->set;
+					return worker;
 				break;
 
 			}
@@ -907,6 +906,7 @@ _starpu_init_mic_config (struct _starpu_machine_config *config,
 	/* _starpu_initialize_workers_mic_deviceid (config); */
 
 	mic_worker_set[mic_idx].workers = &config->workers[topology->nworkers];
+	mic_worker_set[mic_idx].nworkers = topology->nmiccores[mic_idx];
 	unsigned miccore_id;
 	for (miccore_id = 0; miccore_id < topology->nmiccores[mic_idx]; miccore_id++)
 	{
@@ -970,6 +970,7 @@ _starpu_init_mpi_config (struct _starpu_machine_config *config,
                         mpi_idx, topology->nmpicores[mpi_idx], topology->nworkers, STARPU_NMAXWORKERS);
 
         mpi_worker_set[mpi_idx].workers = &config->workers[topology->nworkers];
+        mpi_worker_set[mpi_idx].nworkers = topology->nmpicores[mpi_idx];
         unsigned mpicore_id;
         for (mpicore_id = 0; mpicore_id < topology->nmpicores[mpi_idx]; mpicore_id++)
         {
@@ -986,6 +987,7 @@ _starpu_init_mpi_config (struct _starpu_machine_config *config,
                 config->workers[worker_idx].worker_mask = STARPU_MPI_MS;
                 config->worker_mask |= STARPU_MPI_MS;
         }
+	mpi_ms_nodes[mpi_idx]->baseworkerid = topology->nworkers;
 
         topology->nworkers += topology->nmpicores[mpi_idx];
 }
@@ -1234,31 +1236,68 @@ _starpu_init_machine_config(struct _starpu_machine_config *config, int no_mp_con
 
 	/* Now we know how many CUDA devices will be used */
 	topology->ncudagpus = ncuda;
+	topology->nworkerpercuda = nworker_per_cuda;
 	STARPU_ASSERT(topology->ncudagpus <= STARPU_MAXCUDADEVS);
 
 	_starpu_initialize_workers_cuda_gpuid(config);
 
 	/* allow having one worker per stream */
-	unsigned th_per_stream = starpu_get_env_number_default("STARPU_CUDA_THREAD_PER_WORKER", 0);
+	topology->cuda_th_per_stream = starpu_get_env_number_default("STARPU_CUDA_THREAD_PER_WORKER", -1);
+	topology->cuda_th_per_dev = starpu_get_env_number_default("STARPU_CUDA_THREAD_PER_DEV", -1);
+
+	/* per device by default */
+	if (topology->cuda_th_per_dev == -1)
+	{
+		if (topology->cuda_th_per_stream == 1)
+			topology->cuda_th_per_dev = 0;
+		else
+			topology->cuda_th_per_dev = 1;
+	}
+	/* Not per stream by default */
+	if (topology->cuda_th_per_stream == -1)
+	{
+		topology->cuda_th_per_stream = 0;
+	}
+
+	STARPU_ASSERT_MSG(topology->cuda_th_per_dev != 1 || topology->cuda_th_per_stream != 1, "It does not make sense to set both STARPU_CUDA_THREAD_PER_WORKER and STARPU_CUDA_THREAD_PER_DEV to 1, please choose either per worker or per device or none");
+
+	if (!topology->cuda_th_per_dev)
+	{
+		cuda_worker_set[0].workers = &config->workers[topology->nworkers];
+		cuda_worker_set[0].nworkers = topology->ncudagpus * nworker_per_cuda;
+	}
 
 	unsigned cudagpu;
 	for (cudagpu = 0; cudagpu < topology->ncudagpus; cudagpu++)
 	{
 		int devid = _starpu_get_next_cuda_gpuid(config);
 		int worker_idx0 = topology->nworkers + cudagpu * nworker_per_cuda;
-		cuda_worker_set[devid].workers = &config->workers[worker_idx0];
+		struct _starpu_worker_set *worker_set;
+
+		if (topology->cuda_th_per_dev)
+		{
+			worker_set = &cuda_worker_set[devid];
+			worker_set->workers = &config->workers[worker_idx0];
+			worker_set->nworkers = nworker_per_cuda;
+		}
+		else
+		{
+			/* Same worker set for all devices */
+			worker_set = &cuda_worker_set[0];
+		}
 
 		for (i = 0; i < nworker_per_cuda; i++)
 		{
 			int worker_idx = worker_idx0 + i;
-			if(th_per_stream)
+			if(topology->cuda_th_per_stream)
 			{
 				/* Just one worker in the set */
 				config->workers[worker_idx].set = (struct _starpu_worker_set *)calloc(1, sizeof(struct _starpu_worker_set));
 				config->workers[worker_idx].set->workers = &config->workers[worker_idx];
+				config->workers[worker_idx].set->nworkers = 1;
 			}
 			else
-				config->workers[worker_idx].set = &cuda_worker_set[devid];
+				config->workers[worker_idx].set = worker_set;
 
 			config->workers[worker_idx].arch = STARPU_CUDA_WORKER;
 			_STARPU_MALLOC(config->workers[worker_idx].perf_arch.devices, sizeof(struct starpu_perfmodel_device));
@@ -1460,19 +1499,23 @@ _starpu_init_machine_config(struct _starpu_machine_config *config, int no_mp_con
 			for (j = 0; j < STARPU_MAXMICDEVS; j++)
 				mic_busy_cpus += (topology->nmiccores[j] ? 1 : 0);
 
-            unsigned mpi_ms_busy_cpus = 0;
+			unsigned mpi_ms_busy_cpus = 0;
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
 #ifdef STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD
-            for (j = 0; j < STARPU_MAXMPIDEVS; j++)
-                    mpi_ms_busy_cpus += (topology->nmpicores[j] ? 1 : 0);
+			for (j = 0; j < STARPU_MAXMPIDEVS; j++)
+				mpi_ms_busy_cpus += (topology->nmpicores[j] ? 1 : 0);
 #else
-            mpi_ms_busy_cpus = 1; /* we launch one thread to control all slaves */
+			mpi_ms_busy_cpus = 1; /* we launch one thread to control all slaves */
 #endif
 #endif /* STARPU_USE_MPI_MASTER_SLAVE */
-	    unsigned cuda_busy_cpus = 0;
-#if defined(STARPU_USE_CUDA)
-	    cuda_busy_cpus = th_per_stream ? (nworker_per_cuda * topology->ncudagpus) : 
-		    topology->ncudagpus;
+			unsigned cuda_busy_cpus = 0;
+#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
+			cuda_busy_cpus =
+				topology->cuda_th_per_dev == 0 && topology->cuda_th_per_stream == 0 ?
+					(topology->ncudagpus ? 1 : 0) :
+				topology->cuda_th_per_stream ?
+					(nworker_per_cuda * topology->ncudagpus) :
+					topology->ncudagpus;
 #endif
 			unsigned already_busy_cpus = mpi_ms_busy_cpus + mic_busy_cpus 
 				+ cuda_busy_cpus
@@ -1755,7 +1798,7 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 	unsigned cuda_init[STARPU_MAXCUDADEVS] = { };
 	unsigned cuda_memory_nodes[STARPU_MAXCUDADEVS];
 	unsigned cuda_bindid[STARPU_MAXCUDADEVS];
-	unsigned th_per_stream = starpu_get_env_number_default("STARPU_CUDA_THREAD_PER_WORKER", 0);
+	int cuda_globalbindid = -1;
 #endif
 #if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
 	unsigned opencl_init[STARPU_MAXOPENCLDEVS] = { };
@@ -1846,17 +1889,22 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 				if (cuda_init[devid])
 				{
 					memory_node = cuda_memory_nodes[devid];
-#ifndef STARPU_SIMGRID
-					if (th_per_stream == 0)
+					if (config->topology.cuda_th_per_stream == 0)
 						workerarg->bindid = cuda_bindid[devid];
 					else
 						workerarg->bindid = _starpu_get_next_bindid(config, preferred_binding, npreferred);
-#endif /* SIMGRID */
 				}
 				else
 				{
 					cuda_init[devid] = 1;
-					workerarg->bindid = cuda_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
+					if (config->topology.cuda_th_per_dev == 0 && config->topology.cuda_th_per_stream == 0)
+					{
+						if (cuda_globalbindid == -1)
+							cuda_globalbindid = _starpu_get_next_bindid(config, preferred_binding, npreferred);
+						workerarg->bindid = cuda_bindid[devid] = cuda_globalbindid;
+					}
+					else
+						workerarg->bindid = cuda_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
 					memory_node = cuda_memory_nodes[devid] = _starpu_memory_node_register(STARPU_CUDA_RAM, devid);
 
 					/* TODO: NUMA nodes */

+ 2 - 2
src/core/topology.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2010, 2012, 2014-2017  Université de Bordeaux
- * Copyright (C) 2010, 2015  CNRS
+ * Copyright (C) 2010, 2015, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -69,7 +69,7 @@ struct _starpu_combined_worker;
 /* Bind the current thread on the set of CPUs for the given combined worker. */
 void _starpu_bind_thread_on_cpus(struct _starpu_machine_config *config STARPU_ATTRIBUTE_UNUSED, struct _starpu_combined_worker *combined_worker);
 
-void *_starpu_get_worker_from_driver(struct starpu_driver *d);
+struct _starpu_worker *_starpu_get_worker_from_driver(struct starpu_driver *d);
 
 int _starpu_numalogid_to_memnode(unsigned numalogid);
 int _starpu_memnode_to_numalogid(unsigned memnode);

+ 54 - 207
src/core/workers.c

@@ -469,8 +469,7 @@ static void _starpu_init_worker_queue(struct _starpu_worker *workerarg)
 static unsigned _starpu_may_launch_driver(struct starpu_conf *conf,
 					  struct starpu_driver *d)
 {
-	if (conf->n_not_launched_drivers == 0 ||
-	    conf->not_launched_drivers == NULL)
+	if (conf->n_not_launched_drivers == 0 || conf->not_launched_drivers == NULL)
 		return 1;
 
 	/* Is <d> in conf->not_launched_drivers ? */
@@ -553,6 +552,7 @@ static void _starpu_worker_init(struct _starpu_worker *workerarg, struct _starpu
 	/* name initialized by driver */
 	/* short_name initialized by driver */
 	workerarg->run_by_starpu = 1;
+	workerarg->driver_ops = NULL;
 
 	workerarg->sched_ctx_list = NULL;
 	workerarg->tmp_sched_ctx = -1;
@@ -612,8 +612,6 @@ void _starpu_driver_start(struct _starpu_worker *worker, unsigned fut_key, unsig
 	_starpu_fxt_register_thread(worker->bindid);
 	_starpu_worker_start(worker, fut_key, sync);
 #endif
-	_starpu_memory_node_set_local_key(&worker->memory_node);
-
 	_starpu_set_local_worker_key(worker);
 
 	STARPU_PTHREAD_MUTEX_LOCK(&worker->mutex);
@@ -641,8 +639,6 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 	STARPU_HG_DISABLE_CHECKING(pconfig->watchdog_ok);
 
 	unsigned nworkers = pconfig->topology.nworkers;
-
-	/* Launch workers asynchronously */
 	unsigned worker;
 
 #if defined(STARPU_PERF_DEBUG) && !defined(STARPU_SIMGRID)
@@ -651,10 +647,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 #endif
 	STARPU_AYU_INIT();
 
-#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
-	unsigned th_per_stream = starpu_get_env_number_default("STARPU_CUDA_THREAD_PER_WORKER", 0);
-#endif
-
+	/* Launch workers asynchronously */
 	for (worker = 0; worker < nworkers; worker++)
 	{
 		struct _starpu_worker *workerarg = &pconfig->workers[worker];
@@ -674,6 +667,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 #if defined(STARPU_USE_CPU) || defined(STARPU_SIMGRID)
 			case STARPU_CPU_WORKER:
 				driver.id.cpu_id = devid;
+				workerarg->driver_ops = &_starpu_driver_cpu_ops;
 				if (_starpu_may_launch_driver(&pconfig->conf, &driver))
 				{
 					STARPU_PTHREAD_CREATE_ON(
@@ -704,26 +698,13 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 			case STARPU_CUDA_WORKER:
 				driver.id.cuda_id = devid;
+				workerarg->driver_ops = &_starpu_driver_cuda_ops;
 
 				if (worker_set->workers != workerarg)
 					/* We are not the first worker of the
 					 * set, don't start a thread for it. */
 					break;
 
-				if(th_per_stream == 0)
-				{
-					worker_set->nworkers = starpu_get_env_number_default("STARPU_NWORKER_PER_CUDA", 1);
-#ifndef STARPU_NON_BLOCKING_DRIVERS
-					if (worker_set->nworkers > 1)
-					{
-						_STARPU_DISP("Warning: reducing STARPU_NWORKER_PER_CUDA to 1 because blocking drivers are enabled\n");
-						worker_set->nworkers = 1;
-					}
-#endif
-				}
-				else
-					worker_set->nworkers = 1;
-
 				worker_set->set_is_initialized = 0;
 
 				if (!_starpu_may_launch_driver(&pconfig->conf, &driver))
@@ -734,7 +715,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 
 
 				STARPU_PTHREAD_CREATE_ON(
-					workerarg->name,
+					pconfig->topology.cuda_th_per_dev ? "CUDA" : workerarg->name,
 					&worker_set->worker_thread,
 					NULL,
 					_starpu_cuda_worker,
@@ -752,6 +733,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 			case STARPU_OPENCL_WORKER:
 #ifndef STARPU_SIMGRID
 				starpu_opencl_get_device(devid, &driver.id.opencl_id);
+				workerarg->driver_ops = &_starpu_driver_opencl_ops;
 				if (!_starpu_may_launch_driver(&pconfig->conf, &driver))
 				{
 					workerarg->run_by_starpu = 0;
@@ -781,8 +763,6 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 				if (worker_set->workers != workerarg)
 					break;
 
-				worker_set->nworkers = pconfig->topology.nmiccores[devid];
-
 				worker_set->set_is_initialized = 0;
 
 				STARPU_PTHREAD_CREATE_ON(
@@ -837,14 +817,12 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 				if (worker_set->workers != workerarg)
 					break;
 
-				worker_set->nworkers = pconfig->topology.nmpicores[devid];
-
 				worker_set->set_is_initialized = 0;
 
 #ifdef STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD
                 /* if MPI has multiple threads supports
-                 * we launch 1 thread per device 
-                 * else 
+                 * we launch 1 thread per device
+                 * else
                  * we launch one thread for all devices
                  */
 				STARPU_PTHREAD_CREATE_ON(
@@ -916,70 +894,31 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 	for (worker = 0; worker < nworkers; worker++)
 	{
 		struct _starpu_worker *workerarg = &pconfig->workers[worker];
-		struct starpu_driver driver;
-		unsigned devid = workerarg->devid;
-		driver.type = workerarg->arch;
-#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
-		struct _starpu_worker_set *worker_set = workerarg->set;
-#endif
 
-		switch (workerarg->arch)
-		{
-			case STARPU_CPU_WORKER:
-				driver.id.cpu_id = devid;
-				if (!_starpu_may_launch_driver(&pconfig->conf, &driver))
-					break;
-				_STARPU_DEBUG("waiting for worker %u initialization\n", worker);
-				STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
-				while (!workerarg->worker_is_initialized)
-					STARPU_PTHREAD_COND_WAIT(&workerarg->ready_cond, &workerarg->mutex);
-				STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
-				break;
-#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
-			case STARPU_CUDA_WORKER:
+		_STARPU_DEBUG("waiting for worker %u initialization\n", worker);
 #ifndef STARPU_SIMGRID
-				driver.id.cuda_id = devid;
-				if (!_starpu_may_launch_driver(&pconfig->conf, &driver))
-					break;
-#endif
-				_STARPU_DEBUG("waiting for worker %u initialization\n", worker);
-				STARPU_PTHREAD_MUTEX_LOCK(&worker_set->mutex);
-				while (!worker_set->set_is_initialized)
-					STARPU_PTHREAD_COND_WAIT(&worker_set->ready_cond,
-								 &worker_set->mutex);
-				STARPU_PTHREAD_MUTEX_UNLOCK(&worker_set->mutex);
-				worker_set->started = 1;
-
-				break;
-#endif
-#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
-			case STARPU_OPENCL_WORKER:
-#ifndef STARPU_SIMGRID
-				starpu_opencl_get_device(devid, &driver.id.opencl_id);
-				if (!_starpu_may_launch_driver(&pconfig->conf, &driver))
-					break;
+		if (!workerarg->run_by_starpu)
+			break;
 #endif
-				_STARPU_DEBUG("waiting for worker %u initialization\n", worker);
-				STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
-				while (!workerarg->worker_is_initialized)
-					STARPU_PTHREAD_COND_WAIT(&workerarg->ready_cond, &workerarg->mutex);
-				STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
-				break;
+#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
+		if (workerarg->arch == STARPU_CUDA_WORKER)
+		{
+			struct _starpu_worker_set *worker_set = workerarg->set;
+			STARPU_PTHREAD_MUTEX_LOCK(&worker_set->mutex);
+			while (!worker_set->set_is_initialized)
+				STARPU_PTHREAD_COND_WAIT(&worker_set->ready_cond,
+							 &worker_set->mutex);
+			STARPU_PTHREAD_MUTEX_UNLOCK(&worker_set->mutex);
+			worker_set->started = 1;
+		}
+		else
 #endif
-			case STARPU_MIC_WORKER:
-                        case STARPU_MPI_MS_WORKER:
-				/* Already waited above */
-				break;
-			case STARPU_SCC_WORKER:
-				/* TODO: implement may_launch? */
-				_STARPU_DEBUG("waiting for worker %u initialization\n", worker);
-				STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
-				while (!workerarg->worker_is_initialized)
-					STARPU_PTHREAD_COND_WAIT(&workerarg->ready_cond, &workerarg->mutex);
-				STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
-				break;
-			default:
-				STARPU_ABORT();
+		if (workerarg->arch != STARPU_CUDA_WORKER && workerarg->arch != STARPU_MPI_MS_WORKER && workerarg->arch != STARPU_MIC_WORKER)
+		{
+			STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
+			while (!workerarg->worker_is_initialized)
+				STARPU_PTHREAD_COND_WAIT(&workerarg->ready_cond, &workerarg->mutex);
+			STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
 		}
 	}
 
@@ -1736,7 +1675,7 @@ void starpu_shutdown(void)
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
     if (_starpu_mpi_common_is_mp_initialized())
         _starpu_mpi_common_mp_deinit();
-#endif 
+#endif
 	_starpu_print_idle_time();
 	_STARPU_DEBUG("Shutdown finished\n");
 
@@ -1770,7 +1709,7 @@ int starpu_worker_get_count_by_type(enum starpu_worker_archtype type)
 			return _starpu_config.topology.ncpus;
 
 		case STARPU_CUDA_WORKER:
-			return _starpu_config.topology.ncudagpus;
+			return _starpu_config.topology.ncudagpus * _starpu_config.topology.nworkerpercuda;
 
 		case STARPU_OPENCL_WORKER:
 			return _starpu_config.topology.nopenclgpus;
@@ -1786,7 +1725,7 @@ int starpu_worker_get_count_by_type(enum starpu_worker_archtype type)
 
                 case STARPU_ANY_WORKER:
                         return _starpu_config.topology.ncpus+
-                                _starpu_config.topology.ncudagpus+
+				_starpu_config.topology.ncudagpus * _starpu_config.topology.nworkerpercuda+
                                 _starpu_config.topology.nopenclgpus+
                                 _starpu_config.topology.nmicdevices+
                                 _starpu_config.topology.nsccdevices+
@@ -1808,7 +1747,7 @@ unsigned starpu_cpu_worker_get_count(void)
 
 unsigned starpu_cuda_worker_get_count(void)
 {
-	return _starpu_config.topology.ncudagpus;
+	return _starpu_config.topology.ncudagpus * _starpu_config.topology.nworkerpercuda;
 }
 
 unsigned starpu_opencl_worker_get_count(void)
@@ -2219,117 +2158,6 @@ int starpu_worker_get_nids_ctx_free_by_type(enum starpu_worker_archtype type, in
 	return cnt;
 }
 
-
-int
-starpu_driver_run(struct starpu_driver *d)
-{
-	if (!d)
-	{
-		_STARPU_DEBUG("Invalid argument\n");
-		return -EINVAL;
-	}
-
-	void *worker = _starpu_get_worker_from_driver(d);
-
-	switch (d->type)
-	{
-#ifdef STARPU_USE_CPU
-	case STARPU_CPU_WORKER:
-		return _starpu_run_cpu(worker);
-#endif
-#ifdef STARPU_USE_CUDA
-	case STARPU_CUDA_WORKER:
-		return _starpu_run_cuda(worker);
-#endif
-#ifdef STARPU_USE_OPENCL
-	case STARPU_OPENCL_WORKER:
-		return _starpu_run_opencl(worker);
-#endif
-	default:
-		(void) worker;
-		_STARPU_DEBUG("Invalid device type\n");
-		return -EINVAL;
-	}
-}
-
-int
-starpu_driver_init(struct starpu_driver *d)
-{
-	STARPU_ASSERT(d);
-	void *worker = _starpu_get_worker_from_driver(d);
-
-	switch (d->type)
-	{
-#ifdef STARPU_USE_CPU
-	case STARPU_CPU_WORKER:
-		return _starpu_cpu_driver_init(worker);
-#endif
-#ifdef STARPU_USE_CUDA
-	case STARPU_CUDA_WORKER:
-		return _starpu_cuda_driver_init(worker);
-#endif
-#ifdef STARPU_USE_OPENCL
-	case STARPU_OPENCL_WORKER:
-		return _starpu_opencl_driver_init(worker);
-#endif
-	default:
-		(void) worker;
-		return -EINVAL;
-	}
-}
-
-int
-starpu_driver_run_once(struct starpu_driver *d)
-{
-	STARPU_ASSERT(d);
-	void *worker = _starpu_get_worker_from_driver(d);
-
-	switch (d->type)
-	{
-#ifdef STARPU_USE_CPU
-	case STARPU_CPU_WORKER:
-		return _starpu_cpu_driver_run_once(worker);
-#endif
-#ifdef STARPU_USE_CUDA
-	case STARPU_CUDA_WORKER:
-		return _starpu_cuda_driver_run_once(worker);
-#endif
-#ifdef STARPU_USE_OPENCL
-	case STARPU_OPENCL_WORKER:
-		return _starpu_opencl_driver_run_once(worker);
-#endif
-	default:
-		(void) worker;
-		return -EINVAL;
-	}
-}
-
-int
-starpu_driver_deinit(struct starpu_driver *d)
-{
-	STARPU_ASSERT(d);
-	void *worker = _starpu_get_worker_from_driver(d);
-
-	switch (d->type)
-	{
-#ifdef STARPU_USE_CPU
-	case STARPU_CPU_WORKER:
-		return _starpu_cpu_driver_deinit(worker);
-#endif
-#ifdef STARPU_USE_CUDA
-	case STARPU_CUDA_WORKER:
-		return _starpu_cuda_driver_deinit(worker);
-#endif
-#ifdef STARPU_USE_OPENCL
-	case STARPU_OPENCL_WORKER:
-		return _starpu_opencl_driver_deinit(worker);
-#endif
-	default:
-		(void) worker;
-		return -EINVAL;
-	}
-}
-
 void starpu_get_version(int *major, int *minor, int *release)
 {
 	*major = STARPU_MAJOR_VERSION;
@@ -2417,4 +2245,23 @@ unsigned starpu_worker_get_sched_ctx_id_stream(unsigned stream_workerid)
 	return w->stream_ctx != NULL ? w->stream_ctx->id : STARPU_NMAX_SCHED_CTXS;
 }
 
-
+void starpu_worker_display_names(FILE *output, enum starpu_worker_archtype type)
+{
+	int nworkers = starpu_worker_get_count_by_type(type);
+	if (nworkers <= 0)
+	{
+		fprintf(output, "No %s worker\n", starpu_worker_get_type_as_string(type));
+	}
+	else
+	{
+		int i, ids[nworkers];
+		starpu_worker_get_ids_by_type(type, ids, nworkers);
+		fprintf(output, "%d %s worker%s:\n", nworkers, starpu_worker_get_type_as_string(type), nworkers==1?"":"s");
+		for(i = 0; i < nworkers; i++)
+		{
+			char name[256];
+			starpu_worker_get_name(ids[i], name, 256);
+			fprintf(output, "\t%s\n", name);
+		}
+	}
+}

+ 6 - 1
src/core/workers.h

@@ -36,6 +36,7 @@
 #include <hwloc.h>
 #endif
 
+#include <core/drivers.h>
 #include <drivers/cuda/driver_cuda.h>
 #include <drivers/opencl/driver_opencl.h>
 
@@ -104,6 +105,7 @@ LIST_TYPE(_starpu_worker,
 	char name[64];
 	char short_name[10];
 	unsigned run_by_starpu; /* Is this run by StarPU or directly by the application ? */
+	struct _starpu_driver_ops *driver_ops;
 
 	struct _starpu_sched_ctx_list *sched_ctx_list;
 	int tmp_sched_ctx;
@@ -241,8 +243,11 @@ struct _starpu_machine_topology
 	/* Actual number of CPU workers used by StarPU. */
 	unsigned ncpus;
 
-	/* Actual number of CUDA workers used by StarPU. */
+	/* Actual number of CUDA GPUs used by StarPU. */
 	unsigned ncudagpus;
+	unsigned nworkerpercuda;
+	int cuda_th_per_stream;
+	int cuda_th_per_dev;
 
 	/* Actual number of OpenCL workers used by StarPU. */
 	unsigned nopenclgpus;

+ 0 - 0
src/datawizard/coherency.c


Vissa filer visades inte eftersom för många filer har ändrats