Corentin Salingue 12 vuotta sitten
vanhempi
commit
209fb7bf53
57 muutettua tiedostoa jossa 662 lisäystä ja 657 poistoa
  1. 33 9
      configure.ac
  2. 1 0
      doc/doxygen/Makefile.am
  3. 25 25
      doc/doxygen/chapters/advanced_examples.doxy
  4. 4 4
      doc/doxygen/chapters/api/codelet_and_tasks.doxy
  5. 4 4
      doc/doxygen/chapters/api/cuda_extensions.doxy
  6. 4 4
      doc/doxygen/chapters/api/data_interfaces.doxy
  7. 9 9
      doc/doxygen/chapters/api/data_partition.doxy
  8. 1 1
      doc/doxygen/chapters/api/explicit_dependencies.doxy
  9. 1 1
      doc/doxygen/chapters/api/insert_task.doxy
  10. 7 7
      doc/doxygen/chapters/api/lower_bound.doxy
  11. 30 30
      doc/doxygen/chapters/api/mpi.doxy
  12. 5 5
      doc/doxygen/chapters/api/opencl_extensions.doxy
  13. 1 1
      doc/doxygen/chapters/api/parallel_tasks.doxy
  14. 7 7
      doc/doxygen/chapters/api/scheduling_context_hypervisor.doxy
  15. 2 2
      doc/doxygen/chapters/api/scheduling_contexts.doxy
  16. 4 4
      doc/doxygen/chapters/api/scheduling_policy.doxy
  17. 7 7
      doc/doxygen/chapters/api/task_bundles.doxy
  18. 7 7
      doc/doxygen/chapters/api/task_lists.doxy
  19. 44 44
      doc/doxygen/chapters/api/top.doxy
  20. 1 1
      doc/doxygen/chapters/api/workers.doxy
  21. 30 133
      doc/doxygen/chapters/basic_examples.doxy
  22. 8 8
      doc/doxygen/chapters/building.doxy
  23. 45 0
      doc/doxygen/chapters/code/scal_pragma.cu
  24. 24 18
      doc/doxygen/chapters/code/vector_scal_opencl.c
  25. 16 0
      doc/doxygen/chapters/configure_options.doxy
  26. 0 25
      doc/doxygen/chapters/environment_variables.doxy
  27. 3 3
      doc/doxygen/chapters/fft_support.doxy
  28. 10 10
      doc/doxygen/chapters/mic_scc_support.doxy
  29. 2 2
      doc/doxygen/chapters/mpi_support.doxy
  30. 12 12
      doc/doxygen/chapters/optimize_performance.doxy
  31. 15 15
      doc/doxygen/chapters/performance_feedback.doxy
  32. 12 0
      doc/doxygen/dev/checkDoc.sh
  33. 38 0
      doc/doxygen/dev/starpu_check_documented.py
  34. 78 0
      doc/doxygen/dev/starpu_check_undocumented.sh
  35. 0 0
      doc/doxygen/dev/starpu_funcs.cocci
  36. 1 1
      doc/doxygen/refman.tex
  37. 0 0
      doc/texinfo/dev/starpu_check_documented.py
  38. 0 0
      doc/texinfo/dev/starpu_check_undocumented.sh
  39. 28 0
      doc/texinfo/dev/starpu_funcs.cocci
  40. 2 2
      examples/spmv/matrix_market/mmio.c
  41. 5 1
      examples/stencil/life_opencl.c
  42. 3 3
      include/starpu_opencl.h
  43. 4 4
      include/starpu_sched_ctx.h
  44. 1 1
      include/starpu_task.h
  45. 1 1
      include/starpu_task_util.h
  46. 11 39
      include/starpu_top.h
  47. 28 33
      mic-configure
  48. 2 2
      mpi/starpumpi-1.1.pc.in
  49. 2 2
      mpi/starpumpi-1.2.pc.in
  50. 0 5
      socl/Makefile.am
  51. 0 2
      socl/README
  52. 0 29
      socl/socl-1.0.pc.in
  53. 0 29
      socl/socl-1.1.pc.in
  54. 0 29
      socl/socl-1.2.pc.in
  55. 0 1
      socl/vendors/socl.icd.in
  56. 82 74
      src/datawizard/memalloc.c
  57. 2 1
      src/sched_policies/parallel_eager.c

+ 33 - 9
configure.ac

@@ -785,14 +785,40 @@ AC_DEFUN([STARPU_LOOK_FOR_OPENCL],
 ])
 
 if test x$enable_opencl = xyes -o x$enable_opencl = xmaybe; then
-	STARPU_LOOK_FOR_OPENCL()
-	# in case OpenCL was explicitely required, but is not available, this is an error
-	if test x$enable_opencl = xyes -a x$have_valid_opencl = xno; then
-	    AC_MSG_ERROR([cannot find OpenCL])
-	fi
+   case $target in
+        *-*-darwin*)
+          AC_MSG_CHECKING(whether OpenCL is available)
 
-	# now we enable OpenCL if and only if a proper setup is available
-	enable_opencl=$have_valid_opencl
+          SAVED_LIBS=$LIBS
+          LIBS="$LIBS -framework OpenCL"
+          AC_LINK_IFELSE(
+          [AC_LANG_PROGRAM([[
+          #ifdef __APPLE_CC__
+          #include <OpenCL/opencl.h>
+          #else
+          #include <CL/cl.h>
+          #endif
+          ]],
+            [[return clSetKernelArg(0, 0, 0, 0); ]])],
+          [AC_MSG_RESULT(yes)
+            enable_opencl=yes
+            have_valid_opencl=yes
+            STARPU_OPENCL_CPPFLAGS=
+            STARPU_OPENCL_LDFLAGS="-framework OpenCL"],
+          [AC_MSG_RESULT(no)
+             enable_opencl=no])
+          LIBS=$SAVED_LIBS
+          ;;        
+        *)
+	  STARPU_LOOK_FOR_OPENCL()
+	  # in case OpenCL was explicitly required, but is not available, this is an error
+	  if test x$enable_opencl = xyes -a x$have_valid_opencl = xno; then
+	    AC_MSG_ERROR([cannot find OpenCL])
+	  fi
+	  # now we enable OpenCL if and only if a proper setup is available
+	  enable_opencl=$have_valid_opencl
+          ;;
+   esac
 fi
 
 AC_MSG_CHECKING(whether OpenCL should be used)
@@ -2203,8 +2229,6 @@ AC_OUTPUT([
 	socl/Makefile
 	socl/src/Makefile
 	socl/examples/Makefile
-        socl/socl-1.0.pc
-	socl/socl-1.1.pc
 	socl/vendors/socl.icd
 	libstarpu.pc
 	starpu-1.0.pc

+ 1 - 0
doc/doxygen/Makefile.am

@@ -44,6 +44,7 @@ chapters =	\
 	chapters/mic_scc_support.doxy \
 	chapters/code/hello_pragma2.c \
 	chapters/code/hello_pragma.c \
+	chapters/code/scal_pragma.cu \
 	chapters/code/matmul_pragma.c \
 	chapters/code/matmul_pragma2.c \
 	chapters/code/cholesky_pragma.c \

+ 25 - 25
doc/doxygen/chapters/advanced_examples.doxy

@@ -92,12 +92,12 @@ thus be very fast. The function starpu_cuda_get_device_properties()
 provides a quick access to CUDA properties of CUDA devices to achieve
 such efficiency.
 
-Another example is compiling CUDA code for various compute capabilities,
+Another example is to compile CUDA code for various compute capabilities,
 resulting with two CUDA functions, e.g. <c>scal_gpu_13</c> for compute capability
 1.3, and <c>scal_gpu_20</c> for compute capability 2.0. Both functions can be
-provided to StarPU by using <c>cuda_funcs</c>, and <c>can_execute</c> can then be
-used to rule out the <c>scal_gpu_20</c> variant on a CUDA device which
-will not be able to execute it:
+provided to StarPU by using starpu_codelet::cuda_funcs, and
+starpu_codelet::can_execute can then be used to rule out the
+<c>scal_gpu_20</c> variant on a CUDA device which will not be able to execute it:
 
 \code{.c}
 static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
@@ -390,9 +390,9 @@ starpu_perfmodel::size_base however permits the application to
 override that, when for instance some of the data do not matter for
 task cost (e.g. mere reference table), or when using sparse
 structures (in which case it is the number of non-zeros which matter), or when
-there is some hidden parameter such as the number of iterations, etc. The
-<c>examples/pi</c> examples uses this to include the number of iterations in the
-base.
+there is some hidden parameter such as the number of iterations, etc.
+The example in the directory <c>examples/pi</c> uses this to include
+the number of iterations in the base.
 
 How to use schedulers which can benefit from such performance model is explained
 in \ref TaskSchedulingPolicy.
@@ -427,11 +427,11 @@ starpu_bound_print_lp() or starpu_bound_print_mps() can then be used
 to output a Linear Programming problem corresponding to the schedule
 of your tasks. Run it through <c>lp_solve</c> or any other linear
 programming solver, and that will give you a lower bound for the total
-execution time of your tasks. If StarPU was compiled with the glpk
-library installed, starpu_bound_compute() can be used to solve it
+execution time of your tasks. If StarPU was compiled with the library
+<c>glpk</c> installed, starpu_bound_compute() can be used to solve it
 immediately and get the optimized minimum, in ms. Its parameter
 <c>integer</c> allows to decide whether integer resolution should be
-computed and returned too.
+computed and returned too.
 
 The <c>deps</c> parameter tells StarPU whether to take tasks, implicit
 data, and tag dependencies into account. Tags released in a callback
@@ -549,7 +549,7 @@ STARPU_DATA_ACQUIRE_CB(i_handle, STARPU_R,
 The macro ::STARPU_DATA_ACQUIRE_CB submits an asynchronous request for
 acquiring data <c>i</c> for the main application, and will execute the code
 given as third parameter when it is acquired. In other words, as soon as the
-value of <c>i</c> computed by the <c>which_index</c> codelet can be read, the
+value of <c>i</c> computed by the codelet <c>which_index</c> can be read, the
 portion of code passed as third parameter of ::STARPU_DATA_ACQUIRE_CB will
 be executed, and is allowed to read from <c>i</c> to use it e.g. as an
 index. Note that this macro is only available when compiling StarPU with
@@ -609,7 +609,7 @@ struct starpu_codelet accumulate_variable_cl =
 }
 \endcode
 
-and attaches them as reduction methods for its <c>dtq</c> handle:
+and attaches them as reduction methods for its handle <c>dtq</c>:
 
 \code{.c}
 starpu_variable_data_register(&dtq_handle, -1, NULL, sizeof(type));
@@ -674,7 +674,7 @@ tasks.
 Data can sometimes be entirely produced by a task, and entirely consumed by
 another task, without the need for other parts of the application to access
 it. In such case, registration can be done without prior allocation, by using
-the special -1 memory node number, and passing a zero pointer. StarPU will
+the special memory node number <c>-1</c>, and passing a zero pointer. StarPU will
 actually allocate memory only when the task creating the content gets scheduled,
 and destroy it on unregistration.
 
@@ -704,9 +704,8 @@ function, and free it at the end, but that would be costly. It could also
 allocate one buffer per worker (similarly to \ref
 HowToInitializeAComputationLibraryOnceForEachWorker), but that would
 make them systematic and permanent. A more  optimized way is to use
-the ::STARPU_SCRATCH data access mode, as examplified below,
-
-which provides per-worker buffers without content consistency.
+the data access mode ::STARPU_SCRATCH, as exemplified below, which
+provides per-worker buffers without content consistency.
 
 \code{.c}
 starpu_vector_data_register(&workspace, -1, 0, sizeof(float));
@@ -723,7 +722,7 @@ the other on the same worker. Also, if for instance GPU memory becomes scarce,
 StarPU will notice that it can free such buffers easily, since the content does
 not matter.
 
-The <c>examples/pi</c> example uses scratches for some temporary buffer.
+The example <c>examples/pi</c> uses scratches for some temporary buffer.
 
 \section ParallelTasks Parallel Tasks
 
@@ -734,8 +733,9 @@ parallel CPU implementation of the computation to be achieved. This can also be
 useful to improve the load balance between slow CPUs and fast GPUs: since CPUs
 work collectively on a single task, the completion time of tasks on CPUs become
 comparable to the completion time on GPUs, thus relieving from granularity
-discrepancy concerns. Hwloc support needs to be enabled to get good performance,
-otherwise StarPU will not know how to better group cores.
+discrepancy concerns. <c>hwloc</c> support needs to be enabled to get
+good performance, otherwise StarPU will not know how to better group
+cores.
 
 Two modes of execution exist to accomodate with existing usages.
 
@@ -808,8 +808,8 @@ buffer.
 
 To benefit from parallel tasks, a parallel-task-aware StarPU scheduler has to
 be used. When exposed to codelets with a flag ::STARPU_FORKJOIN or
-::STARPU_SPMD, the <c>pheft</c> (parallel-heft) and <c>peager</c>
-(parallel eager) schedulers will indeed also try to execute tasks with
+::STARPU_SPMD, the schedulers <c>pheft</c> (parallel-heft) and <c>peager</c>
+(parallel eager) will indeed also try to execute tasks with
 several CPUs. It will automatically try the various available combined
 worker sizes (making several measurements for each worker size) and
 thus be able to avoid choosing a large combined worker if the codelet
@@ -846,9 +846,9 @@ from different threads, due to the use of global variables in their sequential
 sections for instance.
 
 The solution is then to use only one combined worker at a time.  This can be
-done by setting the field starpu_conf::single_combined_worker to 1, or
+done by setting the field starpu_conf::single_combined_worker to <c>1</c>, or
 setting the environment variable \ref STARPU_SINGLE_COMBINED_WORKER
-to 1. StarPU will then run only one parallel task at a time (but other
+to <c>1</c>. StarPU will then run only one parallel task at a time (but other
 CPU and GPU tasks are not affected and can be run concurrently). The parallel
 task scheduler will however still try varying combined worker
 sizes to look for the most efficient ones.
@@ -1183,8 +1183,8 @@ directory <c>examples/basic_examples/dynamic_handles.c</c>.
 
 \section MoreExamples More Examples
 
-More examples are available in the StarPU sources in the <c>examples/</c>
-directory. Simple examples include:
+More examples are available in the StarPU sources in the directory
+<c>examples/</c>. Simple examples include:
 
 <dl>
 <dt> <c>incrementer/</c> </dt>

+ 4 - 4
doc/doxygen/chapters/api/codelet_and_tasks.doxy

@@ -569,7 +569,7 @@ starpu_codelet::modes or the \p i th element of the field
 starpu_codelet::dyn_modes (see \ref
 SettingTheDataHandlesForATask)
 
-\fn struct starpu_task * starpu_task_create(void)
+\fn struct starpu_task *starpu_task_create(void)
 \ingroup API_Codelet_And_Tasks
 Allocate a task structure and initialize it with default
 values. Tasks allocated dynamically with starpu_task_create() are
@@ -580,7 +580,7 @@ wait) and thus freed at any time. If the field starpu_task::destroy is
 explicitly unset, the resources used by the task have to be freed by
 calling starpu_task_destroy().
 
-\fn struct starpu_task * starpu_task_dup(struct starpu_task *task)
+\fn struct starpu_task *starpu_task_dup(struct starpu_task *task)
 \ingroup API_Codelet_And_Tasks
 Allocate a task structure which is the exact duplicate of the
 given task.
@@ -657,7 +657,7 @@ Return the number of submitted tasks which are ready for
 execution are already executing. It thus does not include tasks
 waiting for dependencies.
 
-\fn struct starpu_task * starpu_task_get_current(void)
+\fn struct starpu_task *starpu_task_get_current(void)
 \ingroup API_Codelet_And_Tasks
 This function returns the task currently executed by the
 worker, or <c>NULL</c> if it is called either from a thread that is not a
@@ -681,7 +681,7 @@ codelet implementation to be executed when executing the task.
 This function return the codelet implementation to be executed
 when executing the task.
 
-\fn void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t *deps,	void (*callback)(void *), void *callback_arg)
+\fn void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t *deps, void (*callback)(void *), void *callback_arg)
 \ingroup API_Codelet_And_Tasks
 This creates (and submits) an empty task that unlocks a tag once all
 its dependencies are fulfilled.

+ 4 - 4
doc/doxygen/chapters/api/cuda_extensions.doxy

@@ -26,7 +26,7 @@ create its own streams. Synchronizing with cudaThreadSynchronize() is
 allowed, but will reduce the likelihood of having all transfers
 overlapped.
 
-\fn const struct cudaDeviceProp * starpu_cuda_get_device_properties(unsigned workerid)
+\fn const struct cudaDeviceProp *starpu_cuda_get_device_properties(unsigned workerid)
 \ingroup API_CUDA_Extensions
 This function returns a pointer to device properties for worker
 \p workerid (assumed to be a CUDA worker).
@@ -35,11 +35,11 @@ This function returns a pointer to device properties for worker
 \ingroup API_CUDA_Extensions
 Report a CUDA error.
 
-\def STARPU_CUDA_REPORT_ERROR (cudaError_t status)
+\def STARPU_CUDA_REPORT_ERROR(cudaError_t status)
 \ingroup API_CUDA_Extensions
 Calls starpu_cuda_report_error(), passing the current function, file and line position.
 
-\fn int starpu_cuda_copy_async_sync (void *src_ptr, unsigned src_node, void *dst_ptr, unsigned dst_node, size_t ssize, cudaStream_t stream, enum cudaMemcpyKind kind)
+\fn int starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node, void *dst_ptr, unsigned dst_node, size_t ssize, cudaStream_t stream, enum cudaMemcpyKind kind)
 \ingroup API_CUDA_Extensions
 Copy \p ssize bytes from the pointer \p src_ptr on \p src_node
 to the pointer \p dst_ptr on \p dst_node. The function first tries to
@@ -72,7 +72,7 @@ every CUDA device.
 \ingroup API_CUDA_Extensions
 Report a cublas error.
 
-\def STARPU_CUBLAS_REPORT_ERROR (cublasStatus status)
+\def STARPU_CUBLAS_REPORT_ERROR(cublasStatus status)
 \ingroup API_CUDA_Extensions
 Calls starpu_cublas_report_error(), passing the current
 function, file and line position.

+ 4 - 4
doc/doxygen/chapters/api/data_interfaces.doxy

@@ -671,12 +671,12 @@ row pointers...) of the matrix desginated by \p handle.
 Return a pointer to the non-zero values of the matrix
 designated by \p handle.
 
-\fn uint32_t * starpu_bcsr_get_local_colind(starpu_data_handle_t handle)
+\fn uint32_t *starpu_bcsr_get_local_colind(starpu_data_handle_t handle)
 \ingroup API_Data_Interfaces
 Return a pointer to the column index, which holds the positions
 of the non-zero entries in the matrix designated by \p handle.
 
-\fn uint32_t * starpu_bcsr_get_local_rowptr(starpu_data_handle_t handle)
+\fn uint32_t *starpu_bcsr_get_local_rowptr(starpu_data_handle_t handle)
 \ingroup API_Data_Interfaces
 Return the row pointer array of the matrix designated by
 \p handle.
@@ -780,12 +780,12 @@ row pointers...) of the matrix designated by \p handle.
 Return a local pointer to the non-zero values of the matrix
 designated by \p handle.
 
-\fn uint32_t * starpu_csr_get_local_colind(starpu_data_handle_t handle)
+\fn uint32_t *starpu_csr_get_local_colind(starpu_data_handle_t handle)
 \ingroup API_Data_Interfaces
 Return a local pointer to the column index of the matrix
 designated by \p handle.
 
-\fn uint32_t * starpu_csr_get_local_rowptr(starpu_data_handle_t handle)
+\fn uint32_t *starpu_csr_get_local_rowptr(starpu_data_handle_t handle)
 \ingroup API_Data_Interfaces
 Return a local pointer to the row pointer array of the matrix
 designated by \p handle.

+ 9 - 9
doc/doxygen/chapters/api/data_partition.doxy

@@ -71,7 +71,7 @@ This function returns the number of children.
 Return the ith child of the given \p handle, which must have been
 partitionned beforehand.
 
-\fn starpu_data_handle_t starpu_data_get_sub_data (starpu_data_handle_t root_data, unsigned depth, ... )
+\fn starpu_data_handle_t starpu_data_get_sub_data(starpu_data_handle_t root_data, unsigned depth, ... )
 \ingroup API_Data_Partition
 After partitioning a StarPU data by applying a filter,
 starpu_data_get_sub_data() can be used to get handles for each of the
@@ -192,13 +192,13 @@ functions for block data. Examples on how to use them are shown in
 <c>starpu_data_filters.h</c>. A usage example is available in
 examples/filters/shadow3d.c
 
-\fn void starpu_block_filter_block (void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
+\fn void starpu_block_filter_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
 This partitions a block along the X dimension, thus getting
 (x/\p nparts ,y,z) 3D matrices. If \p nparts does not divide x, the last
 submatrix contains the remainder.
 
-\fn void starpu_block_filter_block_shadow (void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
+\fn void starpu_block_filter_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
 This partitions a block along the X dimension, with a
 shadow border <c>filter_arg_ptr</c>, thus getting
@@ -207,13 +207,13 @@ divide x, the last submatrix contains the remainder. <b>IMPORTANT</b>:
 This can only be used for read-only access, as no coherency is
 enforced for the shadowed parts.
 
-\fn void starpu_block_filter_vertical_block (void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
+\fn void starpu_block_filter_vertical_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
 This partitions a block along the Y dimension, thus getting
 (x,y/\p nparts ,z) blocks. If \p nparts does not divide y, the last
 submatrix contains the remainder.
 
-\fn void starpu_block_filter_vertical_block_shadow (void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
+\fn void starpu_block_filter_vertical_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
 This partitions a block along the Y dimension, with a
 shadow border <c>filter_arg_ptr</c>, thus getting
@@ -222,13 +222,13 @@ divide y, the last submatrix contains the remainder. <b>IMPORTANT</b>:
 This can only be used for read-only access, as no coherency is
 enforced for the shadowed parts.
 
-\fn void starpu_block_filter_depth_block (void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
+\fn void starpu_block_filter_depth_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
 This partitions a block along the Z dimension, thus getting
 (x,y,z/\p nparts) blocks. If \p nparts does not divide z, the last
 submatrix contains the remainder.
 
-\fn void starpu_block_filter_depth_block_shadow (void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
+\fn void starpu_block_filter_depth_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
 This partitions a block along the Z dimension, with a
 shadow border <c>filter_arg_ptr</c>, thus getting
@@ -245,11 +245,11 @@ functions for BCSR data. Examples on how to use them are shown in
 \ref PartitioningData. The complete list can be found in the file
 <c>starpu_data_filters.h</c>.
 
-\fn void starpu_bcsr_filter_canonical_block (void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
+\fn void starpu_bcsr_filter_canonical_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
 This partitions a block-sparse matrix into dense matrices.
 
-\fn void starpu_csr_filter_vertical_block (void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
+\fn void starpu_csr_filter_vertical_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
 This partitions a block-sparse matrix into vertical
 block-sparse matrices.

+ 1 - 1
doc/doxygen/chapters/api/explicit_dependencies.doxy

@@ -99,7 +99,7 @@ This function releases the resources associated to tag \p id.
 It can be called once the corresponding task has been executed and
 when there is no other tag that depend on this tag anymore.
 
-\fn void starpu_tag_notify_from_apps (starpu_tag_t id)
+\fn void starpu_tag_notify_from_apps(starpu_tag_t id)
 \ingroup API_Explicit_Dependencies
 This function explicitly unlocks tag \p id. It may be useful in
 the case of applications which execute part of their computation

+ 1 - 1
doc/doxygen/chapters/api/insert_task.doxy

@@ -90,7 +90,7 @@ Pack arguments of type ::STARPU_VALUE into a buffer which can be
 given to a codelet and later unpacked with the function
 starpu_codelet_unpack_args().
 
-\fn void starpu_codelet_unpack_args (void *cl_arg, ...)
+\fn void starpu_codelet_unpack_args(void *cl_arg, ...)
 \ingroup API_Insert_Task
 Retrieve the arguments of type ::STARPU_VALUE associated to a
 task automatically created using the function starpu_insert_task().

+ 7 - 7
doc/doxygen/chapters/api/lower_bound.doxy

@@ -11,36 +11,36 @@
 \brief Compute theoretical upper computation efficiency bound
 corresponding to some actual execution.
 
-\fn void starpu_bound_start (int deps, int prio)
+\fn void starpu_bound_start(int deps, int prio)
 \ingroup API_Theoretical_Lower_Bound_on_Execution_Time
 Start recording tasks (resets stats). \p deps tells whether
 dependencies should be recorded too (this is quite expensive)
 
-\fn void starpu_bound_stop (void)
+\fn void starpu_bound_stop(void)
 \ingroup API_Theoretical_Lower_Bound_on_Execution_Time
 Stop recording tasks
 
-\fn void starpu_bound_print_dot (FILE *output)
+\fn void starpu_bound_print_dot(FILE *output)
 \ingroup API_Theoretical_Lower_Bound_on_Execution_Time
 Print the DAG that was recorded
 
-\fn void starpu_bound_compute (double *res, double *integer_res, int integer)
+\fn void starpu_bound_compute(double *res, double *integer_res, int integer)
 \ingroup API_Theoretical_Lower_Bound_on_Execution_Time
 Get theoretical upper bound (in ms) (needs glpk support
 detected by configure script). It returns 0 if some performance models
 are not calibrated.
 
-\fn void starpu_bound_print_lp (FILE *output)
+\fn void starpu_bound_print_lp(FILE *output)
 \ingroup API_Theoretical_Lower_Bound_on_Execution_Time
 Emit the Linear Programming system on \p output for the recorded
 tasks, in the lp format
 
-\fn void starpu_bound_print_mps (FILE *output)
+\fn void starpu_bound_print_mps(FILE *output)
 \ingroup API_Theoretical_Lower_Bound_on_Execution_Time
 Emit the Linear Programming system on \p output for the recorded
 tasks, in the mps format
 
-\fn void starpu_bound_print (FILE *output, int integer)
+\fn void starpu_bound_print(FILE *output, int integer)
 \ingroup API_Theoretical_Lower_Bound_on_Execution_Time
 Emit statistics of actual execution vs theoretical upper bound.
 \p integer permits to choose between integer solving (which takes a

+ 30 - 30
doc/doxygen/chapters/api/mpi.doxy

@@ -11,21 +11,21 @@
 @name Initialisation
 \ingroup API_MPI_Support
 
-\fn int starpu_mpi_init (int *argc, char ***argv, int initialize_mpi)
+\fn int starpu_mpi_init(int *argc, char ***argv, int initialize_mpi)
 \ingroup API_MPI_Support
 Initializes the starpumpi library. \p initialize_mpi indicates if MPI
 should be initialized or not by StarPU. If the value is not 0, MPI
 will be initialized by calling <c>MPI_Init_Thread(argc, argv,
 MPI_THREAD_SERIALIZED, ...)</c>.
 
-\fn int starpu_mpi_initialize (void)
+\fn int starpu_mpi_initialize(void)
 \deprecated
 \ingroup API_MPI_Support
 This function has been made deprecated. One should use instead the
 function starpu_mpi_init(). This function does not call MPI_Init(), it
 should be called beforehand.
 
-\fn int starpu_mpi_initialize_extended (int *rank, int *world_size)
+\fn int starpu_mpi_initialize_extended(int *rank, int *world_size)
 \deprecated
 \ingroup API_MPI_Support
 This function has been made deprecated. One should use instead the
@@ -33,13 +33,13 @@ function starpu_mpi_init(). MPI will be initialized by starpumpi by
 calling <c>MPI_Init_Thread(argc, argv, MPI_THREAD_SERIALIZED,
 ...)</c>.
 
-\fn int starpu_mpi_shutdown (void)
+\fn int starpu_mpi_shutdown(void)
 \ingroup API_MPI_Support
 Cleans the starpumpi library. This must be called between calling
 starpu_mpi functions and starpu_shutdown(). MPI_Finalize() will be
 called if StarPU-MPI has been initialized by starpu_mpi_init().
 
-\fn void starpu_mpi_comm_amounts_retrieve (size_t *comm_amounts)
+\fn void starpu_mpi_comm_amounts_retrieve(size_t *comm_amounts)
 \ingroup API_MPI_Support
 Retrieve the current amount of communications from the current node in
 the array \p comm_amounts which must have a size greater or equal to
@@ -50,33 +50,33 @@ the world size. Communications statistics must be enabled (see
 \anchor MPIPtpCommunication
 \ingroup API_MPI_Support
 
-\fn int starpu_mpi_send (starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm)
+\fn int starpu_mpi_send(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm)
 \ingroup API_MPI_Support
 Performs a standard-mode, blocking send of \p data_handle to the node
 \p dest using the message tag \p mpi_tag within the communicator \p
 comm.
 
-\fn int starpu_mpi_recv (starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, MPI_Status *status)
+\fn int starpu_mpi_recv(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, MPI_Status *status)
 \ingroup API_MPI_Support
 Performs a standard-mode, blocking receive in \p data_handle from the
 node \p source using the message tag \p mpi_tag within the
 communicator \p comm.
 
-\fn int starpu_mpi_isend (starpu_data_handle_t data_handle, starpu_mpi_req *req, int dest, int mpi_tag, MPI_Comm comm)
+\fn int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *req, int dest, int mpi_tag, MPI_Comm comm)
 \ingroup API_MPI_Support
 Posts a standard-mode, non blocking send of \p data_handle to the node
 \p dest using the message tag \p mpi_tag within the communicator \p
 comm. After the call, the pointer to the request \p req can be used to
 test or to wait for the completion of the communication.
 
-\fn int starpu_mpi_irecv (starpu_data_handle_t data_handle, starpu_mpi_req *req, int source, int mpi_tag, MPI_Comm comm)
+\fn int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *req, int source, int mpi_tag, MPI_Comm comm)
 \ingroup API_MPI_Support
 Posts a nonblocking receive in \p data_handle from the node \p source
 using the message tag \p mpi_tag within the communicator \p comm.
 After the call, the pointer to the request \p req can be used to test
 or to wait for the completion of the communication.
 
-\fn int starpu_mpi_isend_detached (starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
+\fn int starpu_mpi_isend_detached(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
 \ingroup API_MPI_Support
 Posts a standard-mode, non blocking send of \p data_handle to the node
 \p dest using the message tag \p mpi_tag within the communicator \p
@@ -87,7 +87,7 @@ communication completes, its resources are automatically released back
 to the system, there is no need to test or to wait for the completion
 of the request.
 
-\fn int starpu_mpi_irecv_detached (starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
+\fn int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
 \ingroup API_MPI_Support
 Posts a nonblocking receive in \p data_handle from the node \p source
 using the message tag \p mpi_tag within the communicator \p comm. On
@@ -98,34 +98,34 @@ communication completes, its resources are automatically released back
 to the system, there is no need to test or to wait for the completion
 of the request.
 
-\fn int starpu_mpi_wait (starpu_mpi_req *req, MPI_Status *status)
+\fn int starpu_mpi_wait(starpu_mpi_req *req, MPI_Status *status)
 \ingroup API_MPI_Support
 Returns when the operation identified by request \p req is complete.
 
-\fn int starpu_mpi_test (starpu_mpi_req *req, int *flag, MPI_Status *status)
+\fn int starpu_mpi_test(starpu_mpi_req *req, int *flag, MPI_Status *status)
 \ingroup API_MPI_Support
 If the operation identified by \p req is complete, set \p flag to 1.
 The \p status object is set to contain information on the completed
 operation.
 
-\fn int starpu_mpi_barrier (MPI_Comm comm)
+\fn int starpu_mpi_barrier(MPI_Comm comm)
 \ingroup API_MPI_Support
 Blocks the caller until all group members of the communicator \p comm
 have called it.
 
-\fn int starpu_mpi_isend_detached_unlock_tag (starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm, starpu_tag_t tag)
+\fn int starpu_mpi_isend_detached_unlock_tag(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm, starpu_tag_t tag)
 \ingroup API_MPI_Support
 Posts a standard-mode, non blocking send of \p data_handle to the node
 \p dest using the message tag \p mpi_tag within the communicator \p
 comm. On completion, \p tag is unlocked.
 
-\fn int starpu_mpi_irecv_detached_unlock_tag (starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, starpu_tag_t tag)
+\fn int starpu_mpi_irecv_detached_unlock_tag(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, starpu_tag_t tag)
 \ingroup API_MPI_Support
 Posts a nonblocking receive in \p data_handle from the node \p source
 using the message tag \p mpi_tag within the communicator \p comm. On
 completion, \p tag is unlocked.
 
-\fn int starpu_mpi_isend_array_detached_unlock_tag (unsigned array_size, starpu_data_handle_t *data_handle, int *dest, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag)
+\fn int starpu_mpi_isend_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *dest, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag)
 \ingroup API_MPI_Support
 Posts \p array_size standard-mode, non blocking send. Each post sends
 the n-th data of the array \p data_handle to the n-th node of the
@@ -133,7 +133,7 @@ array \p dest using the n-th message tag of the array \p mpi_tag
 within the n-th communicator of the array \p comm. On completion of
 the all the requests, \p tag is unlocked.
 
-\fn int starpu_mpi_irecv_array_detached_unlock_tag (unsigned array_size, starpu_data_handle_t *data_handle, int *source, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag)
+\fn int starpu_mpi_irecv_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *source, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag)
 \ingroup API_MPI_Support
 Posts \p array_size non-blocking receives. Each post receives in the n-th
 data of the array \p data_handle from the n-th node of the array \p
@@ -144,14 +144,14 @@ requests, \p tag is unlocked.
 @name Communication Cache
 \ingroup API_MPI_Support
 
-\fn void starpu_mpi_cache_flush (MPI_Comm comm, starpu_data_handle_t data_handle)
+\fn void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle)
 \ingroup API_MPI_Support
 Clear the send and receive communication cache for the data
 \p data_handle. The function has to be called synchronously by all the
 MPI nodes. The function does nothing if the cache mechanism is
 disabled (see \ref STARPU_MPI_CACHE).
 
-\fn void starpu_mpi_cache_flush_all_data (MPI_Comm comm)
+\fn void starpu_mpi_cache_flush_all_data(MPI_Comm comm)
 \ingroup API_MPI_Support
 Clear the send and receive communication cache for all data. The
 function has to be called synchronously by all the MPI nodes. The
@@ -162,21 +162,21 @@ function does nothing if the cache mechanism is disabled (see
 \anchor MPIInsertTask
 \ingroup API_MPI_Support
 
-\fn int starpu_data_set_tag (starpu_data_handle_t handle, int tag)
+\fn int starpu_data_set_tag(starpu_data_handle_t handle, int tag)
 \ingroup API_MPI_Support
 Tell StarPU-MPI which MPI tag to use when exchanging the data.
 
-\fn int starpu_data_get_tag (starpu_data_handle_t handle)
+\fn int starpu_data_get_tag(starpu_data_handle_t handle)
 \ingroup API_MPI_Support
 Returns the MPI tag to be used when exchanging the data.
 
-\fn int starpu_data_set_rank (starpu_data_handle_t handle, int rank)
+\fn int starpu_data_set_rank(starpu_data_handle_t handle, int rank)
 \ingroup API_MPI_Support
 Tell StarPU-MPI which MPI node "owns" a given data, that is, the node
 which will always keep an up-to-date value, and will by default
 execute tasks which write to it.
 
-\fn int starpu_data_get_rank (starpu_data_handle_t handle)
+\fn int starpu_data_get_rank(starpu_data_handle_t handle)
 \ingroup API_MPI_Support
 Returns the last value set by starpu_data_set_rank().
 
@@ -192,7 +192,7 @@ this macro is used when calling starpu_mpi_insert_task(), and must be
 followed by a data handle to specify that the node owning the given
 data will execute the codelet.
 
-\fn int starpu_mpi_insert_task (MPI_Comm comm, struct starpu_codelet *codelet, ...)
+\fn int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 \ingroup API_MPI_Support
 Create and submit a task corresponding to codelet with the following
 arguments. The argument list must be zero-terminated.
@@ -230,13 +230,13 @@ The algorithm also includes a communication cache mechanism that
 allows not to send data twice to the same MPI node, unless the data
 has been modified. The cache can be disabled (see \ref STARPU_MPI_CACHE).
 
-\fn void starpu_mpi_get_data_on_node (MPI_Comm comm, starpu_data_handle_t data_handle, int node)
+\fn void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle, int node)
 \ingroup API_MPI_Support
 Transfer data \p data_handle to MPI node \p node, sending it from its
 owner if needed. At least the target node and the owner have to call
 the function.
 
-\fn void starpu_mpi_get_data_on_node_detached (MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg)
+\fn void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg)
 \ingroup API_MPI_Support
 Transfer data \p data_handle to MPI node \p node, sending it from its
 owner if needed. At least the target node and the owner have to call
@@ -247,12 +247,12 @@ the argument \p arg.
 \anchor MPICollectiveOperations
 \ingroup API_MPI_Support
 
-\fn void starpu_mpi_redux_data (MPI_Comm comm, starpu_data_handle_t data_handle)
+\fn void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
 \ingroup API_MPI_Support
 Perform a reduction on the given data. All nodes send the data to its
 owner node which will perform a reduction.
 
-\fn int starpu_mpi_scatter_detached (starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg)
+\fn int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg)
 \ingroup API_MPI_Support
 Scatter data among processes of the communicator based on the
 ownership of the data. For each data of the array \p data_handles, the
@@ -263,7 +263,7 @@ called with the argument \p sarg on the process \p root, the \p
 rcallback function is called with the argument \p rarg on any other
 process.
 
-\fn int starpu_mpi_gather_detached (starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg)
+\fn int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg)
 \ingroup API_MPI_Support
 Gather data from the different processes of the communicator onto the
 process \p root. Each process owning data handle in the array

+ 5 - 5
doc/doxygen/chapters/api/opencl_extensions.doxy

@@ -82,11 +82,11 @@ starpu_opencl_program array by hand for more advanced use (e.g.
 different programs on the different OpenCL devices, for relocation
 purpose for instance).
 
-\fn int starpu_opencl_load_opencl_from_file(const char *source_file_name, struct starpu_opencl_program *opencl_programs, const char* build_options)
+\fn int starpu_opencl_load_opencl_from_file(const char *source_file_name, struct starpu_opencl_program *opencl_programs, const char *build_options)
 \ingroup API_OpenCL_Extensions
 This function compiles an OpenCL source code stored in a file.
 
-\fn int starpu_opencl_load_opencl_from_string(const char *opencl_program_source, struct starpu_opencl_program *opencl_programs, const char* build_options)
+\fn int starpu_opencl_load_opencl_from_string(const char *opencl_program_source, struct starpu_opencl_program *opencl_programs, const char *build_options)
 \ingroup API_OpenCL_Extensions
 This function compiles an OpenCL source code stored in a string.
 
@@ -107,7 +107,7 @@ has been located on the system, \p located_dir_name the directory
 where it has been located. Otherwise, they are both set to the empty
 string.
 
-\fn int starpu_opencl_compile_opencl_from_file(const char *source_file_name, const char * build_options)
+\fn int starpu_opencl_compile_opencl_from_file(const char *source_file_name, const char *build_options)
 \ingroup API_OpenCL_Extensions
 Compile the OpenCL kernel stored in the file \p source_file_name
 with the given options \p build_options and stores the result in the
@@ -116,7 +116,7 @@ directory <c>$STARPU_HOME/.starpu/opencl</c> with the same filename as
 and the filename is suffixed with the vendor id and the device id of
 the OpenCL device.
 
-\fn int starpu_opencl_compile_opencl_from_string(const char *opencl_program_source, const char *file_name, const char*build_options)
+\fn int starpu_opencl_compile_opencl_from_string(const char *opencl_program_source, const char *file_name, const char *build_options)
 \ingroup API_OpenCL_Extensions
 Compile the OpenCL kernel in the string \p opencl_program_source
 with the given options \p build_options and stores the result in the
@@ -158,7 +158,7 @@ consumed power).
 @name OpenCL utilities
 \ingroup API_OpenCL_Extensions
 
-\fn const char * starpu_opencl_error_string(cl_int status)
+\fn const char *starpu_opencl_error_string(cl_int status)
 \ingroup API_OpenCL_Extensions
 Return the error message in English corresponding to \p status, an OpenCL
 error code.

+ 1 - 1
doc/doxygen/chapters/api/parallel_tasks.doxy

@@ -42,7 +42,7 @@ Get the description of a combined worker
 Variant of starpu_worker_can_execute_task() compatible with combined
 workers
 
-\fn void starpu_parallel_task_barrier_init(struct starpu_task*task, int workerid)
+\fn void starpu_parallel_task_barrier_init(struct starpu_task *task, int workerid)
 \ingroup API_Parallel_Tasks
 Initialise the barrier for the parallel task, and dispatch the task
 between the different combined workers.

+ 7 - 7
doc/doxygen/chapters/api/scheduling_context_hypervisor.doxy

@@ -114,7 +114,7 @@ performance counters to StarPU. By incrementing them, StarPU can help
 the hypervisor in the resizing decision making process. TODO maybe
 they should be hidden to the user
 
-\fn struct starpu_sched_ctx_performance_counters *sc_hypervisor_init(struct sc_hypervisor_policy * policy)
+\fn struct starpu_sched_ctx_performance_counters *sc_hypervisor_init(struct sc_hypervisor_policy *policy)
 \ingroup API_Scheduling_Context_Hypervisor
 Initializes the hypervisor to use the strategy provided as parameter
 and creates the performance counters (see starpu_sched_ctx_performance_counters).
@@ -148,7 +148,7 @@ flops the context will execute (needed for Gflops rate based strategy
 see \ref ResizingStrategies or any other custom strategy needing it, for
 the others we can pass 0.0)
 
-\fn void sc_hypervisor_unregister_ctx (unsigned sched_ctx)
+\fn void sc_hypervisor_unregister_ctx(unsigned sched_ctx)
 \ingroup API_Scheduling_Context_Hypervisor
 Unregister the context from the hypervisor.
 
@@ -268,11 +268,11 @@ struct sc_hypervisor_policy dummy_policy =
 \ingroup API_Scheduling_Context_Hypervisor
     Moves workers from one context to another
 
-\fn struct sc_hypervisor_policy_config * sc_hypervisor_get_config(unsigned sched_ctx);
+\fn struct sc_hypervisor_policy_config *sc_hypervisor_get_config(unsigned sched_ctx);
 \ingroup API_Scheduling_Context_Hypervisor
     Returns the configuration structure of a context
 
-\fn int * sc_hypervisor_get_sched_ctxs();
+\fn int *sc_hypervisor_get_sched_ctxs();
 \ingroup API_Scheduling_Context_Hypervisor
     Gets the contexts managed by the hypervisor
 
@@ -280,15 +280,15 @@ struct sc_hypervisor_policy dummy_policy =
 \ingroup API_Scheduling_Context_Hypervisor
     Gets the number of contexts managed by the hypervisor
 
-\fn struct sc_hypervisor_wrapper * sc_hypervisor_get_wrapper(unsigned sched_ctx);
+\fn struct sc_hypervisor_wrapper *sc_hypervisor_get_wrapper(unsigned sched_ctx);
 \ingroup API_Scheduling_Context_Hypervisor
     Returns the wrapper corresponding to the context \p sched_ctx
 
-\fn double sc_hypervisor_get_elapsed_flops_per_sched_ctx(struct sc_hypervisor_wrapper * sc_w);
+\fn double sc_hypervisor_get_elapsed_flops_per_sched_ctx(struct sc_hypervisor_wrapper *sc_w);
 \ingroup API_Scheduling_Context_Hypervisor
     Returns the flops of a context elapsed from the last resize
 
-\fn char * sc_hypervisor_get_policy();
+\fn char *sc_hypervisor_get_policy();
 \ingroup API_Scheduling_Context_Hypervisor
     Returns the name of the resizing policy the hypervisor uses
 

+ 2 - 2
doc/doxygen/chapters/api/scheduling_contexts.doxy

@@ -212,7 +212,7 @@ policy of the given scheduler context.
 @name Scheduling Context Worker Collection
 \ingroup API_Scheduling_Contexts
 
-\fn struct starpu_worker_collection* starpu_sched_ctx_create_worker_collection(unsigned sched_ctx_id, enum starpu_worker_collection_type type)
+\fn struct starpu_worker_collection *starpu_sched_ctx_create_worker_collection(unsigned sched_ctx_id, enum starpu_worker_collection_type type)
 \ingroup API_Scheduling_Contexts
 Create a worker collection of the type indicated by the last parameter
 for the context specified through the first parameter.
@@ -221,7 +221,7 @@ for the context specified through the first parameter.
 \ingroup API_Scheduling_Contexts
 Delete the worker collection of the specified scheduling context
 
-\fn struct starpu_worker_collection* starpu_sched_ctx_get_worker_collection(unsigned sched_ctx_id)
+\fn struct starpu_worker_collection *starpu_sched_ctx_get_worker_collection(unsigned sched_ctx_id)
 \ingroup API_Scheduling_Contexts
 Return the worker collection managed by the indicated context
 

+ 4 - 4
doc/doxygen/chapters/api/scheduling_policy.doxy

@@ -58,7 +58,7 @@ starpu_init().
 \var starpu_sched_policy::policy_description
         Optional field. Human readable description of the policy.
 
-\fn struct starpu_sched_policy ** starpu_sched_get_predefined_policies()
+\fn struct starpu_sched_policy **starpu_sched_get_predefined_policies()
 \ingroup API_Scheduling_Policy
 Return a NULL-terminated array of all the predefined scheduling
 policies.
@@ -73,13 +73,13 @@ condition variable. For instance, in the case of a scheduling strategy
 with a single task queue, the same condition variable would be used to
 block and wake up all workers.
 
-\fn void starpu_sched_ctx_set_policy_data(unsigned sched_ctx_id, void * policy_data)
+\fn void starpu_sched_ctx_set_policy_data(unsigned sched_ctx_id, void *policy_data)
 \ingroup API_Scheduling_Policy
 Each scheduling policy uses some specific data (queues, variables,
 additional condition variables). This data is memorized in a local
 structure. This function assigns it to a scheduling context.
 
-\fn void* starpu_sched_ctx_get_policy_data(unsigned sched_ctx_id)
+\fn void *starpu_sched_ctx_get_policy_data(unsigned sched_ctx_id)
 \ingroup API_Scheduling_Policy
 Returns the policy data previously assigned to a context
 
@@ -135,7 +135,7 @@ otherwise the task may fail to execute.
 \ingroup API_Scheduling_Policy
 Return the current date in micro-seconds.
 
-\fn uint32_t starpu_task_footprint(struct starpu_perfmodel *model, struct starpu_task * task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+\fn uint32_t starpu_task_footprint(struct starpu_perfmodel *model, struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
 \ingroup API_Scheduling_Policy
 Returns the footprint for a given task
 

+ 7 - 7
doc/doxygen/chapters/api/task_bundles.doxy

@@ -15,12 +15,12 @@ on the same worker whenever it’s possible. It must be considered as a
 hint given to the scheduler as there is no guarantee that they will be
 executed on the same worker.
 
-\fn void starpu_task_bundle_create (starpu_task_bundle_t *bundle)
+\fn void starpu_task_bundle_create(starpu_task_bundle_t *bundle)
 \ingroup API_Task_Bundles
 Factory function creating and initializing \p bundle, when the call
 returns, memory needed is allocated and \p bundle is ready to use.
 
-\fn int starpu_task_bundle_insert (starpu_task_bundle_t bundle, struct starpu_task *task)
+\fn int starpu_task_bundle_insert(starpu_task_bundle_t bundle, struct starpu_task *task)
 \ingroup API_Task_Bundles
 Insert \p task in \p bundle. Until \p task is removed from \p bundle
 its expected length and data transfer time will be considered along
@@ -30,7 +30,7 @@ On success, it returns 0. There are two cases of error : if \p bundle
 is already closed it returns <c>-EPERM</c>, if \p task was already
 submitted it returns <c>-EINVAL</c>.
 
-\fn int starpu_task_bundle_remove (starpu_task_bundle_t bundle, struct starpu_task *task)
+\fn int starpu_task_bundle_remove(starpu_task_bundle_t bundle, struct starpu_task *task)
 \ingroup API_Task_Bundles
 Remove \p task from \p bundle. Of course \p task must have been
 previously inserted in \p bundle. This function must not be called if
@@ -38,21 +38,21 @@ previously inserted in \p bundle. This function must not be called if
 so would result in undefined behaviour. On success, it returns 0. If
 \p bundle is already closed it returns <c>-ENOENT</c>.
 
-\fn void starpu_task_bundle_close (starpu_task_bundle_t bundle)
+\fn void starpu_task_bundle_close(starpu_task_bundle_t bundle)
 \ingroup API_Task_Bundles
 Inform the runtime that the user will not modify \p bundle anymore, it
 means no more inserting or removing task. Thus the runtime can destroy
 it when possible.
 
-\fn double starpu_task_bundle_expected_length (starpu_task_bundle_t bundle, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+\fn double starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, enum starpu_perfmodel_archtype arch, unsigned nimpl)
 \ingroup API_Task_Bundles
 Return the expected duration of \p bundle in micro-seconds.
 
-\fn double starpu_task_bundle_expected_power (starpu_task_bundle_t bundle, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+\fn double starpu_task_bundle_expected_power(starpu_task_bundle_t bundle, enum starpu_perfmodel_archtype arch, unsigned nimpl)
 \ingroup API_Task_Bundles
 Return the expected power consumption of \p bundle in J.
 
-\fn double starpu_task_bundle_expected_data_transfer_time (starpu_task_bundle_t bundle, unsigned memory_node)
+\fn double starpu_task_bundle_expected_data_transfer_time(starpu_task_bundle_t bundle, unsigned memory_node)
 \ingroup API_Task_Bundles
 Return the time (in micro-seconds) expected to transfer all data used within \p bundle.
 

+ 7 - 7
doc/doxygen/chapters/api/task_lists.doxy

@@ -28,11 +28,11 @@ Push \p task at the front of \p list
 \ingroup API_Task_Lists
 Push \p task at the back of \p list
 
-\fn struct starpu_task * starpu_task_list_front(struct starpu_task_list *list)
+\fn struct starpu_task *starpu_task_list_front(struct starpu_task_list *list)
 \ingroup API_Task_Lists
 Get the front of \p list (without removing it)
 
-\fn struct starpu_task * starpu_task_list_back(struct starpu_task_list *list)
+\fn struct starpu_task *starpu_task_list_back(struct starpu_task_list *list)
 \ingroup API_Task_Lists
 Get the back of \p list (without removing it)
 
@@ -44,23 +44,23 @@ Test if \p list is empty
 \ingroup API_Task_Lists
 Remove \p task from \p list
 
-\fn struct starpu_task * starpu_task_list_pop_front(struct starpu_task_list *list)
+\fn struct starpu_task *starpu_task_list_pop_front(struct starpu_task_list *list)
 \ingroup API_Task_Lists
 Remove the element at the front of \p list
 
-\fn struct starpu_task * starpu_task_list_pop_back(struct starpu_task_list *list)
+\fn struct starpu_task *starpu_task_list_pop_back(struct starpu_task_list *list)
 \ingroup API_Task_Lists
 Remove the element at the back of \p list
 
-\fn struct starpu_task * starpu_task_list_begin(struct starpu_task_list *list)
+\fn struct starpu_task *starpu_task_list_begin(struct starpu_task_list *list)
 \ingroup API_Task_Lists
 Get the first task of \p list.
 
-\fn struct starpu_task * starpu_task_list_end(struct starpu_task_list *list)
+\fn struct starpu_task *starpu_task_list_end(struct starpu_task_list *list)
 \ingroup API_Task_Lists
 Get the end of \p list.
 
-\fn struct starpu_task * starpu_task_list_next(struct starpu_task *task)
+\fn struct starpu_task *starpu_task_list_next(struct starpu_task *task)
 \ingroup API_Task_Lists
 Get the next task of \p list. This is not erase-safe.
 

+ 44 - 44
doc/doxygen/chapters/api/top.doxy

@@ -9,62 +9,62 @@
 /*! \defgroup API_StarPUTop_Interface StarPU-Top Interface
 
 \enum starpu_top_data_type
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 StarPU-Top Data type
 \var starpu_top_data_type::STARPU_TOP_DATA_BOOLEAN
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 \var starpu_top_data_type::STARPU_TOP_DATA_INTEGER
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 \var starpu_top_data_type::STARPU_TOP_DATA_FLOAT
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 
 \enum starpu_top_param_type
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 StarPU-Top Parameter type
 \var starpu_top_param_type::STARPU_TOP_PARAM_BOOLEAN
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 \var starpu_top_param_type::STARPU_TOP_PARAM_INTEGER
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 \var starpu_top_param_type::STARPU_TOP_PARAM_FLOAT
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 \var starpu_top_param_type::STARPU_TOP_PARAM_ENUM
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 
 \enum starpu_top_message_type
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 StarPU-Top Message type
 \var starpu_top_message_type::TOP_TYPE_GO
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 \var starpu_top_message_type::TOP_TYPE_SET
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 \var starpu_top_message_type::TOP_TYPE_CONTINUE
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 \var starpu_top_message_type::TOP_TYPE_ENABLE
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 \var starpu_top_message_type::TOP_TYPE_DISABLE
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 \var starpu_top_message_type::TOP_TYPE_DEBUG
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 \var starpu_top_message_type::TOP_TYPE_UNKNOW
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 
 \struct starpu_top_data
 todo
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 \var starpu_top_data::id
 todo
 \var starpu_top_data::name
@@ -86,7 +86,7 @@ todo
 
 \struct starpu_top_param
 todo
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 \var starpu_top_param::id
 todo
 \var starpu_top_param::name
@@ -113,98 +113,98 @@ todo
 todo
 
 @name Functions to call before the initialisation
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 
-\fn struct starpu_top_data *starpu_top_add_data_boolean(const char* data_name, int active)
-\ingroup API_StarPU-Top_Interface
+\fn struct starpu_top_data *starpu_top_add_data_boolean(const char *data_name, int active)
+\ingroup API_StarPUTop_Interface
 This function registers a data named \p data_name of type boolean.
 If \p active=0, the value will NOT be displayed to user by default.
 Any other value will make the value displayed by default.
 
-\fn struct starpu_top_data * starpu_top_add_data_integer(const char* data_name, int minimum_value, int maximum_value, int active)
-\ingroup API_StarPU-Top_Interface
+\fn struct starpu_top_data *starpu_top_add_data_integer(const char *data_name, int minimum_value, int maximum_value, int active)
+\ingroup API_StarPUTop_Interface
 This function registers a data named \p data_name of type integer. The
 minimum and maximum values will be useful to define the scale in the UI.
 If \p active=0, the value will NOT be displayed to user by default.
 Any other value will make the value displayed by default.
 
-\fn struct starpu_top_data* starpu_top_add_data_float(const char* data_name, double minimum_value, double maximum_value, int active)
-\ingroup API_StarPU-Top_Interface
+\fn struct starpu_top_data *starpu_top_add_data_float(const char *data_name, double minimum_value, double maximum_value, int active)
+\ingroup API_StarPUTop_Interface
 This function registers a data named \p data_name of type float. The
 minimum and maximum values will be useful to define the scale in the UI.
 If \p active=0, the value will NOT be displayed to user by default.
 Any other value will make the value displayed by default.
 
-\fn struct starpu_top_param* starpu_top_register_parameter_boolean(const char* param_name, int* parameter_field, void (*callback)(struct starpu_top_param*))
-\ingroup API_StarPU-Top_Interface
+\fn struct starpu_top_param *starpu_top_register_parameter_boolean(const char *param_name, int *parameter_field, void (*callback)(struct starpu_top_param*))
+\ingroup API_StarPUTop_Interface
 This function registers a parameter named \p param_name, of type
 boolean. The \p callback function will be called when the parameter is
 modified by the UI, and can be null.
 
-\fn struct starpu_top_param* starpu_top_register_parameter_float(const char* param_name, double* parameter_field, double minimum_value, double maximum_value, void (*callback)(struct starpu_top_param*))
-\ingroup API_StarPU-Top_Interface
+\fn struct starpu_top_param *starpu_top_register_parameter_float(const char *param_name, double *parameter_field, double minimum_value, double maximum_value, void (*callback)(struct starpu_top_param*))
+\ingroup API_StarPUTop_Interface
 This function registers a parameter named \p param_name, of type
 float. The minimum and maximum values will be used to prevent the user
 from setting an incorrect value. The \p callback function will be
 called when the parameter is modified by the UI, and can be null.
 
-\fn struct starpu_top_param* starpu_top_register_parameter_integer(const char* param_name, int* parameter_field, int minimum_value, int maximum_value, void (*callback)(struct starpu_top_param*))
-\ingroup API_StarPU-Top_Interface
+\fn struct starpu_top_param *starpu_top_register_parameter_integer(const char *param_name, int *parameter_field, int minimum_value, int maximum_value, void (*callback)(struct starpu_top_param*))
+\ingroup API_StarPUTop_Interface
 This function registers a parameter named \p param_name, of type
 integer. The minimum and maximum values will be used to prevent the
 user from setting an incorrect value. The \p callback function will be
 called when the parameter is modified by the UI, and can be null.
 
-\fn struct starpu_top_param* starpu_top_register_parameter_enum(const char* param_name, int* parameter_field, char** values, int nb_values, void (*callback)(struct starpu_top_param*))
-\ingroup API_StarPU-Top_Interface
+\fn struct starpu_top_param *starpu_top_register_parameter_enum(const char *param_name, int *parameter_field, char **values, int nb_values, void (*callback)(struct starpu_top_param*))
+\ingroup API_StarPUTop_Interface
 This function registers a parameter named \p param_name, of type enum.
 The list of allowed \p values will be used to prevent the user from
 setting an incorrect value. The \p callback function will be called
 when the parameter is modified by the UI, and can be null.
 
 @name Initialisation
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 
 \fn void starpu_top_init_and_wait(const char *server_name)
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 This function must be called when all parameters and data have been
 registered AND initialised (for parameters). This function will wait
 for a TOP to connect, send initialisation sentences, and wait for the
 GO message.
 
 @name To call after initialisation
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 
 \fn void starpu_top_update_parameter(const struct starpu_top_param *param)
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 This function should be called after every modification of a parameter
 from something other than starpu_top. This function notifies the UI
 that the configuration has changed.
 
 \fn void starpu_top_update_data_boolean(const struct starpu_top_data *data, int value)
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 This function updates the value of the starpu_top_data on UI.
 
 \fn void starpu_top_update_data_integer(const struct starpu_top_data *data, int value)
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 This function updates the value of the starpu_top_data on UI.
 
 \fn void starpu_top_update_data_float(const struct starpu_top_data *data, double value)
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 This function updates the value of the starpu_top_data on UI.
 
 \fn void starpu_top_task_prevision(struct starpu_task *task, int devid, unsigned long long start, unsigned long long end)
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 This function notifies the UI that the task has been planned to run from \p start to \p end, on the given computation core.
 
 \fn void starpu_top_debug_log(const char *message)
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 This function is useful in debug mode. The StarPU developer does not
 need to check if the debug mode is active: this is checked by
 starpu_top itself. It just sends a message to be displayed by the UI.
 
 \fn void starpu_top_debug_lock(const char *message)
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 This function is useful in debug mode. The StarPU developer does not
 need to check if the debug mode is active: this is checked by
 starpu_top itself. It sends a message and waits for a continue message

+ 1 - 1
doc/doxygen/chapters/api/workers.doxy

@@ -99,7 +99,7 @@ The returned value should be at most \ref STARPU_MAXSCCDEVS.
 This function returns the number of OpenCL devices controlled by
 StarPU. The returned value should be at most \ref STARPU_MAXOPENCLDEVS.
 
-\fn int starpu_worker_get_id (void)
+\fn int starpu_worker_get_id(void)
 \ingroup API_Workers_Properties
 This function returns the identifier of the current worker, i.e
 the one associated to the calling thread. The returned value is either

+ 30 - 133
doc/doxygen/chapters/basic_examples.doxy

@@ -12,7 +12,7 @@
 
 This section shows how to implement a simple program that submits a task
 to StarPU using the StarPU C extension (\ref cExtensions). The complete example, and additional examples,
-is available in the <c>gcc-plugin/examples</c> directory of the StarPU
+is available in the directory <c>gcc-plugin/examples</c> of the StarPU
 distribution. A similar example showing how to directly use the StarPU's API is shown
 in \ref HelloWorldUsingStarPUAPI.
 
@@ -24,7 +24,7 @@ has a single implementation for CPU:
 
 \snippet hello_pragma.c To be included
 
-The code can then be compiled and linked with GCC and the <c>-fplugin</c> flag:
+The code can then be compiled and linked with GCC and the flag <c>-fplugin</c>:
 
 \verbatim
 $ gcc `pkg-config starpu-1.2 --cflags` hello-starpu.c \
@@ -92,9 +92,9 @@ compiler implicitly do it as examplified above.
 The field starpu_codelet::nbuffers specifies the number of data buffers that are
 manipulated by the codelet: here the codelet does not access or modify any data
 that is controlled by our data management library. Note that the argument
-passed to the codelet (the field starpu_task::cl_arg) does not count
-as a buffer since it is not managed by our data management library,
-but just contain trivial parameters.
+passed to the codelet (the parameter <c>cl_arg</c> of the function
+<c>cpu_func</c>) does not count as a buffer since it is not managed by
+our data management library, but just contain trivial parameters.
 
 \internal
 TODO need a crossref to the proper description of "where" see bla for more ...
@@ -168,7 +168,7 @@ int main(int argc, char **argv)
 \endcode
 
 Before submitting any tasks to StarPU, starpu_init() must be called. The
-<c>NULL</c> argument specifies that we use default configuration. Tasks cannot
+<c>NULL</c> argument specifies that we use the default configuration. Tasks cannot
 be submitted after the termination of StarPU by a call to
 starpu_shutdown().
 
@@ -194,12 +194,13 @@ computational kernel that multiplies its input vector by a constant,
 the constant could be specified by the means of this buffer, instead
 of registering it as a StarPU data. It must however be noted that
 StarPU avoids making copy whenever possible and rather passes the
-pointer as such, so the buffer which is pointed at must kept allocated
+pointer as such, so the buffer which is pointed at must be kept allocated
 until the task terminates, and if several tasks are submitted with
 various parameters, each of them must be given a pointer to their
-buffer.	
+own buffer.
 
-Once a task has been executed, an optional callback function is be called.
+Once a task has been executed, an optional callback function
+starpu_task::callback_func is called when defined.
 While the computational kernel could be offloaded on various architectures, the
 callback function is always executed on a CPU. The pointer
 starpu_task::callback_arg is passed as an argument of the callback
@@ -211,7 +212,7 @@ void (*callback_function)(void *);
 
 If the field starpu_task::synchronous is non-zero, task submission
 will be synchronous: the function starpu_task_submit() will not return
-until the task was executed. Note that the function starpu_shutdown()
+until the task has been executed. Note that the function starpu_shutdown()
 does not guarantee that asynchronous tasks have been executed before
 it returns, starpu_task_wait_for_all() can be used to that effect, or
 data can be unregistered (starpu_data_unregister()), which will
@@ -237,12 +238,12 @@ we show how StarPU tasks can manipulate data.
 
 We will first show how to use the C language extensions provided by
 the GCC plug-in (\ref cExtensions). The complete example, and
-additional examples, is available in the <c>gcc-plugin/examples</c>
-directory of the StarPU distribution. These extensions map directly
+additional examples, is available in the directory <c>gcc-plugin/examples</c>
+of the StarPU distribution. These extensions map directly
 to StarPU's main concepts: tasks, task implementations for CPU,
 OpenCL, or CUDA, and registered data buffers. The standard C version
-that uses StarPU's standard C programming interface is given in the
-next section (\ref VectorScalingUsingStarPUAPI).
+that uses StarPU's standard C programming interface is given in \ref
+VectorScalingUsingStarPUAPI.
 
 First of all, the vector-scaling task and its simple CPU implementation
 have to be defined:
@@ -268,7 +269,7 @@ implemented:
 
 \snippet hello_pragma2.c To be included
 
-The <c>main</c> function above does several things:
+The function <c>main</c> above does several things:
 
 <ul>
 <li>
@@ -287,22 +288,20 @@ StarPU to transfer that memory region between GPUs and the main memory.
 Removing this <c>pragma</c> is an error.
 </li>
 <li>
-It invokes the <c>vector_scal</c> task.  The invocation looks the same
+It invokes the task <c>vector_scal</c>.  The invocation looks the same
 as a standard C function call.  However, it is an asynchronous
 invocation, meaning that the actual call is performed in parallel with
 the caller's continuation.
 </li>
 <li>
-It waits for the termination of the <c>vector_scal</c>
-asynchronous call.
+It waits for the termination of the asynchronous call <c>vector_scal</c>.
 </li>
 <li>
 Finally, StarPU is shut down.
 </li>
 </ul>
 
-The program can be compiled and linked with GCC and the <c>-fplugin</c>
-flag:
+The program can be compiled and linked with GCC and the flag <c>-fplugin</c>:
 
 \verbatim
 $ gcc `pkg-config starpu-1.2 --cflags` vector_scal.c \
@@ -317,7 +316,7 @@ And voilà!
 Now, this is all fine and great, but you certainly want to take
 advantage of these newfangled GPUs that your lab just bought, don't you?
 
-So, let's add an OpenCL implementation of the <c>vector_scal</c> task.
+So, let's add an OpenCL implementation of the task <c>vector_scal</c>.
 We assume that the OpenCL kernel is available in a file,
 <c>vector_scal_opencl_kernel.cl</c>, not shown here.  The OpenCL task
 implementation is similar to that used with the standard C API
@@ -374,14 +373,14 @@ vector_scal_opencl (unsigned size, float vector[size], float factor)
 \endcode
 
 The OpenCL kernel itself must be loaded from <c>main</c>, sometime after
-the <c>initialize</c> pragma:
+the pragma <c>initialize</c>:
 
 \code{.c}
 starpu_opencl_load_opencl_from_file ("vector_scal_opencl_kernel.cl",
                                        &cl_programs, "");
 \endcode
 
-And that's it.  The <c>vector_scal</c> task now has an additional
+And that's it.  The task <c>vector_scal</c> now has an additional
 implementation, for OpenCL, which StarPU's scheduler may choose to use
 at run-time.  Unfortunately, the <c>vector_scal_opencl</c> above still
 has to go through the common OpenCL boilerplate; in the future,
@@ -404,40 +403,13 @@ The actual implementation of the CUDA task goes into a separate
 compilation unit, in a <c>.cu</c> file.  It is very close to the
 implementation when using StarPU's standard C API (\ref DefinitionOfTheCUDAKernel).
 
-\code{.c}
-/* CUDA implementation of the `vector_scal' task, to be compiled with `nvcc'. */
-
-#include <starpu.h>
-#include <stdlib.h>
-
-static __global__ void
-vector_mult_cuda (unsigned n, float *val, float factor)
-{
-  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if (i < n)
-    val[i] *= factor;
-}
-
-/* Definition of the task implementation declared in the C file. */
-extern "C" void
-vector_scal_cuda (size_t size, float vector[], float factor)
-{
-  unsigned threads_per_block = 64;
-  unsigned nblocks = (size + threads_per_block - 1) / threads_per_block;
-
-  vector_mult_cuda <<< nblocks, threads_per_block, 0,
-    starpu_cuda_get_local_stream () >>> (size, vector, factor);
+\snippet scal_pragma.cu To be included
 
-  cudaStreamSynchronize (starpu_cuda_get_local_stream ());
-}
-\endcode
-
-The complete source code, in the <c>gcc-plugin/examples/vector_scal</c>
-directory of the StarPU distribution, also shows how an SSE-specialized
+The complete source code, in the directory <c>gcc-plugin/examples/vector_scal</c>
+of the StarPU distribution, also shows how an SSE-specialized
 CPU task implementation can be added.
 
-For more details on the C extensions provided by StarPU's GCC plug-in,
+For more details on the C extensions provided by StarPU's GCC plug-in, see
 \ref cExtensions.
 
 \section VectorScalingUsingStarPUAPI Vector Scaling Using StarPU's API
@@ -479,7 +451,7 @@ starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector, NX,
 The first argument, called the <b>data handle</b>, is an opaque pointer which
 designates the array in StarPU. This is also the structure which is used to
 describe which data is used by a task. The second argument is the node number
-where the data originally resides. Here it is 0 since the <c>vector array</c> is in
+where the data originally resides. Here it is 0 since the array <c>vector</c> is in
 the main memory. Then comes the pointer <c>vector</c> where the data can be found in main memory,
 the number of elements in the vector and the size of each element.
 The following shows how to construct a StarPU task that will manipulate the
@@ -569,36 +541,9 @@ The CUDA implementation can be written as follows. It needs to be compiled with
 a CUDA compiler such as nvcc, the NVIDIA CUDA compiler driver. It must be noted
 that the vector pointer returned by ::STARPU_VECTOR_GET_PTR is here a
 pointer in GPU memory, so that it can be passed as such to the
-<c>vector_mult_cuda</c> kernel call.
-
-\code{.c}
-#include <starpu.h>
+kernel call <c>vector_mult_cuda</c>.
 
-static __global__ void vector_mult_cuda(unsigned n, float *val,
-                                        float factor)
-{
-    unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
-    if (i < n)
-        val[i] *= factor;
-}
-
-extern "C" void scal_cuda_func(void *buffers[], void *_args)
-{
-    float *factor = (float *)_args;
-
-    /* length of the vector */
-    unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
-    /* CUDA copy of the vector pointer */
-    float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
-    unsigned threads_per_block = 64;
-    unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
-
-    vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>
-                    (n, val, *factor);
-
-    cudaStreamSynchronize(starpu_cuda_get_local_stream());
-}
-\endcode
+\snippet vector_scal_cuda.cu To be included
 
 \subsection DefinitionOfTheOpenCLKernel Definition of the OpenCL Kernel
 
@@ -620,55 +565,7 @@ which returns a <c>cl_mem</c> (which is not a device pointer, but an OpenCL
 handle), which can be passed as such to the OpenCL kernel. The difference is
 important when using partitioning, see \ref PartitioningData.
 
-\code{.c}
-#include <starpu.h>
-
-extern struct starpu_opencl_program programs;
-
-void scal_opencl_func(void *buffers[], void *_args)
-{
-    float *factor = _args;
-    int id, devid, err;     /* OpenCL specific code */
-    cl_kernel kernel;       /* OpenCL specific code */
-    cl_command_queue queue; /* OpenCL specific code */
-    cl_event event;         /* OpenCL specific code */
-
-    /* length of the vector */
-    unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
-    /* OpenCL copy of the vector pointer */
-    cl_mem val = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
-
-    { /* OpenCL specific code */
-        id = starpu_worker_get_id();
-        devid = starpu_worker_get_devid(id);
-
-	err = starpu_opencl_load_kernel(&kernel, &queue, &programs,
-	                       "vector_mult_opencl", devid);   /* Name of the codelet defined above */
-        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
-
-        err = clSetKernelArg(kernel, 0, sizeof(n), &n);
-        err |= clSetKernelArg(kernel, 1, sizeof(val), &val);
-        err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
-        if (err) STARPU_OPENCL_REPORT_ERROR(err);
-    }
-
-    {  /* OpenCL specific code */
-        size_t global=n;
-        size_t local=1;
-        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
-        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
-    }
-
-    {  /* OpenCL specific code */
-        clFinish(queue);
-        starpu_opencl_collect_stats(event);
-        clReleaseEvent(event);
-
-        starpu_opencl_release_kernel(kernel);
-    }
-}
-\endcode
-
+\snippet vector_scal_opencl.c To be included
 
 \subsection DefinitionOfTheMainCode Definition of the Main Code
 

+ 8 - 8
doc/doxygen/chapters/building.doxy

@@ -134,8 +134,8 @@ $ make install
 \endverbatim
 
 Libtool interface versioning information are included in
-libraries names (libstarpu-1.2.so, libstarpumpi-1.2.so and
-libstarpufft-1.2.so).
+libraries names (<c>libstarpu-1.2.so</c>, <c>libstarpumpi-1.2.so</c> and
+<c>libstarpufft-1.2.so</c>).
 
 \section SettingUpYourOwnCode Setting up Your Own Code
 
@@ -145,10 +145,10 @@ StarPU provides a pkg-config executable to obtain relevant compiler
 and linker flags.
 Compiling and linking an application against StarPU may require to use
 specific flags or libraries (for instance <c>CUDA</c> or <c>libspe2</c>).
-To this end, it is possible to use the <c>pkg-config</c> tool.
+To this end, it is possible to use the tool <c>pkg-config</c>.
 
 If StarPU was not installed at some standard location, the path of StarPU's
-library must be specified in the <c>PKG_CONFIG_PATH</c> environment variable so
+library must be specified in the environment variable <c>PKG_CONFIG_PATH</c> so
 that <c>pkg-config</c> can find it. For example if StarPU was installed in
 <c>$prefix_dir</c>:
 
@@ -175,10 +175,10 @@ Make sure that <c>pkg-config --libs starpu-1.2</c> actually produces some output
 before going further: <c>PKG_CONFIG_PATH</c> has to point to the place where
 <c>starpu-1.2.pc</c> was installed during <c>make install</c>.
 
-Also pass the <c>--static</c> option if the application is to be
+Also pass the option <c>--static</c> if the application is to be
 linked statically.
 
-It is also necessary to set the variable <c>LD_LIBRARY_PATH</c> to
+It is also necessary to set the environment variable <c>LD_LIBRARY_PATH</c> to
 locate dynamic libraries at runtime.
 
 \verbatim
@@ -283,10 +283,10 @@ multiplication using BLAS and cuBLAS. They output the obtained GFlops.
 
 \subsection CholeskyFactorization Cholesky Factorization
 
-<c>cholesky\*</c> perform a Cholesky factorization (single precision). They use different dependency primitives.
+<c>cholesky/*</c> perform a Cholesky factorization (single precision). They use different dependency primitives.
 
 \subsection LUFactorization LU Factorization
 
-<c>lu\*</c> perform an LU factorization. They use different dependency primitives.
+<c>lu/*</c> perform an LU factorization. They use different dependency primitives.
 
 */

+ 45 - 0
doc/doxygen/chapters/code/scal_pragma.cu

@@ -0,0 +1,45 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+//! [To be included]
+/* CUDA implementation of the `vector_scal' task, to be compiled with `nvcc'. */
+
+#include <starpu.h>
+#include <stdlib.h>
+
+static __global__ void
+vector_mult_cuda (unsigned n, float *val, float factor)
+{
+  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (i < n)
+    val[i] *= factor;
+}
+
+/* Definition of the task implementation declared in the C file. */
+extern "C" void
+vector_scal_cuda (size_t size, float vector[], float factor)
+{
+  unsigned threads_per_block = 64;
+  unsigned nblocks = (size + threads_per_block - 1) / threads_per_block;
+
+  vector_mult_cuda <<< nblocks, threads_per_block, 0,
+    starpu_cuda_get_local_stream () >>> (size, vector, factor);
+
+  cudaStreamSynchronize (starpu_cuda_get_local_stream ());
+}
+//! [To be included]

+ 24 - 18
doc/doxygen/chapters/code/vector_scal_opencl.c

@@ -24,29 +24,33 @@ extern struct starpu_opencl_program programs;
 void scal_opencl_func(void *buffers[], void *_args)
 {
     float *factor = _args;
-    int id, devid, err;
-    cl_kernel kernel;
-    cl_command_queue queue;
-    cl_event event;
+    int id, devid, err;                   /* OpenCL specific code */
+    cl_kernel kernel;                     /* OpenCL specific code */
+    cl_command_queue queue;               /* OpenCL specific code */
+    cl_event event;                       /* OpenCL specific code */
 
     /* length of the vector */
     unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
     /* OpenCL copy of the vector pointer */
     cl_mem val = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
 
-    id = starpu_worker_get_id();
-    devid = starpu_worker_get_devid(id);
+    {  /* OpenCL specific code */
+	 id = starpu_worker_get_id();
+	 devid = starpu_worker_get_devid(id);
 
-    err = starpu_opencl_load_kernel(&kernel, &queue, &programs, "vector_mult_opencl",
-                                    devid);
-    if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+	 err = starpu_opencl_load_kernel(&kernel, &queue,
+					 &programs,
+					 "vector_mult_opencl", /* Name of the codelet */
+					 devid);
+	 if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
-    err = clSetKernelArg(kernel, 0, sizeof(n), &n);
-    err |= clSetKernelArg(kernel, 1, sizeof(val), &val);
-    err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
-    if (err) STARPU_OPENCL_REPORT_ERROR(err);
+	 err = clSetKernelArg(kernel, 0, sizeof(n), &n);
+	 err |= clSetKernelArg(kernel, 1, sizeof(val), &val);
+	 err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
+	 if (err) STARPU_OPENCL_REPORT_ERROR(err);
+    }
 
-    {
+    {   /* OpenCL specific code */
         size_t global=n;
         size_t local;
         size_t s;
@@ -63,10 +67,12 @@ void scal_opencl_func(void *buffers[], void *_args)
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
     }
 
-    clFinish(queue);
-    starpu_opencl_collect_stats(event);
-    clReleaseEvent(event);
+    {  /* OpenCL specific code */
+	 clFinish(queue);
+	 starpu_opencl_collect_stats(event);
+	 clReleaseEvent(event);
 
-    starpu_opencl_release_kernel(kernel);
+	 starpu_opencl_release_kernel(kernel);
+    }
 }
 //! [To be included]

+ 16 - 0
doc/doxygen/chapters/configure_options.doxy

@@ -329,6 +329,22 @@ Use the compiler <c>mpicc</c> at <c>path</c>, for StarPU-MPI.
 Enable the activity polling method for StarPU-MPI.
 </dd>
 
+<dt>--with-coi-dir</dt>
+<dd>
+\anchor with-coi-dir
+\addindex __configure__--with-coi-dir
+Specify the directory to the COI library for MIC support.
+The default value is <c>/opt/intel/mic/coi</c>
+</dd>
+
+<dt>--mic-host</dt>
+<dd>
+\anchor mic-host
+\addindex __configure__--mic-host
+Specify the precise MIC architecture host identifier.
+The default value is <c>x86_64-k1om-linux</c>
+</dd>
+
 \section AdvancedConfiguration Advanced Configuration
 
 <dl>

+ 0 - 25
doc/doxygen/chapters/environment_variables.doxy

@@ -345,31 +345,6 @@ to 0. It is enabled by default or for any other values of the variable
 \ref STARPU_MPI_CACHE.
 </dd>
 
-<dt>STARPU_MIC_HOST</dt>
-<dd>
-\anchor STARPU_MIC_HOST
-\addindex __env__STARPU_MIC_HOST
-Defines the value of the parameter <c>--host</c> passed to
-<c>configure</c> for the cross-compilation. The current default is
-<c>x86_64-k1om-linux</c>.
-</dd>
-
-<dt>STARPU_MIC_CC_PATH</dt>
-<dd>
-\anchor STARPU_MIC_CC_PATH
-\addindex __env__STARPU_MIC_CC_PATH
-Defines the path to the MIC cross-compiler. The current default is
-<c>/usr/linux-k1om-4.7/bin/</c>
-</dd>
-
-<dt>STARPU_COI_DIR</dt>
-<dd>
-\anchor STARPU_COI_DIR
-\addindex __env__STARPU_COI_DIR
-Defines the path to the COI library. The current default is
-<c>/opt/intel/mic/coi</c>.
-</dd>
-
 </dl>
 
 \section MiscellaneousAndDebug Miscellaneous And Debug

+ 3 - 3
doc/doxygen/chapters/fft_support.doxy

@@ -9,7 +9,7 @@
 /*! \page FFTSupport FFT Support
 
 StarPU provides <c>libstarpufft</c>, a library whose design is very similar to
-both fftw and cufft, the difference being that it takes benefit from both CPUs
+both <c>fftw</c> and <c>cufft</c>, the difference being that it takes benefit from both CPUs
 and GPUs. It should however be noted that GPUs do not have the same precision as
 CPUs, so the results may different by a negligible amount.
 
@@ -33,7 +33,7 @@ The documentation below is given with names for double precision, replace
 
 Only complex numbers are supported at the moment.
 
-The application has to call starpu_init() before calling starpufft functions.
+The application has to call starpu_init() before calling <c>starpufft</c> functions.
 
 Either main memory pointers or data handles can be provided.
 
@@ -66,6 +66,6 @@ $ pkg-config --cflags starpufft-1.2  # options for the compiler
 $ pkg-config --libs starpufft-1.2    # options for the linker
 \endverbatim
 
-Also pass the <c>--static</c> option if the application is to be linked statically.
+Also pass the option <c>--static</c> if the application is to be linked statically.
 
 */

+ 10 - 10
doc/doxygen/chapters/mic_scc_support.doxy

@@ -13,19 +13,13 @@
 SCC support just needs the presence of the RCCE library.
 
 MIC support actually needs two compilations of StarPU, one for the host and one for
-the device. The script <c>mic-configure</c> can be used to achieve this: it basically
+the device. The PATH environment variable has to include the path to the
+cross-compilation toolchain, for instance <c>/usr/linux-k1om-4.7/bin</c>.
+The script <c>mic-configure</c> can then be used to achieve the two compilations: it basically
 calls <c>configure</c> as appropriate from two new directories: <c>build_mic</c> and
 <c>build_host</c>. <c>make</c> and <c>make install</c> can then be used as usual and will
 recurse into both directories.
 
-\internal
-TODO: move to configuration section?
-\endinternal
-
-It can be parameterized with the environment variables \ref
-STARPU_MIC_HOST, \ref STARPU_MIC_CC_PATH and \ref STARPU_COI_DIR.
-
-
 \section PortingApplicationsToMICSCC Porting Applications To MIC/SCC
 
 The simplest way to port an application to MIC/SCC is to set the field
@@ -49,8 +43,14 @@ the MIC-cross-built binary. It will look for the file given by the
 environment variable \ref STARPU_MIC_SINK_PROGRAM_NAME or in the
 directory given by the environment variable \ref
 STARPU_MIC_SINK_PROGRAM_PATH, or in the field
-starpu_config::mic_sink_program_path. It will also look in the current
+starpu_conf::mic_sink_program_path. It will also look in the current
 directory for the same binary name plus the suffix <c>-mic</c> or
 <c>_mic</c>.
 
+The testsuite can be started by simply running <c>make check</c> from the
+top directory. It will recurse into both <c>build_host</c> to run tests with only
+the host, and into <c>build_mic</c> to run tests with both the host and the MIC
+devices. Single tests with the host and the MIC can be run by starting
+<c>./loader-cross.sh ./the_test</c> from <c>build_mic/tests</c>.
+
 */

+ 2 - 2
doc/doxygen/chapters/mpi_support.doxy

@@ -31,7 +31,7 @@ $ pkg-config --cflags starpumpi-1.2  # options for the compiler
 $ pkg-config --libs starpumpi-1.2    # options for the linker
 \endverbatim
 
-You also need pass the <c>--static</c> option if the application is to
+You also need to pass the option <c>--static</c> if the application is to
 be linked statically.
 
 \code{.c}
@@ -257,7 +257,7 @@ int my_distrib(int x, int y, int nb_nodes) {
 
 Now the data can be registered within StarPU. Data which are not
 owned but will be needed for computations can be registered through
-the lazy allocation mechanism, i.e. with a <c>home_node</c> set to -1.
+the lazy allocation mechanism, i.e. with a <c>home_node</c> set to <c>-1</c>.
 StarPU will automatically allocate the memory when it is used for the
 first time.
 

+ 12 - 12
doc/doxygen/chapters/optimize_performance.doxy

@@ -37,7 +37,7 @@ starpu_data_set_wt_mask(img_handle, 1<<0);
 \endcode
 
 will for instance request to always automatically transfer a replicate into the
-main memory (node 0), as bit 0 of the write-through bitmask is being set.
+main memory (node <c>0</c>), as bit <c>0</c> of the write-through bitmask is being set.
 
 \code{.c}
 starpu_data_set_wt_mask(img_handle, ~0U);
@@ -108,7 +108,7 @@ possibility according to task size, one can run
 speedup of independent tasks of very small sizes.
 
 The choice of scheduler also has impact over the overhead: for instance, the
-<c>dmda</c> scheduler takes time to make a decision, while <c>eager</c> does
+ scheduler <c>dmda</c> takes time to make a decision, while <c>eager</c> does
 not. <c>tasks_size_overhead.sh</c> can again be used to get a grasp at how much
 impact that has on the target machine.
 
@@ -132,7 +132,7 @@ priority information to StarPU.
 
 \section TaskSchedulingPolicy Task Scheduling Policy
 
-By default, StarPU uses the <c>eager</c> simple greedy scheduler. This is
+By default, StarPU uses the simple greedy scheduler <c>eager</c>. This is
 because it provides correct load balance even if the application codelets do not
 have performance models. If your application codelets have performance models
 (\ref PerformanceModelExample), you should change the scheduler thanks
@@ -276,14 +276,14 @@ and in Joules for the energy consumption models.
 
 Distributing tasks to balance the load induces data transfer penalty. StarPU
 thus needs to find a balance between both. The target function that the
-<c>dmda</c> scheduler of StarPU
+scheduler <c>dmda</c> of StarPU
 tries to minimize is <c>alpha * T_execution + beta * T_data_transfer</c>, where
 <c>T_execution</c> is the estimated execution time of the codelet (usually
 accurate), and <c>T_data_transfer</c> is the estimated data transfer time. The
 latter is estimated based on bus calibration before execution start,
 i.e. with an idle machine, thus without contention. You can force bus
 re-calibration by running the tool <c>starpu_calibrate_bus</c>. The
-beta parameter defaults to 1, but it can be worth trying to tweak it
+beta parameter defaults to <c>1</c>, but it can be worth trying to tweak it
 by using <c>export STARPU_SCHED_BETA=2</c> for instance, since during
 real application execution, contention makes transfer times bigger.
 This is of course imprecise, but in practice, a rough estimation
@@ -291,7 +291,7 @@ already gives the good results that a precise estimation would give.
 
 \section DataPrefetch Data Prefetch
 
-The <c>heft</c>, <c>dmda</c> and <c>pheft</c> scheduling policies
+The scheduling policies <c>heft</c>, <c>dmda</c> and <c>pheft</c>
 perform data prefetch (see \ref STARPU_PREFETCH):
 as soon as a scheduling decision is taken for a task, requests are issued to
 transfer its required data to the target processing unit, if needed, so that
@@ -310,9 +310,9 @@ the handle and the desired target memory node.
 \section Power-basedScheduling Power-based Scheduling
 
 If the application can provide some power performance model (through
-the <c>power_model</c> field of the codelet structure), StarPU will
+the field starpu_codelet::power_model), StarPU will
 take it into account when distributing tasks. The target function that
-the <c>dmda</c> scheduler minimizes becomes <c>alpha * T_execution +
+the scheduler <c>dmda</c> minimizes becomes <c>alpha * T_execution +
 beta * T_data_transfer + gamma * Consumption</c> , where <c>Consumption</c>
 is the estimated task consumption in Joules. To tune this parameter, use
 <c>export STARPU_SCHED_GAMMA=3000</c> for instance, to express that each Joule
@@ -333,7 +333,7 @@ On-line task consumption measurement is currently only supported through the
 <c>CL_PROFILING_POWER_CONSUMED</c> OpenCL extension, implemented in the MoviSim
 simulator. Applications can however provide explicit measurements by
 using the function starpu_perfmodel_update_history() (examplified in \ref PerformanceModelExample
-with the <c>power_model</c> performance model. Fine-grain
+with the <c>power_model</c> performance model). Fine-grain
 measurement is often not feasible with the feedback provided by the hardware, so
 the user can for instance run a given task a thousand times, measure the global
 consumption for that series of tasks, divide it by a thousand, repeat for
@@ -446,9 +446,9 @@ $ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
 TEST PASSED
 \endverbatim
 
-Note that we force to use the dmda scheduler to generate performance
-models for the application. The application may need to be run several
-times before the model is calibrated.
+Note that we force to use the scheduler <c>dmda</c> to generate
+performance models for the application. The application may need to be
+run several times before the model is calibrated.
 
 \subsection Simulation Simulation
 

+ 15 - 15
doc/doxygen/chapters/performance_feedback.doxy

@@ -16,7 +16,7 @@ nice visual task debugging. To do so, build Temanejo's <c>libayudame.so</c>,
 install <c>Ayudame.h</c> to e.g. <c>/usr/local/include</c>, apply the
 <c>tools/patch-ayudame</c> to it to fix C build, re-<c>./configure</c>, make
 sure that it found it, rebuild StarPU.  Run the Temanejo GUI, give it the path
-to your application, any options you want to pass it, the path to libayudame.so.
+to your application, any options you want to pass it, the path to <c>libayudame.so</c>.
 
 Make sure to specify at least the same number of CPUs in the dialog box as your
 machine has, otherwise an error will happen during execution. Future versions
@@ -35,7 +35,7 @@ call starpu_profiling_status_set() with the parameter
 is already enabled or not by calling starpu_profiling_status_get().
 Enabling monitoring also reinitialize all previously collected
 feedback. The environment variable \ref STARPU_PROFILING can also be
-set to 1 to achieve the same effect.
+set to <c>1</c> to achieve the same effect.
 
 Likewise, performance monitoring is stopped by calling
 starpu_profiling_status_set() with the parameter
@@ -247,7 +247,7 @@ Or you can simply point the <c>PKG_CONFIG_PATH</c> to
 \ref with-fxt "--with-fxt" to <c>./configure</c>
 
 When FxT is enabled, a trace is generated when StarPU is terminated by calling
-starpu_shutdown()). The trace is a binary file whose name has the form
+starpu_shutdown(). The trace is a binary file whose name has the form
 <c>prof_file_XXX_YYY</c> where <c>XXX</c> is the user name, and
 <c>YYY</c> is the pid of the process that used StarPU. This file is saved in the
 <c>/tmp/</c> directory by default, or by the directory specified by
@@ -269,7 +269,7 @@ application shutdown.
 This will create a file <c>paje.trace</c> in the current directory that
 can be inspected with the <a href="http://vite.gforge.inria.fr/">ViTE trace
 visualizing open-source tool</a>.  It is possible to open the
-<c>paje.trace</c> file with ViTE by using the following command:
+file <c>paje.trace</c> with ViTE by using the following command:
 
 \verbatim
 $ vite paje.trace
@@ -322,7 +322,7 @@ generate an activity trace by calling:
 $ starpu_fxt_tool -i filename
 \endverbatim
 
-This will create an <c>activity.data</c> file in the current
+This will create a file <c>activity.data</c> in the current
 directory. A profile of the application showing the activity of StarPU
 during the execution of the program can be generated:
 
@@ -341,7 +341,7 @@ efficiently. The black sections indicate that the processing unit was blocked
 because there was no task to process: this may indicate a lack of parallelism
 which may be alleviated by creating more tasks when it is possible.
 
-The second part of the <c>activity.eps</c> picture is a graph showing the
+The second part of the picture <c>activity.eps</c> is a graph showing the
 evolution of the number of tasks available in the system during the execution.
 Ready tasks are shown in black, and tasks that are submitted but not
 schedulable yet are shown in grey.
@@ -360,8 +360,8 @@ file: <starpu_slu_lu_model_22.hannibal>
 file: <starpu_slu_lu_model_12.hannibal>
 \endverbatim
 
-Here, the codelets of the lu example are available. We can examine the
-performance of the 22 kernel (in micro-seconds), which is history-based:
+Here, the codelets of the example <c>lu</c> are available. We can examine the
+performance of the kernel <c>22</c> (in micro-seconds), which is history-based:
 
 \verbatim
 $ starpu_perfmodel_display -s starpu_slu_lu_model_22
@@ -414,7 +414,7 @@ starpu_perfmodel_load_symbol(). The source code of the tool
 
 The tool <c>starpu_perfmodel_plot</c> can be used to draw performance
 models. It writes a <c>.gp</c> file in the current directory, to be
-run in the <c>gnuplot</c> tool, which shows the corresponding curve.
+run with the tool <c>gnuplot</c>, which shows the corresponding curve.
 
 When the field starpu_task::flops is set, <c>starpu_perfmodel_plot</c> can
 directly draw a GFlops curve, by simply adding the <c>-f</c> option:
@@ -448,13 +448,13 @@ $ starpu_perfmodel_plot -s non_linear_memset_regression_based -i /tmp/prof_file_
 It will produce a <c>.gp</c> file which contains both the performance model
 curves, and the profiling measurements.
 
-If you have the <c>R</c> statistical tool installed, you can additionally use
+If you have the statistical tool <c>R</c> installed, you can additionally use
 
 \verbatim
 $ starpu_codelet_histo_profile distrib.data
 \endverbatim
 
-Which will create one pdf file per codelet and per input size, showing a
+Which will create one <c>.pdf</c> file per codelet and per input size, showing a
 histogram of the codelet execution time distribution.
 
 \section TheoreticalLowerBoundOnExecutionTime Theoretical Lower Bound On Execution Time
@@ -475,13 +475,13 @@ use this.
 \section MemoryFeedback Memory Feedback
 
 It is possible to enable memory statistics. To do so, you need to pass
-the option \ref enable-memory-stats "--enable-memory-stats" when running configure. It is then
-possible to call the function starpu_display_memory_stats() to
+the option \ref enable-memory-stats "--enable-memory-stats" when running <c>configure</c>. It is then
+possible to call the function starpu_data_display_memory_stats() to
 display statistics about the current data handles registered within StarPU.
 
 Moreover, statistics will be displayed at the end of the execution on
 data handles which have not been cleared out. This can be disabled by
-setting the environment variable \ref STARPU_MEMORY_STATS to 0.
+setting the environment variable \ref STARPU_MEMORY_STATS to <c>0</c>.
 
 For example, if you do not unregister data at the end of the complex
 example, you will get something similar to:
@@ -552,7 +552,7 @@ of the application. To enable them, you need to pass the option
 starpu_shutdown() various statistics will be displayed,
 execution, MSI cache statistics, allocation cache statistics, and data
 transfer statistics. The display can be disabled by setting the
-environment variable \ref STARPU_STATS to 0.
+environment variable \ref STARPU_STATS to <c>0</c>.
 
 \verbatim
 $ ./examples/cholesky/cholesky_tag

+ 12 - 0
doc/doxygen/dev/checkDoc.sh

@@ -0,0 +1,12 @@
+#!/bin/bash
+
+x=$(grep ingroup chapters/api/*|awk -F':' '{print $2}'| awk 'NF != 2')
+if test -n "$x" ; then
+    echo Errors on group definitions
+    echo $x
+fi
+
+echo
+echo "Defined groups"
+grep ingroup chapters/api/*|awk -F':' '{print $2}'| awk 'NF == 2'|sort|uniq
+echo

+ 38 - 0
doc/doxygen/dev/starpu_check_documented.py

@@ -0,0 +1,38 @@
+#!/usr/bin/python
+
+import os
+
+class bcolors:
+    FAILURE = '\033[91m'
+    NORMAL = '\033[0m'
+
+def loadFunctionsAndDatatypes(flist, dtlist, fname):
+    f = open(fname, 'r')
+    for line in f:
+        mline = line[:-1]
+        if mline.count("\\fn"):
+            if mline.count("fft") == 0:
+                func = mline.replace("\\fn ", "")
+                flist.append(list([func, fname]))
+        if mline.count("\\struct ") or mline.count("\\def ") or mline.count("\\typedef ") or mline.count("\\enum "):
+            datatype = mline.replace("\\struct ", "").replace("\\def ", "").replace("\\typedef ", "").replace("\\enum ","")
+            dtlist.append(list([datatype, fname]))
+    f.close()
+
+functions = []
+datatypes = []
+
+for docfile in os.listdir('chapters/api'):
+    if docfile.count(".doxy"):
+        loadFunctionsAndDatatypes(functions, datatypes, "chapters/api/"+docfile)
+
+for function in functions:
+    x = os.system("fgrep -l \"" + function[0] + "\" ../../include/*.h ../../mpi/include/*.h ../../starpufft/*h ../../sc_hypervisor/include/*.h > /dev/null")
+    if x != 0:
+        print "Function <" + bcolors.FAILURE + function[0] + bcolors.NORMAL + "> documented in <" + function[1] + "> does not exist in StarPU's API"
+
+for datatype in datatypes:
+    x = os.system("fgrep -l \"" + datatype[0] + "\" ../../include/*.h ../../mpi/include/*.h ../../starpufft/*h ../../sc_hypervisor/include/*.h > /dev/null")
+    if x != 0:
+        print "Datatype <" + bcolors.FAILURE + datatype[0] + bcolors.NORMAL + "> documented in <" + datatype[1] + "> does not exist in StarPU's API"
+

+ 78 - 0
doc/doxygen/dev/starpu_check_undocumented.sh

@@ -0,0 +1,78 @@
+#!/bin/bash
+# Note: expects Coccinelle's spatch command in the PATH
+# See: http://coccinelle.lip6.fr/
+
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2011, 2012, 2013 Centre National de la Recherche Scientifique
+# Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+stcolor=$(tput sgr0)
+redcolor=$(tput setaf 1)
+greencolor=$(tput setaf 2)
+
+H_FILES=$(find include mpi/include -name '*.h')
+
+functions=$(spatch -very_quiet -sp_file tools/dev/starpu_funcs.cocci $H_FILES)
+for func in $functions ; do
+	fname=$(echo $func|awk -F ',' '{print $1}')
+	location=$(echo $func|awk -F ',' '{print $2}')
+	x=$(grep "$fname(" doc/doxygen/chapters/api/*.doxy | grep "\\fn")
+	if test "$x" == "" ; then
+		echo "function ${redcolor}${fname}${stcolor} at location ${redcolor}$location${stcolor} is not (or incorrectly) documented"
+#	else
+#		echo "function ${greencolor}${fname}${stcolor} at location $location is correctly documented"
+	fi
+done
+
+echo
+
+structs=$(grep "struct starpu" $H_FILES | grep -v "[;|,|(|)]" | awk '{print $2}')
+for struct in $structs ; do
+    x=$(grep -F "\\struct $struct" doc/doxygen/chapters/api/*.doxy)
+    if test "$x" == "" ; then
+	echo "struct ${redcolor}${struct}${stcolor} is not (or incorrectly) documented"
+    fi
+done
+
+echo
+
+enums=$(grep "enum starpu" $H_FILES | grep -v "[;|,|(|)]" | awk '{print $2}')
+for enum in $enums ; do
+    x=$(grep -F "\\enum $enum" doc/doxygen/chapters/api/*.doxy)
+    if test "$x" == "" ; then
+	echo "enum ${redcolor}${enum}${stcolor} is not (or incorrectly) documented"
+    fi
+done
+
+echo
+
+macros=$(grep "define\b" $H_FILES |grep -v deprecated|grep "#" | grep -v "__" | sed 's/#[ ]*/#/g' | awk '{print $2}' | awk -F'(' '{print $1}' | sort|uniq)
+for macro in $macros ; do
+    x=$(grep -F "\\def $macro" doc/doxygen/chapters/api/*.doxy)
+    if test "$x" == "" ; then
+	echo "macro ${redcolor}${macro}${stcolor} is not (or incorrectly) documented"
+    fi
+done
+
+echo
+
+variables=$(grep --exclude-dir=.svn -rs -E "(getenv|get_env)" src/| tr ' ' '\012'|grep -E "(getenv|get_env)" | grep "\"" | sed 's/.*("//' | sed 's/").*//'|sort|uniq)
+for variable in $variables ; do
+    x=$(grep "$variable" doc/doxygen/chapters/environment_variables.doxy | grep "\\anchor")
+    if test "$x" == "" ; then
+	echo "variable ${redcolor}${variable}${stcolor} is not (or incorrectly) documented"
+    fi
+done
+

tools/dev/starpu_funcs.cocci → doc/doxygen/dev/starpu_funcs.cocci


+ 1 - 1
doc/doxygen/refman.tex

@@ -114,7 +114,7 @@ Documentation License”.
 \hypertarget{AdvancedExamples}{}
 \input{AdvancedExamples}
 
-\chapter{How to optimize performance with StarPU}
+\chapter{How To Optimize Performance With StarPU}
 \label{HowToOptimizePerformanceWithStarPU}
 \hypertarget{HowToOptimizePerformanceWithStarPU}{}
 \input{HowToOptimizePerformanceWithStarPU}

tools/dev/starpu_check_documented.py → doc/texinfo/dev/starpu_check_documented.py


tools/dev/starpu_check_undocumented.sh → doc/texinfo/dev/starpu_check_undocumented.sh


+ 28 - 0
doc/texinfo/dev/starpu_funcs.cocci

@@ -0,0 +1,28 @@
+// StarPU --- Runtime system for heterogeneous multicore architectures.
+//
+// Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+//
+// StarPU is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation; either version 2.1 of the License, or (at
+// your option) any later version.
+//
+// StarPU is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+//
+// See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+@starpufunc@
+position p;
+type t;
+identifier f =~ "starpu";
+@@
+
+t f@p( ... );
+
+@ script:python @
+p << starpufunc.p;
+f << starpufunc.f;
+@@
+print "%s,%s:%s" % (f,p[0].file,p[0].line)

+ 2 - 2
examples/spmv/matrix_market/mmio.c

@@ -277,8 +277,8 @@ int mm_write_mtx_array_size(FILE *f, int M, int N)
 /* use when I[], J[], and val[]J, and val[] are already allocated */
 /******************************************************************/
 
-int mm_read_mtx_crd_data(FILE *f, int M 
-				 int N, int nz, int I[], int J[],
+int mm_read_mtx_crd_data(FILE *f, int M,
+			 int N, int nz, int I[], int J[],
         double val[], MM_typecode matcode)
 {
     int i;

+ 5 - 1
examples/stencil/life_opencl.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -19,7 +19,11 @@
 /* #define _externC extern "C" */
 
 #include <stencil.h>
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
 #include <CL/cl.h>
+#endif
 #include <starpu.h>
 
 #define str(x) #x

+ 3 - 3
include/starpu_opencl.h

@@ -61,12 +61,12 @@ void starpu_opencl_get_current_queue(cl_command_queue *queue);
 
 void starpu_opencl_load_program_source(const char *source_file_name, char *located_file_name, char *located_dir_name, char *opencl_program_source);
 int starpu_opencl_compile_opencl_from_file(const char *source_file_name, const char *build_options);
-int starpu_opencl_compile_opencl_from_string(const char *opencl_program_source, const char *file_name, const char* build_options);
+int starpu_opencl_compile_opencl_from_string(const char *opencl_program_source, const char *file_name, const char *build_options);
 
 int starpu_opencl_load_binary_opencl(const char *kernel_id, struct starpu_opencl_program *opencl_programs);
 
-int starpu_opencl_load_opencl_from_file(const char *source_file_name, struct starpu_opencl_program *opencl_programs, const char* build_options);
-int starpu_opencl_load_opencl_from_string(const char *opencl_program_source, struct starpu_opencl_program *opencl_programs, const char* build_options);
+int starpu_opencl_load_opencl_from_file(const char *source_file_name, struct starpu_opencl_program *opencl_programs, const char *build_options);
+int starpu_opencl_load_opencl_from_string(const char *opencl_program_source, struct starpu_opencl_program *opencl_programs, const char *build_options);
 int starpu_opencl_unload_opencl(struct starpu_opencl_program *opencl_programs);
 
 int starpu_opencl_load_kernel(cl_kernel *kernel, cl_command_queue *queue, struct starpu_opencl_program *opencl_programs, const char *kernel_name, int devid);

+ 4 - 4
include/starpu_sched_ctx.h

@@ -67,14 +67,14 @@ unsigned starpu_sched_ctx_check_if_hypervisor_exists(void);
 
 void starpu_sched_ctx_set_policy_data(unsigned sched_ctx_id, void *policy_data);
 
-void* starpu_sched_ctx_get_policy_data(unsigned sched_ctx_id);
+void *starpu_sched_ctx_get_policy_data(unsigned sched_ctx_id);
 
 
-struct starpu_worker_collection* starpu_sched_ctx_create_worker_collection(unsigned sched_ctx_id, enum starpu_worker_collection_type type);
+struct starpu_worker_collection *starpu_sched_ctx_create_worker_collection(unsigned sched_ctx_id, enum starpu_worker_collection_type type);
 
 void starpu_sched_ctx_delete_worker_collection(unsigned sched_ctx_id);
 
-struct starpu_worker_collection* starpu_sched_ctx_get_worker_collection(unsigned sched_ctx_id);
+struct starpu_worker_collection *starpu_sched_ctx_get_worker_collection(unsigned sched_ctx_id);
 
 unsigned starpu_sched_ctx_get_nworkers(unsigned sched_ctx_id);
 
@@ -112,7 +112,7 @@ int starpu_sched_ctx_set_max_priority(unsigned sched_ctx_id, int max_prio);
 #define STARPU_DEFAULT_PRIO	0
 
 /* execute any parallel code on the workers of the sched_ctx (workers are blocked) */
-void* starpu_sched_ctx_exec_parallel_code(void* (*func)(void*), void* param, unsigned sched_ctx_id);
+void *starpu_sched_ctx_exec_parallel_code(void* (*func)(void*), void *param, unsigned sched_ctx_id);
 
 #ifdef __cplusplus
 }

+ 1 - 1
include/starpu_task.h

@@ -239,7 +239,7 @@ void starpu_codelet_display_stats(struct starpu_codelet *cl);
 
 struct starpu_task *starpu_task_get_current(void);
 
-void starpu_parallel_task_barrier_init(struct starpu_task* task, int workerid);
+void starpu_parallel_task_barrier_init(struct starpu_task *task, int workerid);
 
 struct starpu_task *starpu_task_dup(struct starpu_task *task);
 

+ 1 - 1
include/starpu_task_util.h

@@ -29,7 +29,7 @@ extern "C"
 {
 #endif
 
-void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t *deps,	void (*callback)(void *), void *callback_arg);
+void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t *deps, void (*callback)(void *), void *callback_arg);
 
 #define STARPU_VALUE		 (1<<19)
 #define STARPU_CALLBACK		 (1<<20)

+ 11 - 39
include/starpu_top.h

@@ -82,50 +82,22 @@ enum starpu_top_message_type
 	TOP_TYPE_UNKNOW
 };
 
-struct starpu_top_data *starpu_top_add_data_boolean(const char *data_name,
-						    int active);
-struct starpu_top_data *starpu_top_add_data_integer(const char *data_name,
-						     int minimum_value,
-						     int maximum_value,
-						     int active);
-struct starpu_top_data *starpu_top_add_data_float(const char *data_name,
-						  double minimum_value,
-						  double maximum_value,
-						  int active);
-struct starpu_top_param *starpu_top_register_parameter_boolean(const char *param_name,
-							       int *parameter_field,
-							       void (*callback)(struct starpu_top_param*));
-struct starpu_top_param *starpu_top_register_parameter_integer(const char *param_name,
-							       int *parameter_field,
-							       int minimum_value,
-							       int maximum_value,
-							       void (*callback)(struct starpu_top_param*));
-struct starpu_top_param *starpu_top_register_parameter_float(const char *param_name,
-							     double *parameter_field,
-							     double minimum_value,
-							     double maximum_value,
-							     void (*callback)(struct starpu_top_param*));
-struct starpu_top_param *starpu_top_register_parameter_enum(const char *param_name,
-							    int *parameter_field,
-							    char **values,
-							    int nb_values,
-							    void (*callback)(struct starpu_top_param*));
-
-
+struct starpu_top_data *starpu_top_add_data_boolean(const char *data_name, int active);
+struct starpu_top_data *starpu_top_add_data_integer(const char *data_name, int minimum_value, int maximum_value, int active);
+struct starpu_top_data *starpu_top_add_data_float(const char *data_name, double minimum_value, double maximum_value, int active);
 
+struct starpu_top_param *starpu_top_register_parameter_boolean(const char *param_name, int *parameter_field, void (*callback)(struct starpu_top_param*));
+struct starpu_top_param *starpu_top_register_parameter_integer(const char *param_name, int *parameter_field, int minimum_value, int maximum_value, void (*callback)(struct starpu_top_param*));
+struct starpu_top_param *starpu_top_register_parameter_float(const char *param_name, double *parameter_field, double minimum_value, double maximum_value, void (*callback)(struct starpu_top_param*));
+struct starpu_top_param *starpu_top_register_parameter_enum(const char *param_name, int *parameter_field, char **values, int nb_values, void (*callback)(struct starpu_top_param*));
 
 void starpu_top_init_and_wait(const char *server_name);
 
 void starpu_top_update_parameter(const struct starpu_top_param *param);
-void starpu_top_update_data_boolean(const struct starpu_top_data *data,
-				    int value);
-void starpu_top_update_data_integer(const struct starpu_top_data *data,
-				    int value);
-void starpu_top_update_data_float(const struct starpu_top_data *data,
-				  double value);
-void starpu_top_task_prevision(struct starpu_task *task,
-			       int devid, unsigned long long start,
-			       unsigned long long end);
+void starpu_top_update_data_boolean(const struct starpu_top_data *data, int value);
+void starpu_top_update_data_integer(const struct starpu_top_data *data, int value);
+void starpu_top_update_data_float(const struct starpu_top_data *data, double value);
+void starpu_top_task_prevision(struct starpu_task *task, int devid, unsigned long long start, unsigned long long end);
 
 void starpu_top_debug_log(const char *message);
 void starpu_top_debug_lock(const char *message);

+ 28 - 33
mic-configure

@@ -1,12 +1,6 @@
 #!/bin/bash
 
 ROOT_DIR=$PWD
-[ -n "$STARPU_MIC_HOST" ] || STARPU_MIC_HOST=x86_64-k1om-linux
-[ -n "$STARPU_MIC_CC_PATH" ] || STARPU_MIC_CC_PATH=/usr/linux-k1om-4.7/bin/
-[ -n "$STARPU_COI_DIR" ] || STARPU_COI_DIR=/opt/intel/mic/coi
-DEFAULT_PREFIX=/usr/local
-
-export PATH=${STARPU_MIC_CC_PATH}${PATH:+:${PATH}}
 
 cat > ./mic-config.log << EOF
 This file was created by StarPU mic-configure
@@ -14,38 +8,39 @@ This file was created by StarPU mic-configure
  $ $0 $*
 EOF
 
-for arch in mic host
+prefix="/usr/local"
+coi_dir="/opt/intel/mic/coi"
+mic_host="x86_64-k1om-linux"
+
+for arg in $*
 do
+	case $arg in 
+		--prefix=*)
+			prefix="${arg#--prefix=}"
+			;;
+		--with-coi-dir=*)
+			coi_dir="${arg#--with-coi-dir=}"
+			;;
+		--mic-host=*)
+			mic_host="${arg#--mic-host=}"
+			;;
+	esac
+
+done
 
+for arch in mic host
+do
 	# We call the configure script from a build directory further in the
 	# arborescence
-	command="${ROOT_DIR}/configure --enable-mic --with-coi-dir=$STARPU_COI_DIR"
-	prefix_found=no
+
+	command="${ROOT_DIR}/configure"
+	params="--enable-mic --with-coi-dir=$coi_dir --prefix=$prefix/$arch"
 
 	if test x$arch = xmic ; then
-		command="$command --without-hwloc --with-coi-lib-dir=$STARPU_COI_DIR/device-linux-release/lib --host=$STARPU_MIC_HOST"
+		# TODO: fix hwloc detection to look for another pkg-config place, and not just believe in the host version of hwloc.pc...
+		params="$params --without-hwloc --with-coi-lib-dir=$coi_dir/device-linux-release/lib --host=$mic_host"
 	else
-		command="$command --with-coi-lib-dir=$STARPU_COI_DIR/host-linux-release/lib"
-	fi
-
-	for arg in $*
-	do
-		if [ ${arg:0:9} = '--prefix=' ]
-		then
-			prefix_found=yes
-			prefix="${arg:9}"
-			command="$command ${arg}/${arch}"
-		else
-			command="$command $arg"
-		fi
-
-	done
-
-	# If the user didn't specify a directory where to install the library
-	# we apply the default one
-	if test x$prefix_found = xno ; then
-		command="$command --prefix=${DEFAULT_PREFIX}/$arch"
-		prefix=${DEFAULT_PREFIX}
+		params="$params --with-coi-lib-dir=$coi_dir/host-linux-release/lib"
 	fi
 
 	# If the build directory doesn't exist yet, create it
@@ -56,9 +51,9 @@ do
 	cd "build_${arch}"
 
 	if test x$arch = xmic ; then
-		LDFLAGS=-export-dynamic $command
+		LDFLAGS=-export-dynamic $command $* $params
 	else
-		$command
+		$command $* $params
 	fi
 	if [ "$?" != 0 ]
 	then

+ 2 - 2
mpi/starpumpi-1.1.pc.in

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009-2011  Université de Bordeaux 1
+# Copyright (C) 2009-2011, 2013  Université de Bordeaux 1
 # Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -25,5 +25,5 @@ Version: @PACKAGE_VERSION@
 Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@
 Libs: -L${libdir} -lstarpumpi-@STARPU_EFFECTIVE_VERSION@
 Libs.private: @LDFLAGS@ @LIBS@
-Requires: starpu-1.0
+Requires: starpu-1.1
 Requires.private:

+ 2 - 2
mpi/starpumpi-1.2.pc.in

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009-2011  Université de Bordeaux 1
+# Copyright (C) 2009-2011, 2013  Université de Bordeaux 1
 # Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -25,5 +25,5 @@ Version: @PACKAGE_VERSION@
 Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@
 Libs: -L${libdir} -lstarpumpi-@STARPU_EFFECTIVE_VERSION@
 Libs.private: @LDFLAGS@ @LIBS@
-Requires: starpu-1.0
+Requires: starpu-1.2
 Requires.private:

+ 0 - 5
socl/Makefile.am

@@ -17,11 +17,6 @@ SUBDIRS = src examples
 
 EXTRA_DIST = README
 
-libsocl_la_includedir=$(includedir)/starpu/$(STARPU_EFFECTIVE_VERSION)/socl/CL
-
-pkgconfigdir = $(libdir)/pkgconfig
-pkgconfig_DATA = socl-1.0.pc socl-1.1.pc
-
 showcheck:
 	for i in $(SUBDIRS) ; do \
 		make -C $$i showcheck ; \

+ 0 - 2
socl/README

@@ -3,5 +3,3 @@ StarPU's OpenCL interface
 
 This directory contains an OpenCL implementation that can
 be used as a replacement of the classic StarPU's API.
-
-OpenCL applications need to be compiled using provided headers.

+ 0 - 29
socl/socl-1.0.pc.in

@@ -1,29 +0,0 @@
-# StarPU --- Runtime system for heterogeneous multicore architectures.
-#
-# Copyright (C) 2009-2011  Université de Bordeaux 1
-# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
-#
-# StarPU is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Lesser General Public License as published by
-# the Free Software Foundation; either version 2.1 of the License, or (at
-# your option) any later version.
-#
-# StarPU is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-#
-# See the GNU Lesser General Public License in COPYING.LGPL for more details.
-
-prefix=@prefix@
-exec_prefix=@exec_prefix@
-libdir=@libdir@
-includedir=@includedir@
-
-Name: socl
-Description: offers OpenCL implementation on top of StarPU
-Version: @PACKAGE_VERSION@
-Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@/socl
-Libs: -L${libdir} -lsocl-@STARPU_EFFECTIVE_VERSION@
-Libs.private: @LDFLAGS@ @LIBS@
-Requires: starpu-1.0
-Requires.private:

+ 0 - 29
socl/socl-1.1.pc.in

@@ -1,29 +0,0 @@
-# StarPU --- Runtime system for heterogeneous multicore architectures.
-#
-# Copyright (C) 2009-2011  Université de Bordeaux 1
-# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
-#
-# StarPU is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Lesser General Public License as published by
-# the Free Software Foundation; either version 2.1 of the License, or (at
-# your option) any later version.
-#
-# StarPU is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-#
-# See the GNU Lesser General Public License in COPYING.LGPL for more details.
-
-prefix=@prefix@
-exec_prefix=@exec_prefix@
-libdir=@libdir@
-includedir=@includedir@
-
-Name: socl
-Description: offers OpenCL implementation on top of StarPU
-Version: @PACKAGE_VERSION@
-Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@/socl
-Libs: -L${libdir} -lsocl-@STARPU_EFFECTIVE_VERSION@
-Libs.private: @LDFLAGS@ @LIBS@
-Requires: starpu-1.0
-Requires.private:

+ 0 - 29
socl/socl-1.2.pc.in

@@ -1,29 +0,0 @@
-# StarPU --- Runtime system for heterogeneous multicore architectures.
-#
-# Copyright (C) 2009-2011  Université de Bordeaux 1
-# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
-#
-# StarPU is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Lesser General Public License as published by
-# the Free Software Foundation; either version 2.1 of the License, or (at
-# your option) any later version.
-#
-# StarPU is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-#
-# See the GNU Lesser General Public License in COPYING.LGPL for more details.
-
-prefix=@prefix@
-exec_prefix=@exec_prefix@
-libdir=@libdir@
-includedir=@includedir@
-
-Name: socl
-Description: offers OpenCL implementation on top of StarPU
-Version: @PACKAGE_VERSION@
-Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@/socl
-Libs: -L${libdir} -lsocl-@STARPU_EFFECTIVE_VERSION@
-Libs.private: @LDFLAGS@ @LIBS@
-Requires: starpu-1.0
-Requires.private:

+ 0 - 1
socl/vendors/socl.icd.in

@@ -1,2 +1 @@
 @STARPU_BUILD_DIR@/socl/src/.libs/libsocl-@STARPU_EFFECTIVE_VERSION@.so
-

+ 82 - 74
src/datawizard/memalloc.c

@@ -20,22 +20,24 @@
 #include <datawizard/footprint.h>
 #include <starpu.h>
 
-/* This per-node RW-locks protect mc_list and memchunk_cache entries */
-/* Note: handle header lock is always taken before this */
-static starpu_pthread_rwlock_t mc_rwlock[STARPU_MAXNODES];
-
 /* This per-node spinlock protect lru_list */
 static struct _starpu_spinlock lru_rwlock[STARPU_MAXNODES];
 
 /* Last Recently used memory chunkgs */
 static struct _starpu_mem_chunk_lru_list *starpu_lru_list[STARPU_MAXNODES];
 
+
+/* These per-node RW-locks protect mc_list and memchunk_cache entries */
+/* Note: handle header lock is always taken before this */
+static starpu_pthread_rwlock_t mc_rwlock[STARPU_MAXNODES];
+
 /* Potentially in use memory chunks */
 static struct _starpu_mem_chunk_list *mc_list[STARPU_MAXNODES];
 
 /* Explicitly caches memory chunks that can be reused */
 static struct _starpu_mem_chunk_list *memchunk_cache[STARPU_MAXNODES];
 
+
 /* When reclaiming memory to allocate, we reclaim MAX(what_is_to_reclaim_on_device, data_size_coefficient*data_size) */
 const unsigned starpu_memstrategy_data_size_coefficient=2;
 
@@ -73,22 +75,6 @@ void _starpu_deinit_mem_chunk_lists(void)
  *	Manipulate subtrees
  */
 
-static void lock_all_subtree(starpu_data_handle_t handle)
-{
-	unsigned child;
-
-	/* lock parent */
-	while (_starpu_spin_trylock(&handle->header_lock))
-		_starpu_datawizard_progress(_starpu_memory_node_get_local_key(), 0);
-
-	/* lock all sub-subtrees children */
-	for (child = 0; child < handle->nchildren; child++)
-	{
-		starpu_data_handle_t child_handle = starpu_data_get_child(handle, child);
-		lock_all_subtree(child_handle);
-	}
-}
-
 static void unlock_all_subtree(starpu_data_handle_t handle)
 {
 	/* lock all sub-subtrees children
@@ -105,6 +91,30 @@ static void unlock_all_subtree(starpu_data_handle_t handle)
 	_starpu_spin_unlock(&handle->header_lock);
 }
 
+static int lock_all_subtree(starpu_data_handle_t handle)
+{
+	int child;
+
+	/* lock parent */
+	if (_starpu_spin_trylock(&handle->header_lock))
+		/* the handle is busy, abort */
+		return 0;
+
+	/* lock all sub-subtrees children */
+	for (child = 0; child < (int) handle->nchildren; child++)
+	{
+		if (!lock_all_subtree(starpu_data_get_child(handle, child))) {
+			/* Some child is busy, abort */
+			while (--child >= 0)
+				/* Unlock what we have already uselessly locked */
+				unlock_all_subtree(starpu_data_get_child(handle, child));
+			return 0;
+		}
+	}
+
+	return 1;
+}
+
 static unsigned may_free_subtree(starpu_data_handle_t handle, unsigned node)
 {
 	/* we only free if no one refers to the leaf */
@@ -332,8 +342,9 @@ static size_t try_to_free_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node)
 	{
 		STARPU_ASSERT(mc->replicate);
 
-		while (_starpu_spin_trylock(&handle->header_lock))
-			_starpu_datawizard_progress(_starpu_memory_node_get_local_key(), 0);
+		if (_starpu_spin_trylock(&handle->header_lock))
+			/* Handle is busy, abort */
+			return 0;
 
 		if (mc->replicate->refcnt == 0)
 		{
@@ -349,10 +360,8 @@ static size_t try_to_free_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node)
 	else
 	{
 		/* try to lock all the subtree */
-		lock_all_subtree(handle);
-	      
-		/* check if they are all "free" */
-		if (may_free_subtree(handle, node))
+		/* and check if they are all "free" */
+		if (lock_all_subtree(handle) && may_free_subtree(handle, node))
 		{
 			int target = -1;
 
@@ -381,10 +390,10 @@ static size_t try_to_free_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node)
 				/* now the actual buffer may be freed */
 				freed = do_free_mem_chunk(mc, node);
 			}
-		}
 
-		/* unlock the leafs */
-		unlock_all_subtree(handle);
+			/* unlock the tree */
+			unlock_all_subtree(handle);
+		}
 	}
 	return freed;
 }
@@ -439,10 +448,8 @@ static unsigned try_to_reuse_mem_chunk(struct _starpu_mem_chunk *mc, unsigned no
 	STARPU_ASSERT(old_data);
 
 	/* try to lock all the subtree */
-	lock_all_subtree(old_data);
-
-	/* check if they are all "free" */
-	if (may_free_subtree(old_data, node))
+	/* and check if they are all "free" */
+	if (lock_all_subtree(old_data) && may_free_subtree(old_data, node))
 	{
 		success = 1;
 
@@ -452,10 +459,10 @@ static unsigned try_to_reuse_mem_chunk(struct _starpu_mem_chunk *mc, unsigned no
 
 		/* now replace the previous data */
 		reuse_mem_chunk(node, replicate, mc, is_already_in_mc_list);
-	}
 
-	/* unlock the leafs */
-	unlock_all_subtree(old_data);
+		/* unlock the tree */
+		unlock_all_subtree(old_data);
+	}
 
 	return success;
 }
@@ -545,19 +552,27 @@ static unsigned try_to_find_reusable_mem_chunk(unsigned node, starpu_data_handle
 static size_t flush_memchunk_cache(unsigned node, size_t reclaim)
 {
 	struct _starpu_mem_chunk *mc;
+	struct _starpu_mem_chunk_list *busy_memchunk_cache;
 
 	size_t freed = 0;
 
+	if (_starpu_mem_chunk_list_empty(memchunk_cache[node]))
+		return 0;
+
+	busy_memchunk_cache = _starpu_mem_chunk_list_new();
+
 	STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
 	while (!_starpu_mem_chunk_list_empty(memchunk_cache[node])) {
 		mc = _starpu_mem_chunk_list_pop_front(memchunk_cache[node]);
-		STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
-
 		starpu_data_handle_t handle = mc->data;
 
 		if (handle)
-			while (_starpu_spin_trylock(&handle->header_lock))
-				_starpu_datawizard_progress(_starpu_memory_node_get_local_key(), 0);
+			if (_starpu_spin_trylock(&handle->header_lock)) {
+				/* The handle is still busy, leave this chunk for later */
+				_starpu_mem_chunk_list_push_front(busy_memchunk_cache, mc);
+				continue;
+			}
+
 		freed += free_memory_on_node(mc, node);
 		if (handle)
 			_starpu_spin_unlock(&handle->header_lock);
@@ -565,10 +580,11 @@ static size_t flush_memchunk_cache(unsigned node, size_t reclaim)
 		free(mc->chunk_interface);
 		_starpu_mem_chunk_delete(mc);
 
-		STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
-		if (reclaim && freed>reclaim)
+		if (reclaim && freed >= reclaim)
 			break;
 	}
+	_starpu_mem_chunk_list_push_list_front(busy_memchunk_cache, memchunk_cache[node]);
+	_starpu_mem_chunk_list_delete(busy_memchunk_cache);
 	STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
 	return freed;
 }
@@ -583,7 +599,7 @@ static size_t free_potentially_in_use_mc(unsigned node, unsigned force, size_t r
 {
 	size_t freed = 0;
 
-	struct _starpu_mem_chunk *mc, *next_mc = (void*) -1;
+	struct _starpu_mem_chunk *mc, *next_mc;
 
 	/*
 	 * We have to unlock mc_rwlock before locking header_lock, so we have
@@ -593,50 +609,37 @@ static size_t free_potentially_in_use_mc(unsigned node, unsigned force, size_t r
 	 * finding anything to free.
 	 */
 
-	while (1)
-	{
-		STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
-
-		if (_starpu_mem_chunk_list_empty(mc_list[node]) || !next_mc)
-		{
-			STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
-			/* We reached the end of the list :/ */
-			break;
-		}
-
-		if (next_mc == (void*) -1) {
-			/* First iteration ever, start from beginning */
-			mc = _starpu_mem_chunk_list_begin(mc_list[node]);
-		} else {
-			/* Try to restart from where we were */
-			for (mc = _starpu_mem_chunk_list_begin(mc_list[node]);
-			     mc != _starpu_mem_chunk_list_end(mc_list[node]);
-			     mc = _starpu_mem_chunk_list_next(mc))
-				if (mc == next_mc)
-					/* Found it, restart from there.  */
-					break;
-
-			if (mc == _starpu_mem_chunk_list_end(mc_list[node]))
-				/* Couldn't find next_mc, restart from the beginning :/ */
-				mc = _starpu_mem_chunk_list_begin(mc_list[node]);
-		}
+restart:
+	STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
 
-		/* Remember where to try next */
+	for (mc = _starpu_mem_chunk_list_begin(mc_list[node]);
+	     mc != _starpu_mem_chunk_list_end(mc_list[node]);
+	     mc = next_mc)
+	{
+		/* mc hopefully gets out of the list, we thus need to prefetch
+		 * the next element */
 		next_mc = _starpu_mem_chunk_list_next(mc);
-		STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
 
 		if (!force)
 		{
 			freed += try_to_free_mem_chunk(mc, node);
 
-			if (reclaim && freed > reclaim)
+			if (reclaim && freed >= reclaim)
 				break;
 		}
 		else
 		{
 			starpu_data_handle_t handle = mc->data;
 
-			_starpu_spin_lock(&handle->header_lock);
+			if (_starpu_spin_trylock(&handle->header_lock))
+			{
+				/* Ergl. We are shutting down, but somebody is
+				 * still locking the handle. That's not
+				 * supposed to happen, but better be safe by
+				 * letting it go through. */
+				STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
+				goto restart;
+			}
 
 			/* We must free the memory now, because we are
 			 * terminating the drivers: note that data coherency is
@@ -646,6 +649,7 @@ static size_t free_potentially_in_use_mc(unsigned node, unsigned force, size_t r
 			_starpu_spin_unlock(&handle->header_lock);
 		}
 	}
+	STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
 
 	return freed;
 }
@@ -768,8 +772,12 @@ void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _star
 		_starpu_mem_chunk_delete(mc);
 	}
 	else
+	{
 		/* put it in the list of buffers to be removed */
+		STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
 		_starpu_mem_chunk_list_push_front(memchunk_cache[node], mc);
+		STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
+	}
 }
 
 /*

+ 2 - 1
src/sched_policies/parallel_eager.c

@@ -179,7 +179,8 @@ static int push_task_peager_policy(struct starpu_task *task)
 		worker = workers->get_next(workers, &it);
 		int master = data->master_id[worker];
 		/* If this is not a CPU, then the worker simply grabs tasks from the fifo */
-		if (starpu_worker_get_type(worker) != STARPU_CPU_WORKER  || master == worker)
+		if (!starpu_worker_is_combined_worker(worker) &&
+				starpu_worker_get_type(worker) != STARPU_CPU_WORKER  || master == worker)
 		{
 			starpu_pthread_mutex_t *sched_mutex;
 			starpu_pthread_cond_t *sched_cond;