Selaa lähdekoodia

mic: merge trunk

Thibaud Lambert 12 vuotta sitten
vanhempi
commit
8021d861b4
92 muutettua tiedostoa jossa 2132 lisäystä ja 1108 poistoa
  1. 6 0
      ChangeLog
  2. 7 3
      configure.ac
  3. 16 2
      doc/doxygen/Makefile.am
  4. 25 25
      doc/doxygen/chapters/advanced_examples.doxy
  5. 9 9
      doc/doxygen/chapters/api/codelet_and_tasks.doxy
  6. 4 4
      doc/doxygen/chapters/api/cuda_extensions.doxy
  7. 51 51
      doc/doxygen/chapters/api/data_interfaces.doxy
  8. 21 1
      doc/doxygen/chapters/api/data_management.doxy
  9. 9 9
      doc/doxygen/chapters/api/data_partition.doxy
  10. 1 1
      doc/doxygen/chapters/api/explicit_dependencies.doxy
  11. 1 1
      doc/doxygen/chapters/api/insert_task.doxy
  12. 7 7
      doc/doxygen/chapters/api/lower_bound.doxy
  13. 4 0
      doc/doxygen/chapters/api/mic_extensions.doxy
  14. 30 30
      doc/doxygen/chapters/api/mpi.doxy
  15. 4 4
      doc/doxygen/chapters/api/multiformat_data_interface.doxy
  16. 8 8
      doc/doxygen/chapters/api/opencl_extensions.doxy
  17. 1 1
      doc/doxygen/chapters/api/parallel_tasks.doxy
  18. 5 5
      doc/doxygen/chapters/api/performance_model.doxy
  19. 4 0
      doc/doxygen/chapters/api/scc_extensions.doxy
  20. 7 7
      doc/doxygen/chapters/api/scheduling_context_hypervisor.doxy
  21. 2 2
      doc/doxygen/chapters/api/scheduling_contexts.doxy
  22. 4 4
      doc/doxygen/chapters/api/scheduling_policy.doxy
  23. 7 7
      doc/doxygen/chapters/api/task_bundles.doxy
  24. 7 7
      doc/doxygen/chapters/api/task_lists.doxy
  25. 44 44
      doc/doxygen/chapters/api/top.doxy
  26. 1 1
      doc/doxygen/chapters/api/workers.doxy
  27. 30 133
      doc/doxygen/chapters/basic_examples.doxy
  28. 8 8
      doc/doxygen/chapters/building.doxy
  29. 45 0
      doc/doxygen/chapters/code/scal_pragma.cu
  30. 24 18
      doc/doxygen/chapters/code/vector_scal_opencl.c
  31. 14 3
      doc/doxygen/chapters/environment_variables.doxy
  32. 3 3
      doc/doxygen/chapters/fft_support.doxy
  33. 50 0
      doc/doxygen/chapters/files.doxy
  34. 1 0
      doc/doxygen/chapters/introduction.doxy
  35. 2 2
      doc/doxygen/chapters/mpi_support.doxy
  36. 12 12
      doc/doxygen/chapters/optimize_performance.doxy
  37. 15 15
      doc/doxygen/chapters/performance_feedback.doxy
  38. 52 0
      doc/doxygen/chapters/socl_opencl_extensions.doxy
  39. 14 0
      doc/doxygen/chapters/tips_and_tricks.doxy
  40. 20 0
      doc/doxygen/dev/checkDoc.sh
  41. 38 0
      doc/doxygen/dev/starpu_check_documented.py
  42. 78 0
      doc/doxygen/dev/starpu_check_undocumented.sh
  43. 0 0
      doc/doxygen/dev/starpu_funcs.cocci
  44. 33 2
      doc/doxygen/doxygen-config.cfg.in
  45. 2 2
      doc/doxygen/doxygen.cfg
  46. 46 0
      doc/doxygen/refman.tex
  47. 1 1
      doc/texinfo/chapters/api.texi
  48. 0 0
      doc/texinfo/dev/starpu_check_documented.py
  49. 0 0
      doc/texinfo/dev/starpu_check_undocumented.sh
  50. 28 0
      doc/texinfo/dev/starpu_funcs.cocci
  51. 3 0
      include/starpu_data.h
  52. 2 2
      include/starpu_data_interfaces.h
  53. 3 3
      include/starpu_opencl.h
  54. 4 4
      include/starpu_sched_ctx.h
  55. 1 1
      include/starpu_task.h
  56. 1 1
      include/starpu_task_util.h
  57. 11 39
      include/starpu_top.h
  58. 1 0
      mic-configure
  59. 203 104
      mpi/src/starpu_mpi.c
  60. 1 0
      mpi/src/starpu_mpi_datatype.c
  61. 9 3
      mpi/src/starpu_mpi_private.c
  62. 28 8
      mpi/src/starpu_mpi_private.h
  63. 12 0
      mpi/tests/Makefile.am
  64. 102 0
      mpi/tests/mpi_earlyrecv.c
  65. 94 0
      mpi/tests/mpi_earlyrecv2.c
  66. 107 0
      mpi/tests/mpi_redux.c
  67. 6 3
      sc_hypervisor/examples/Makefile.am
  68. 3 0
      sc_hypervisor/include/sc_hypervisor_monitoring.h
  69. 13 1
      sc_hypervisor/include/sc_hypervisor_policy.h
  70. 1 0
      sc_hypervisor/src/Makefile.am
  71. 67 38
      sc_hypervisor/src/hypervisor_policies/feft_lp_policy.c
  72. 0 1
      sc_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c
  73. 104 61
      sc_hypervisor/src/hypervisor_policies/teft_lp_policy.c
  74. 18 2
      sc_hypervisor/src/policies_utils/lp_tools.c
  75. 120 178
      sc_hypervisor/src/policies_utils/policy_tools.c
  76. 168 0
      sc_hypervisor/src/policies_utils/speed.c
  77. 30 87
      sc_hypervisor/src/sc_hypervisor.c
  78. 6 4
      sc_hypervisor/src/sc_hypervisor_intern.h
  79. 3 0
      socl/Makefile.am
  80. 1 0
      socl/vendors/install/socl.icd.in
  81. 1 1
      src/core/combined_workers.c
  82. 1 7
      src/core/dependencies/implicit_data_deps.c
  83. 1 1
      src/core/perfmodel/perfmodel_history.c
  84. 1 1
      src/core/perfmodel/perfmodel_print.c
  85. 27 24
      src/core/sched_policy.c
  86. 3 1
      src/core/workers.c
  87. 95 79
      src/datawizard/memalloc.c
  88. 22 5
      src/datawizard/user_interactions.c
  89. 4 4
      src/debug/traces/starpu_paje.c
  90. 2 8
      src/sched_policies/deque_modeling_policy_data_aware.c
  91. 6 5
      tests/Makefile.am
  92. 46 0
      tests/main/codelet_null_callback.c

+ 6 - 0
ChangeLog

@@ -36,6 +36,10 @@ New features:
 Small features:
   * Add cl_arg_free field to enable automatic free(cl_arg) on task
     destroy.
+  * New functions starpu_data_acquire_cb_sequential_consistency() and
+    starpu_data_acquire_on_node_cb_sequential_consistency() which allow
+    to enable or disable sequential consistency
+
 
 StarPU 1.1.0 (svn revision xxxx)
 ==============================================
@@ -150,6 +154,8 @@ Small features:
   * New function starpu_get_version() to return as 3 integers the
     release version of StarPU.
   * Enable by default data allocation cache
+  * Explicitly name the non-sleeping-non-running time "Overhead", and use
+    another color in vite traces.
 
 Changes:
   * Rename all filter functions to follow the pattern

+ 7 - 3
configure.ac

@@ -2160,9 +2160,9 @@ AC_ARG_ENABLE(build-doc, [AS_HELP_STRING([--disable-build-doc],
 			[disable building of documentation])],
 			enable_build_doc=$enableval, enable_build_doc=yes)
 
-# Check whether texi2dvi is installed
-AC_PATH_PROG(texi2dvicommand, texi2dvi)
-if test "$texi2dvicommand" = "" ; then
+# Check whether doxygen is installed
+AC_PATH_PROG(doxygencommand, doxygen)
+if test "$doxygencommand" = "" ; then
 	enable_build_doc="no"
 fi
 
@@ -2197,14 +2197,17 @@ AC_CONFIG_COMMANDS([executable-scripts], [
 
 # Create links to ICD files in build/socl/vendors directory. SOCL will use this
 # directory as the OCL_ICD_VENDORS directory
+SOCL_VENDORS="vendors/install/socl.icd"
 for icd in /etc/OpenCL/vendors/*.icd ; do
 	if test -f $icd ; then
 	        if test "$(basename $icd)" != "socl.icd" ; then
         		new_icd=$(basename $icd)
 			AC_CONFIG_LINKS([socl/vendors/$new_icd:$icd])
+			SOCL_VENDORS="$SOCL_VENDORS vendors/$new_icd"
 		fi
         fi
 done
+AC_SUBST(SOCL_VENDORS)
 
 AC_CONFIG_FILES(tests/regression/regression.sh tests/regression/profiles tests/regression/profiles.build.only)
 AC_CONFIG_HEADER(src/common/config.h include/starpu_config.h gcc-plugin/include/starpu-gcc/config.h starpu-top/config.h)
@@ -2220,6 +2223,7 @@ AC_OUTPUT([
 	socl/src/Makefile
 	socl/examples/Makefile
 	socl/vendors/socl.icd
+	socl/vendors/install/socl.icd
 	libstarpu.pc
 	starpu-1.0.pc
 	starpu-1.1.pc

+ 16 - 2
doc/doxygen/Makefile.am

@@ -42,8 +42,10 @@ chapters =	\
 	chapters/fdl-1.3.doxy \
 	chapters/scaling-vector-example.doxy \
 	chapters/mic_scc_support.doxy \
+	chapters/files.doxy \
 	chapters/code/hello_pragma2.c \
 	chapters/code/hello_pragma.c \
+	chapters/code/scal_pragma.cu \
 	chapters/code/matmul_pragma.c \
 	chapters/code/matmul_pragma2.c \
 	chapters/code/cholesky_pragma.c \
@@ -89,6 +91,9 @@ chapters =	\
 	chapters/api/versioning.doxy \
 	chapters/api/workers.doxy
 
+starpu_config.h: $(top_srcdir)/include/starpu_config.h.in
+	sed 's/#undef \(.*\)/#define \1 1/' $< > $@
+
 chapters/version.sty: $(chapters)
 	@-for f in $(chapters) ; do \
                 if test -f $(top_srcdir)/doc/doxygen/$$f ; then stat --format=%Y $(top_srcdir)/doc/doxygen/$$f 2>/dev/null ; fi \
@@ -134,6 +139,7 @@ EXTRA_DIST	= 		\
 
 dox_inputs = $(DOX_CONFIG) 				\
 	$(chapters) 					\
+	starpu_config.h					\
 	chapters/version.sty				\
 	chapters/version.html				\
 	$(top_srcdir)/include/starpu.h			\
@@ -165,12 +171,19 @@ dox_inputs = $(DOX_CONFIG) 				\
 	$(top_srcdir)/include/starpu_driver.h		\
 	$(top_srcdir)/include/starpu_stdlib.h		\
 	$(top_srcdir)/include/starpu_thread.h		\
-	$(top_srcdir)/include/starpu_thread_util.h
+	$(top_srcdir)/include/starpu_thread_util.h	\
+	$(top_srcdir)/mpi/include/starpu_mpi.h 		\
+	$(top_srcdir)/sc_hypervisor/include/sc_hypervisor.h 		\
+	$(top_srcdir)/sc_hypervisor/include/sc_hypervisor_config.h 	\
+	$(top_srcdir)/sc_hypervisor/include/sc_hypervisor_lp.h		\
+	$(top_srcdir)/sc_hypervisor/include/sc_hypervisor_monitoring.h	\
+	$(top_srcdir)/sc_hypervisor/include/sc_hypervisor_policy.h
 
 $(DOX_TAG): $(dox_inputs)
 	rm -fr $(DOX_HTML_DIR) $(DOX_LATEX_DIR)
 	$(DOXYGEN) $(DOX_CONFIG)
 	sed -i 's/ModuleDocumentation <\/li>/<a class="el" href="modules.html">Modules<\/a>/' html/index.html
+	sed -i 's/FileDocumentation <\/li>/<a class="el" href="files.html">Files<\/a>/' html/index.html
 
 dist_pdf_DATA = $(DOX_PDF)
 
@@ -181,6 +194,7 @@ $(DOX_PDF): $(DOX_TAG) refman.tex
 	sed -i -e 's/__env__/\\_Environment Variables!/' -e 's/\\-\\_\\-\\-\\_\\-env\\-\\_\\-\\-\\_\\-//' ExecutionConfigurationThroughEnvironmentVariables.tex ;\
 	sed -i -e 's/__configure__/\\_Configure Options!/' -e 's/\\-\\_\\-\\-\\_\\-configure\\-\\_\\-\\-\\_\\-//' CompilationConfiguration.tex ;\
 	sed -i s'/\\item Module\\-Documentation/\\item \\hyperlink{ModuleDocumentation}{Module Documentation}/' index.tex ;\
+	sed -i s'/\\item File\\-Documentation/\\item \\hyperlink{FileDocumentation}{File Documentation}/' index.tex ;\
 	$(PDFLATEX) refman.tex; \
 	$(MAKEINDEX) refman.idx;\
 	$(PDFLATEX) refman.tex; \
@@ -195,7 +209,7 @@ $(DOX_PDF): $(DOX_TAG) refman.tex
 	done; \
 	mv refman.pdf ../$(DOX_PDF)
 
-CLEANFILES = $(DOX_TAG) \
+CLEANFILES = $(DOX_TAG) starpu_config.h \
     -r \
     $(DOX_HTML_DIR) \
     $(DOX_LATEX_DIR) \

+ 25 - 25
doc/doxygen/chapters/advanced_examples.doxy

@@ -92,12 +92,12 @@ thus be very fast. The function starpu_cuda_get_device_properties()
 provides a quick access to CUDA properties of CUDA devices to achieve
 such efficiency.
 
-Another example is compiling CUDA code for various compute capabilities,
+Another example is to compile CUDA code for various compute capabilities,
 resulting with two CUDA functions, e.g. <c>scal_gpu_13</c> for compute capability
 1.3, and <c>scal_gpu_20</c> for compute capability 2.0. Both functions can be
-provided to StarPU by using <c>cuda_funcs</c>, and <c>can_execute</c> can then be
-used to rule out the <c>scal_gpu_20</c> variant on a CUDA device which
-will not be able to execute it:
+provided to StarPU by using starpu_codelet::cuda_funcs, and
+starpu_codelet::can_execute can then be used to rule out the
+<c>scal_gpu_20</c> variant on a CUDA device which will not be able to execute it:
 
 \code{.c}
 static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
@@ -390,9 +390,9 @@ starpu_perfmodel::size_base however permits the application to
 override that, when for instance some of the data do not matter for
 task cost (e.g. mere reference table), or when using sparse
 structures (in which case it is the number of non-zeros which matter), or when
-there is some hidden parameter such as the number of iterations, etc. The
-<c>examples/pi</c> examples uses this to include the number of iterations in the
-base.
+there is some hidden parameter such as the number of iterations, etc.
+The example in the directory <c>examples/pi</c> uses this to include
+the number of iterations in the base.
 
 How to use schedulers which can benefit from such performance model is explained
 in \ref TaskSchedulingPolicy.
@@ -427,11 +427,11 @@ starpu_bound_print_lp() or starpu_bound_print_mps() can then be used
 to output a Linear Programming problem corresponding to the schedule
 of your tasks. Run it through <c>lp_solve</c> or any other linear
 programming solver, and that will give you a lower bound for the total
-execution time of your tasks. If StarPU was compiled with the glpk
-library installed, starpu_bound_compute() can be used to solve it
+execution time of your tasks. If StarPU was compiled with the library
+<c>glpk</c> installed, starpu_bound_compute() can be used to solve it
 immediately and get the optimized minimum, in ms. Its parameter
 <c>integer</c> allows to decide whether integer resolution should be
-computed and returned too.
+computed and returned too.
 
 The <c>deps</c> parameter tells StarPU whether to take tasks, implicit
 data, and tag dependencies into account. Tags released in a callback
@@ -549,7 +549,7 @@ STARPU_DATA_ACQUIRE_CB(i_handle, STARPU_R,
 The macro ::STARPU_DATA_ACQUIRE_CB submits an asynchronous request for
 acquiring data <c>i</c> for the main application, and will execute the code
 given as third parameter when it is acquired. In other words, as soon as the
-value of <c>i</c> computed by the <c>which_index</c> codelet can be read, the
+value of <c>i</c> computed by the codelet <c>which_index</c> can be read, the
 portion of code passed as third parameter of ::STARPU_DATA_ACQUIRE_CB will
 be executed, and is allowed to read from <c>i</c> to use it e.g. as an
 index. Note that this macro is only available when compiling StarPU with
@@ -609,7 +609,7 @@ struct starpu_codelet accumulate_variable_cl =
 }
 \endcode
 
-and attaches them as reduction methods for its <c>dtq</c> handle:
+and attaches them as reduction methods for its handle <c>dtq</c>:
 
 \code{.c}
 starpu_variable_data_register(&dtq_handle, -1, NULL, sizeof(type));
@@ -674,7 +674,7 @@ tasks.
 Data can sometimes be entirely produced by a task, and entirely consumed by
 another task, without the need for other parts of the application to access
 it. In such case, registration can be done without prior allocation, by using
-the special -1 memory node number, and passing a zero pointer. StarPU will
+the special memory node number <c>-1</c>, and passing a zero pointer. StarPU will
 actually allocate memory only when the task creating the content gets scheduled,
 and destroy it on unregistration.
 
@@ -704,9 +704,8 @@ function, and free it at the end, but that would be costly. It could also
 allocate one buffer per worker (similarly to \ref
 HowToInitializeAComputationLibraryOnceForEachWorker), but that would
 make them systematic and permanent. A more  optimized way is to use
-the ::STARPU_SCRATCH data access mode, as examplified below,
-
-which provides per-worker buffers without content consistency.
+the data access mode ::STARPU_SCRATCH, as exemplified below, which
+provides per-worker buffers without content consistency.
 
 \code{.c}
 starpu_vector_data_register(&workspace, -1, 0, sizeof(float));
@@ -723,7 +722,7 @@ the other on the same worker. Also, if for instance GPU memory becomes scarce,
 StarPU will notice that it can free such buffers easily, since the content does
 not matter.
 
-The <c>examples/pi</c> example uses scratches for some temporary buffer.
+The example <c>examples/pi</c> uses scratches for some temporary buffer.
 
 \section ParallelTasks Parallel Tasks
 
@@ -734,8 +733,9 @@ parallel CPU implementation of the computation to be achieved. This can also be
 useful to improve the load balance between slow CPUs and fast GPUs: since CPUs
 work collectively on a single task, the completion time of tasks on CPUs become
 comparable to the completion time on GPUs, thus relieving from granularity
-discrepancy concerns. Hwloc support needs to be enabled to get good performance,
-otherwise StarPU will not know how to better group cores.
+discrepancy concerns. <c>hwloc</c> support needs to be enabled to get
+good performance, otherwise StarPU will not know how to better group
+cores.
 
 Two modes of execution exist to accommodate existing usages.
 
@@ -808,8 +808,8 @@ buffer.
 
 To benefit from parallel tasks, a parallel-task-aware StarPU scheduler has to
 be used. When exposed to codelets with a flag ::STARPU_FORKJOIN or
-::STARPU_SPMD, the <c>pheft</c> (parallel-heft) and <c>peager</c>
-(parallel eager) schedulers will indeed also try to execute tasks with
+::STARPU_SPMD, the schedulers <c>pheft</c> (parallel-heft) and <c>peager</c>
+(parallel eager) will indeed also try to execute tasks with
 several CPUs. It will automatically try the various available combined
 worker sizes (making several measurements for each worker size) and
 thus be able to avoid choosing a large combined worker if the codelet
@@ -846,9 +846,9 @@ from different threads, due to the use of global variables in their sequential
 sections for instance.
 
 The solution is then to use only one combined worker at a time.  This can be
-done by setting the field starpu_conf::single_combined_worker to 1, or
+done by setting the field starpu_conf::single_combined_worker to <c>1</c>, or
 setting the environment variable \ref STARPU_SINGLE_COMBINED_WORKER
-to 1. StarPU will then run only one parallel task at a time (but other
+to <c>1</c>. StarPU will then run only one parallel task at a time (but other
 CPU and GPU tasks are not affected and can be run concurrently). The parallel
 task scheduler will however still try varying combined worker
 sizes to look for the most efficient ones.
@@ -1183,8 +1183,8 @@ directory <c>examples/basic_examples/dynamic_handles.c</c>.
 
 \section MoreExamples More Examples
 
-More examples are available in the StarPU sources in the <c>examples/</c>
-directory. Simple examples include:
+More examples are available in the StarPU sources in the directory
+<c>examples/</c>. Simple examples include:
 
 <dl>
 <dt> <c>incrementer/</c> </dt>

+ 9 - 9
doc/doxygen/chapters/api/codelet_and_tasks.doxy

@@ -325,7 +325,7 @@ purposes.
 This field has been made deprecated. One should use instead the
 field starpu_task::handles to specify the data handles accessed
 by the task. The access modes are now defined in the field
-starpu_codelet::mode.
+starpu_codelet::modes.
 \var starpu_task::handles
 Is an array of ::starpu_data_handle_t. It specifies the handles to the
 different pieces of data accessed by the task. The number of entries
@@ -534,7 +534,7 @@ It is possible to initialize statically allocated tasks with
 this value. This is equivalent to initializing a structure starpu_task
 with the function starpu_task_init() function.
 
-\def STARPU_TASK_GET_HANDLE(struct starpu_task *task, int i)
+\def STARPU_TASK_GET_HANDLE(task, i)
 \ingroup API_Codelet_And_Tasks
 Return the \p i th data handle of the given task. If the task
 is defined with a static or dynamic number of handles, will either
@@ -542,7 +542,7 @@ return the \p i th element of the field starpu_task::handles or the \p
 i th element of the field starpu_task::dyn_handles (see \ref
 SettingTheDataHandlesForATask)
 
-\def STARPU_TASK_SET_HANDLE(struct starpu_task *task, starpu_data_handle_t handle, int i)
+\def STARPU_TASK_SET_HANDLE(task, handle, i)
 \ingroup API_Codelet_And_Tasks
 Set the \p i th data handle of the given task with the given
 data handle. If the task is defined with a static or dynamic number of
@@ -551,7 +551,7 @@ starpu_task::handles or the \p i th element of the field
 starpu_task::dyn_handles (see \ref
 SettingTheDataHandlesForATask)
 
-\def STARPU_CODELET_GET_MODE(struct starpu_codelet *codelet, int i)
+\def STARPU_CODELET_GET_MODE(codelet, i)
 \ingroup API_Codelet_And_Tasks
 Return the access mode of the \p i th data handle of the given
 codelet. If the codelet is defined with a static or dynamic number of
@@ -560,7 +560,7 @@ starpu_codelet::modes or the \p i th element of the field
 starpu_codelet::dyn_modes (see \ref
 SettingTheDataHandlesForATask)
 
-\def STARPU_CODELET_SET_MODE(struct starpu_codelet *codelet, enum starpu_data_access_mode mode, int i)
+\def STARPU_CODELET_SET_MODE(codelet, mode, i)
 \ingroup API_Codelet_And_Tasks
 Set the access mode of the \p i th data handle of the given
 codelet. If the codelet is defined with a static or dynamic number of
@@ -569,7 +569,7 @@ starpu_codelet::modes or the \p i th element of the field
 starpu_codelet::dyn_modes (see \ref
 SettingTheDataHandlesForATask)
 
-\fn struct starpu_task * starpu_task_create(void)
+\fn struct starpu_task *starpu_task_create(void)
 \ingroup API_Codelet_And_Tasks
 Allocate a task structure and initialize it with default
 values. Tasks allocated dynamically with starpu_task_create() are
@@ -580,7 +580,7 @@ wait) and thus freed at any time. If the field starpu_task::destroy is
 explicitly unset, the resources used by the task have to be freed by
 calling starpu_task_destroy().
 
-\fn struct starpu_task * starpu_task_dup(struct starpu_task *task)
+\fn struct starpu_task *starpu_task_dup(struct starpu_task *task)
 \ingroup API_Codelet_And_Tasks
 Allocate a task structure which is the exact duplicate of the
 given task.
@@ -657,7 +657,7 @@ Return the number of submitted tasks which are ready for
 execution are already executing. It thus does not include tasks
 waiting for dependencies.
 
-\fn struct starpu_task * starpu_task_get_current(void)
+\fn struct starpu_task *starpu_task_get_current(void)
 \ingroup API_Codelet_And_Tasks
 This function returns the task currently executed by the
 worker, or <c>NULL</c> if it is called either from a thread that is not a
@@ -681,7 +681,7 @@ codelet implementation to be executed when executing the task.
 This function return the codelet implementation to be executed
 when executing the task.
 
-\fn void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t *deps,	void (*callback)(void *), void *callback_arg)
+\fn void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t *deps, void (*callback)(void *), void *callback_arg)
 \ingroup API_Codelet_And_Tasks
 This creates (and submits) an empty task that unlocks a tag once all
 its dependencies are fulfilled.

+ 4 - 4
doc/doxygen/chapters/api/cuda_extensions.doxy

@@ -26,7 +26,7 @@ create its own streams. Synchronizing with cudaThreadSynchronize() is
 allowed, but will reduce the likelihood of having all transfers
 overlapped.
 
-\fn const struct cudaDeviceProp * starpu_cuda_get_device_properties(unsigned workerid)
+\fn const struct cudaDeviceProp *starpu_cuda_get_device_properties(unsigned workerid)
 \ingroup API_CUDA_Extensions
 This function returns a pointer to device properties for worker
 \p workerid (assumed to be a CUDA worker).
@@ -35,11 +35,11 @@ This function returns a pointer to device properties for worker
 \ingroup API_CUDA_Extensions
 Report a CUDA error.
 
-\def STARPU_CUDA_REPORT_ERROR (cudaError_t status)
+\def STARPU_CUDA_REPORT_ERROR(status)
 \ingroup API_CUDA_Extensions
 Calls starpu_cuda_report_error(), passing the current function, file and line position.
 
-\fn int starpu_cuda_copy_async_sync (void *src_ptr, unsigned src_node, void *dst_ptr, unsigned dst_node, size_t ssize, cudaStream_t stream, enum cudaMemcpyKind kind)
+\fn int starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node, void *dst_ptr, unsigned dst_node, size_t ssize, cudaStream_t stream, enum cudaMemcpyKind kind)
 \ingroup API_CUDA_Extensions
 Copy \p ssize bytes from the pointer \p src_ptr on \p src_node
 to the pointer \p dst_ptr on \p dst_node. The function first tries to
@@ -72,7 +72,7 @@ every CUDA device.
 \ingroup API_CUDA_Extensions
 Report a cublas error.
 
-\def STARPU_CUBLAS_REPORT_ERROR (cublasStatus status)
+\def STARPU_CUBLAS_REPORT_ERROR(status)
 \ingroup API_CUDA_Extensions
 Calls starpu_cublas_report_error(), passing the current
 function, file and line position.

+ 51 - 51
doc/doxygen/chapters/api/data_interfaces.doxy

@@ -368,7 +368,7 @@ Return a device handle for the variable designated by
 \p interface, to be used on OpenCL. The offset documented below has to be
 used in addition to this.
 
-\def STARPU_VARIABLE_GET_OFFSET()
+\def STARPU_VARIABLE_GET_OFFSET(interface)
 \ingroup API_Data_Interfaces
 Return the offset in the variable designated by \p interface, to
 be used with the device handle.
@@ -404,29 +404,29 @@ Return the size of each element of the array designated by \p handle.
 \ingroup API_Data_Interfaces
 Return the local pointer associated with \p handle.
 
-\def STARPU_VECTOR_GET_PTR(void *interface)
+\def STARPU_VECTOR_GET_PTR(interface)
 \ingroup API_Data_Interfaces
 Return a pointer to the array designated by \p interface, valid on
 CPUs and CUDA only. For OpenCL, the device handle and offset need to
 be used instead.
 
-\def STARPU_VECTOR_GET_DEV_HANDLE(void *interface)
+\def STARPU_VECTOR_GET_DEV_HANDLE(interface)
 \ingroup API_Data_Interfaces
 Return a device handle for the array designated by \p interface,
 to be used on OpenCL. the offset documented below has to be used in
 addition to this.
 
-\def STARPU_VECTOR_GET_OFFSET(void *interface)
+\def STARPU_VECTOR_GET_OFFSET(interface)
 \ingroup API_Data_Interfaces
 Return the offset in the array designated by \p interface, to be
 used with the device handle.
 
-\def STARPU_VECTOR_GET_NX(void *interface)
+\def STARPU_VECTOR_GET_NX(interface)
 \ingroup API_Data_Interfaces
 Return the number of elements registered into the array
 designated by \p interface.
 
-\def STARPU_VECTOR_GET_ELEMSIZE(void *interface)
+\def STARPU_VECTOR_GET_ELEMSIZE(interface)
 \ingroup API_Data_Interfaces
 Return the size of each element of the array designated by
 \p interface.
@@ -479,39 +479,39 @@ Return the local pointer associated with \p handle.
 Return the size of the elements registered into the matrix
 designated by \p handle.
 
-\def STARPU_MATRIX_GET_PTR(void *interface)
+\def STARPU_MATRIX_GET_PTR(interface)
 \ingroup API_Data_Interfaces
 Return a pointer to the matrix designated by \p interface, valid
 on CPUs and CUDA devices only. For OpenCL devices, the device handle
 and offset need to be used instead.
 
-\def STARPU_MATRIX_GET_DEV_HANDLE(void *interface)
+\def STARPU_MATRIX_GET_DEV_HANDLE(interface)
 \ingroup API_Data_Interfaces
 Return a device handle for the matrix designated by \p interface,
 to be used on OpenCL. The offset documented below has to be used in
 addition to this.
 
-\def STARPU_MATRIX_GET_OFFSET(void *interface)
+\def STARPU_MATRIX_GET_OFFSET(interface)
 \ingroup API_Data_Interfaces
 Return the offset in the matrix designated by \p interface, to be
 used with the device handle.
 
-\def STARPU_MATRIX_GET_NX(void *interface)
+\def STARPU_MATRIX_GET_NX(interface)
 \ingroup API_Data_Interfaces
 Return the number of elements on the x-axis of the matrix
 designated by \p interface.
 
-\def STARPU_MATRIX_GET_NY(void *interface)
+\def STARPU_MATRIX_GET_NY(interface)
 \ingroup API_Data_Interfaces
 Return the number of elements on the y-axis of the matrix
 designated by \p interface.
 
-\def STARPU_MATRIX_GET_LD(void *interface)
+\def STARPU_MATRIX_GET_LD(interface)
 \ingroup API_Data_Interfaces
 Return the number of elements between each row of the matrix
 designated by \p interface. May be equal to nx when there is no padding.
 
-\def STARPU_MATRIX_GET_ELEMSIZE(void *interface)
+\def STARPU_MATRIX_GET_ELEMSIZE(interface)
 \ingroup API_Data_Interfaces
 Return the size of the elements registered into the matrix
 designated by \p interface.
@@ -522,7 +522,7 @@ designated by \p interface.
 \struct starpu_block_interface
 Block interface for 3D dense blocks
 \ingroup API_Data_Interfaces
-\struct starpu_block_interface::id
+\var starpu_block_interface::id
 identifier of the interface
 \var starpu_block_interface::ptr
 local pointer of the block
@@ -577,48 +577,48 @@ Return the local pointer associated with \p handle.
 Return the size of the elements of the block designated by
 \p handle.
 
-\def STARPU_BLOCK_GET_PTR(void *interface)
+\def STARPU_BLOCK_GET_PTR(interface)
 \ingroup API_Data_Interfaces
 Return a pointer to the block designated by \p interface.
 
-\def STARPU_BLOCK_GET_DEV_HANDLE(void *interface)
+\def STARPU_BLOCK_GET_DEV_HANDLE(interface)
 \ingroup API_Data_Interfaces
 Return a device handle for the block designated by \p interface,
 to be used on OpenCL. The offset document below has to be used in
 addition to this.
 
-\def STARPU_BLOCK_GET_OFFSET(void *interface)
+\def STARPU_BLOCK_GET_OFFSET(interface)
 \ingroup API_Data_Interfaces
 Return the offset in the block designated by \p interface, to be
 used with the device handle.
 
-\def STARPU_BLOCK_GET_NX(void *interface)
+\def STARPU_BLOCK_GET_NX(interface)
 \ingroup API_Data_Interfaces
 Return the number of elements on the x-axis of the block
 designated by \p interface.
 
-\def STARPU_BLOCK_GET_NY(void *interface)
+\def STARPU_BLOCK_GET_NY(interface)
 \ingroup API_Data_Interfaces
 Return the number of elements on the y-axis of the block
 designated by \p interface.
 
-\def STARPU_BLOCK_GET_NZ(void *interface)
+\def STARPU_BLOCK_GET_NZ(interface)
 \ingroup API_Data_Interfaces
 Return the number of elements on the z-axis of the block
 designated by \p interface.
 
-\def STARPU_BLOCK_GET_LDY(void *interface)
+\def STARPU_BLOCK_GET_LDY(interface)
 \ingroup API_Data_Interfaces
 Return the number of elements between each row of the block
 designated by \p interface. May be equal to nx when there is no padding.
 
-\def STARPU_BLOCK_GET_LDZ(void *interface)
+\def STARPU_BLOCK_GET_LDZ(interface)
 \ingroup API_Data_Interfaces
 Return the number of elements between each z plane of the block
 designated by \p interface. May be equal to nx*ny when there is no
 padding.
 
-\def STARPU_BLOCK_GET_ELEMSIZE(void *interface)
+\def STARPU_BLOCK_GET_ELEMSIZE(interface)
 \ingroup API_Data_Interfaces
 Return the size of the elements of the block designated by
 \p interface.
@@ -671,12 +671,12 @@ row pointers...) of the matrix designated by \p handle.
 Return a pointer to the non-zero values of the matrix
 designated by \p handle.
 
-\fn uint32_t * starpu_bcsr_get_local_colind(starpu_data_handle_t handle)
+\fn uint32_t *starpu_bcsr_get_local_colind(starpu_data_handle_t handle)
 \ingroup API_Data_Interfaces
 Return a pointer to the column index, which holds the positions
 of the non-zero entries in the matrix designated by \p handle.
 
-\fn uint32_t * starpu_bcsr_get_local_rowptr(starpu_data_handle_t handle)
+\fn uint32_t *starpu_bcsr_get_local_rowptr(starpu_data_handle_t handle)
 \ingroup API_Data_Interfaces
 Return the row pointer array of the matrix designated by
 \p handle.
 Return the number of columns in a block.
 Return the size of the elements in the matrix designated by
 \p handle.
 
-\def STARPU_BCSR_GET_NNZ(void *interface)
+\def STARPU_BCSR_GET_NNZ(interface)
 \ingroup API_Data_Interfaces
 Return the number of non-zero values in the matrix designated
 by \p interface.
 
-\def STARPU_BCSR_GET_NZVAL(void *interface)
+\def STARPU_BCSR_GET_NZVAL(interface)
 \ingroup API_Data_Interfaces
 Return a pointer to the non-zero values of the matrix
 designated by \p interface.
 
-\def STARPU_BCSR_GET_NZVAL_DEV_HANDLE(void *interface)
+\def STARPU_BCSR_GET_NZVAL_DEV_HANDLE(interface)
 \ingroup API_Data_Interfaces
 Return a device handle for the array of non-zero values in the
 matrix designated by \p interface. The offset documented below has to be
 used in addition to this.
 
-\def STARPU_BCSR_GET_COLIND(void *interface)
+\def STARPU_BCSR_GET_COLIND(interface)
 \ingroup API_Data_Interfaces
 Return a pointer to the column index of the matrix designated
 by \p interface.
 
-\def STARPU_BCSR_GET_COLIND_DEV_HANDLE(void *interface)
+\def STARPU_BCSR_GET_COLIND_DEV_HANDLE(interface)
 \ingroup API_Data_Interfaces
 Return a device handle for the column index of the matrix
 designated by \p interface. The offset documented below has to be used in
 addition to this.
 
-\def STARPU_BCSR_GET_ROWPTR(void *interface)
+\def STARPU_BCSR_GET_ROWPTR(interface)
 \ingroup API_Data_Interfaces
 Return a pointer to the row pointer array of the matrix
 designated by \p interface.
 
-\def STARPU_CSR_GET_ROWPTR_DEV_HANDLE(void *interface)
+\def STARPU_CSR_GET_ROWPTR_DEV_HANDLE(interface)
 \ingroup API_Data_Interfaces
 Return a device handle for the row pointer array of the matrix
 designated by \p interface. The offset documented below has to be used in
 addition to this.
 
-\def STARPU_BCSR_GET_OFFSET(void *interface)
+\def STARPU_BCSR_GET_OFFSET(interface)
 \ingroup API_Data_Interfaces
 Return the offset in the arrays (colind, rowptr, nzval) of the
 matrix designated by \p interface, to be used with the device handles.
@@ -780,12 +780,12 @@ row pointers...) of the matrix designated by \p handle.
 Return a local pointer to the non-zero values of the matrix
 designated by \p handle.
 
-\fn uint32_t * starpu_csr_get_local_colind(starpu_data_handle_t handle)
+\fn uint32_t *starpu_csr_get_local_colind(starpu_data_handle_t handle)
 \ingroup API_Data_Interfaces
 Return a local pointer to the column index of the matrix
 designated by \p handle.
 
-\fn uint32_t * starpu_csr_get_local_rowptr(starpu_data_handle_t handle)
+\fn uint32_t *starpu_csr_get_local_rowptr(starpu_data_handle_t handle)
 \ingroup API_Data_Interfaces
 Return a local pointer to the row pointer array of the matrix
 designated by \p handle.
@@ -795,60 +795,60 @@ designated by \p handle.
 Return the size of the elements registered into the matrix
 designated by \p handle.
 
-\def STARPU_CSR_GET_NNZ(void *interface)
+\def STARPU_CSR_GET_NNZ(interface)
 \ingroup API_Data_Interfaces
 Return the number of non-zero values in the matrix designated
 by \p interface.
 
-\def STARPU_CSR_GET_NROW(void *interface)
+\def STARPU_CSR_GET_NROW(interface)
 \ingroup API_Data_Interfaces
 Return the size of the row pointer array of the matrix
 designated by \p interface.
 
-\def STARPU_CSR_GET_NZVAL(void *interface)
+\def STARPU_CSR_GET_NZVAL(interface)
 \ingroup API_Data_Interfaces
 Return a pointer to the non-zero values of the matrix
 designated by \p interface.
 
-\def STARPU_CSR_GET_NZVAL_DEV_HANDLE(void *interface)
+\def STARPU_CSR_GET_NZVAL_DEV_HANDLE(interface)
 \ingroup API_Data_Interfaces
 Return a device handle for the array of non-zero values in the
 matrix designated by \p interface. The offset documented below has to be
 used in addition to this.
 
-\def STARPU_CSR_GET_COLIND(void *interface)
+\def STARPU_CSR_GET_COLIND(interface)
 \ingroup API_Data_Interfaces
 Return a pointer to the column index of the matrix designated
 by \p interface.
 
-\def STARPU_CSR_GET_COLIND_DEV_HANDLE(void *interface)
+\def STARPU_CSR_GET_COLIND_DEV_HANDLE(interface)
 \ingroup API_Data_Interfaces
 Return a device handle for the column index of the matrix
 designated by \p interface. The offset documented below has to be used in
 addition to this.
 
-\def STARPU_CSR_GET_ROWPTR(void *interface)
+\def STARPU_CSR_GET_ROWPTR(interface)
 \ingroup API_Data_Interfaces
 Return a pointer to the row pointer array of the matrix
 designated by \p interface.
 
-\def STARPU_CSR_GET_ROWPTR_DEV_HANDLE(void *interface)
+\def STARPU_CSR_GET_ROWPTR_DEV_HANDLE(interface)
 \ingroup API_Data_Interfaces
 Return a device handle for the row pointer array of the matrix
 designated by \p interface. The offset documented below has to be used in
 addition to this.
 
-\def STARPU_CSR_GET_OFFSET(void *interface)
+\def STARPU_CSR_GET_OFFSET
 \ingroup API_Data_Interfaces
 Return the offset in the arrays (colind, rowptr, nzval) of the
 matrix designated by \p interface, to be used with the device handles.
 
-\def STARPU_CSR_GET_FIRSTENTRY(void *interface)
+\def STARPU_CSR_GET_FIRSTENTRY(interface)
 \ingroup API_Data_Interfaces
 Return the index at which all arrays (the column indexes, the
 row pointers...) of the \p interface start.
 
-\def STARPU_CSR_GET_ELEMSIZE(void *interface)
+\def STARPU_CSR_GET_ELEMSIZE(interface)
 \ingroup API_Data_Interfaces
 Return the size of the elements registered into the matrix
 designated by \p interface.
@@ -876,12 +876,12 @@ number of values registered in the matrix
 \var starpu_coo_interface::elemsize
 size of the elements of the matrix
 
-\def STARPU_COO_GET_COLUMNS(void *interface)
+\def STARPU_COO_GET_COLUMNS(interface)
 \ingroup API_Data_Interfaces
 Return a pointer to the column array of the matrix designated
 by \p interface.
 
-\def STARPU_COO_GET_COLUMNS_DEV_HANDLE(void *interface)
+\def STARPU_COO_GET_COLUMNS_DEV_HANDLE(interface)
 \ingroup API_Data_Interfaces
 Return a device handle for the column array of the matrix
 designated by \p interface, to be used on OpenCL. The offset documented
@@ -892,7 +892,7 @@ below has to be used in addition to this.
 Return a pointer to the rows array of the matrix designated by
 \p interface.
 
-\def STARPU_COO_GET_ROWS_DEV_HANDLE(void *interface)
+\def STARPU_COO_GET_ROWS_DEV_HANDLE(interface)
 \ingroup API_Data_Interfaces
 Return a device handle for the row array of the matrix
 designated by \p interface, to be used on OpenCL. The offset documented
@@ -903,13 +903,13 @@ below has to be used in addition to this.
 Return a pointer to the values array of the matrix designated
 by \p interface.
 
-\def STARPU_COO_GET_VALUES_DEV_HANDLE(void *interface)
+\def STARPU_COO_GET_VALUES_DEV_HANDLE(interface)
 \ingroup API_Data_Interfaces
 Return a device handle for the value array of the matrix
 designated by \p interface, to be used on OpenCL. The offset documented
 below has to be used in addition to this.
 
-\def STARPU_COO_GET_OFFSET(void *interface)
+\def STARPU_COO_GET_OFFSET
 \ingroup API_Data_Interfaces
 Return the offset in the arrays of the COO matrix designated by
 \p interface.

+ 21 - 1
doc/doxygen/chapters/api/data_management.doxy

@@ -226,6 +226,20 @@ are not disabled. Contrary to starpu_data_acquire(), this function is
 non-blocking and may be called from task callbacks. Upon successful
 completion, this function returns 0.
 
+\fn int starpu_data_acquire_cb_sequential_consistency(starpu_data_handle_t handle, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg, int sequential_consistency)
+\ingroup API_Data_Management
+Equivalent of starpu_data_acquire_cb() with the possibility of enabling or disabling data dependencies.
+When the data specified in \p handle is available in the appropriate access
+mode, the \p callback function is executed. The application may access
+the requested data during the execution of this \p callback. The \p callback
+function must call starpu_data_release() once the application does not
+need to access the piece of data anymore. Note that implicit data
+dependencies are also enforced by starpu_data_acquire_cb_sequential_consistency() in case they
+are not disabled specifically for the given \p handle or by the parameter \p sequential_consistency.
+Similarly to starpu_data_acquire_cb(), this function is
+non-blocking and may be called from task callbacks. Upon successful
+completion, this function returns 0.
+
 \fn int starpu_data_acquire_on_node(starpu_data_handle_t handle, unsigned node, enum starpu_data_access_mode mode)
 \ingroup API_Data_Management
 This is the same as starpu_data_acquire(), except that the data
@@ -237,7 +251,13 @@ This is the same as starpu_data_acquire_cb(), except that the
 data will be available on the given memory node instead of main
 memory.
 
-\def STARPU_DATA_ACQUIRE_CB(starpu_data_handle_t handle, enum starpu_data_access_mode mode, code)
+\fn int starpu_data_acquire_on_node_cb_sequential_consistency(starpu_data_handle_t handle, unsigned node, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg, int sequential_consistency)
+\ingroup API_Data_Management
+This is the same as starpu_data_acquire_cb_sequential_consistency(), except that the
+data will be available on the given memory node instead of main
+memory.
+
+\def STARPU_DATA_ACQUIRE_CB(handle, mode, code)
 \ingroup API_Data_Management
 STARPU_DATA_ACQUIRE_CB() is the same as starpu_data_acquire_cb(),
 except that the code to be executed in a callback is directly provided

+ 9 - 9
doc/doxygen/chapters/api/data_partition.doxy

@@ -71,7 +71,7 @@ This function returns the number of children.
 Return the ith child of the given \p handle, which must have been
 partitioned beforehand.
 
-\fn starpu_data_handle_t starpu_data_get_sub_data (starpu_data_handle_t root_data, unsigned depth, ... )
+\fn starpu_data_handle_t starpu_data_get_sub_data(starpu_data_handle_t root_data, unsigned depth, ... )
 \ingroup API_Data_Partition
 After partitioning a StarPU data by applying a filter,
 starpu_data_get_sub_data() can be used to get handles for each of the
@@ -192,13 +192,13 @@ functions for block data. Examples on how to use them are shown in
 <c>starpu_data_filters.h</c>. A usage example is available in
 examples/filters/shadow3d.c
 
-\fn void starpu_block_filter_block (void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
+\fn void starpu_block_filter_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
 This partitions a block along the X dimension, thus getting
 (x/\p nparts ,y,z) 3D matrices. If \p nparts does not divide x, the last
 submatrix contains the remainder.
 
-\fn void starpu_block_filter_block_shadow (void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
+\fn void starpu_block_filter_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
 This partitions a block along the X dimension, with a
 shadow border <c>filter_arg_ptr</c>, thus getting
@@ -207,13 +207,13 @@ divide x, the last submatrix contains the remainder. <b>IMPORTANT</b>:
 This can only be used for read-only access, as no coherency is
 enforced for the shadowed parts.
 
-\fn void starpu_block_filter_vertical_block (void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
+\fn void starpu_block_filter_vertical_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
 This partitions a block along the Y dimension, thus getting
 (x,y/\p nparts ,z) blocks. If \p nparts does not divide y, the last
 submatrix contains the remainder.
 
-\fn void starpu_block_filter_vertical_block_shadow (void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
+\fn void starpu_block_filter_vertical_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
 This partitions a block along the Y dimension, with a
 shadow border <c>filter_arg_ptr</c>, thus getting
@@ -222,13 +222,13 @@ divide y, the last submatrix contains the remainder. <b>IMPORTANT</b>:
 This can only be used for read-only access, as no coherency is
 enforced for the shadowed parts.
 
-\fn void starpu_block_filter_depth_block (void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
+\fn void starpu_block_filter_depth_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
 This partitions a block along the Z dimension, thus getting
 (x,y,z/\p nparts) blocks. If \p nparts does not divide z, the last
 submatrix contains the remainder.
 
-\fn void starpu_block_filter_depth_block_shadow (void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
+\fn void starpu_block_filter_depth_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
 This partitions a block along the Z dimension, with a
 shadow border <c>filter_arg_ptr</c>, thus getting
@@ -245,11 +245,11 @@ functions for BCSR data. Examples on how to use them are shown in
 \ref PartitioningData. The complete list can be found in the file
 <c>starpu_data_filters.h</c>.
 
-\fn void starpu_bcsr_filter_canonical_block (void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
+\fn void starpu_bcsr_filter_canonical_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
 This partitions a block-sparse matrix into dense matrices.
 
-\fn void starpu_csr_filter_vertical_block (void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
+\fn void starpu_csr_filter_vertical_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
 This partitions a block-sparse matrix into vertical
 block-sparse matrices.

+ 1 - 1
doc/doxygen/chapters/api/explicit_dependencies.doxy

@@ -99,7 +99,7 @@ This function releases the resources associated to tag \p id.
 It can be called once the corresponding task has been executed and
 when there is no other tag that depend on this tag anymore.
 
-\fn void starpu_tag_notify_from_apps (starpu_tag_t id)
+\fn void starpu_tag_notify_from_apps(starpu_tag_t id)
 \ingroup API_Explicit_Dependencies
 This function explicitly unlocks tag \p id. It may be useful in
 the case of applications which execute part of their computation

+ 1 - 1
doc/doxygen/chapters/api/insert_task.doxy

@@ -90,7 +90,7 @@ Pack arguments of type ::STARPU_VALUE into a buffer which can be
 given to a codelet and later unpacked with the function
 starpu_codelet_unpack_args().
 
-\fn void starpu_codelet_unpack_args (void *cl_arg, ...)
+\fn void starpu_codelet_unpack_args(void *cl_arg, ...)
 \ingroup API_Insert_Task
 Retrieve the arguments of type ::STARPU_VALUE associated to a
 task automatically created using the function starpu_insert_task().

+ 7 - 7
doc/doxygen/chapters/api/lower_bound.doxy

@@ -11,36 +11,36 @@
 \brief Compute theoretical upper computation efficiency bound
 corresponding to some actual execution.
 
-\fn void starpu_bound_start (int deps, int prio)
+\fn void starpu_bound_start(int deps, int prio)
 \ingroup API_Theoretical_Lower_Bound_on_Execution_Time
 Start recording tasks (resets stats). \p deps tells whether
 dependencies should be recorded too (this is quite expensive)
 
-\fn void starpu_bound_stop (void)
+\fn void starpu_bound_stop(void)
 \ingroup API_Theoretical_Lower_Bound_on_Execution_Time
 Stop recording tasks
 
-\fn void starpu_bound_print_dot (FILE *output)
+\fn void starpu_bound_print_dot(FILE *output)
 \ingroup API_Theoretical_Lower_Bound_on_Execution_Time
 Print the DAG that was recorded
 
-\fn void starpu_bound_compute (double *res, double *integer_res, int integer)
+\fn void starpu_bound_compute(double *res, double *integer_res, int integer)
 \ingroup API_Theoretical_Lower_Bound_on_Execution_Time
 Get theoretical upper bound (in ms) (needs glpk support
 detected by configure script). It returns 0 if some performance models
 are not calibrated.
 
-\fn void starpu_bound_print_lp (FILE *output)
+\fn void starpu_bound_print_lp(FILE *output)
 \ingroup API_Theoretical_Lower_Bound_on_Execution_Time
 Emit the Linear Programming system on \p output for the recorded
 tasks, in the lp format
 
-\fn void starpu_bound_print_mps (FILE *output)
+\fn void starpu_bound_print_mps(FILE *output)
 \ingroup API_Theoretical_Lower_Bound_on_Execution_Time
 Emit the Linear Programming system on \p output for the recorded
 tasks, in the mps format
 
-\fn void starpu_bound_print (FILE *output, int integer)
+\fn void starpu_bound_print(FILE *output, int integer)
 \ingroup API_Theoretical_Lower_Bound_on_Execution_Time
 Emit statistics of actual execution vs theoretical upper bound.
 \p integer permits to choose between integer solving (which takes a

+ 4 - 0
doc/doxygen/chapters/api/mic_extensions.doxy

@@ -13,6 +13,10 @@
 This macro is defined when StarPU has been installed with MIC support.
 It should be used in your code to detect the availability of MIC.
 
+\typedef starpu_mic_func_symbol_t
+\ingroup API_MIC_Extensions
+Type for MIC function symbols
+
 \fn int starpu_mic_register_kernel(starpu_mic_func_symbol_t *symbol, const char *func_name)
 \ingroup API_MIC_Extensions
 Initiate a lookup on each MIC device to find the address of the

+ 30 - 30
doc/doxygen/chapters/api/mpi.doxy

@@ -11,21 +11,21 @@
 @name Initialisation
 \ingroup API_MPI_Support
 
-\fn int starpu_mpi_init (int *argc, char ***argv, int initialize_mpi)
+\fn int starpu_mpi_init(int *argc, char ***argv, int initialize_mpi)
 \ingroup API_MPI_Support
 Initializes the starpumpi library. \p initialize_mpi indicates if MPI
 should be initialized or not by StarPU. If the value is not 0, MPI
 will be initialized by calling <c>MPI_Init_Thread(argc, argv,
 MPI_THREAD_SERIALIZED, ...)</c>.
 
-\fn int starpu_mpi_initialize (void)
+\fn int starpu_mpi_initialize(void)
 \deprecated
 \ingroup API_MPI_Support
 This function has been made deprecated. One should use instead the
 function starpu_mpi_init(). This function does not call MPI_Init(), it
 should be called beforehand.
 
-\fn int starpu_mpi_initialize_extended (int *rank, int *world_size)
+\fn int starpu_mpi_initialize_extended(int *rank, int *world_size)
 \deprecated
 \ingroup API_MPI_Support
 This function has been made deprecated. One should use instead the
@@ -33,13 +33,13 @@ function starpu_mpi_init(). MPI will be initialized by starpumpi by
 calling <c>MPI_Init_Thread(argc, argv, MPI_THREAD_SERIALIZED,
 ...)</c>.
 
-\fn int starpu_mpi_shutdown (void)
+\fn int starpu_mpi_shutdown(void)
 \ingroup API_MPI_Support
 Cleans the starpumpi library. This must be called between calling
 starpu_mpi functions and starpu_shutdown(). MPI_Finalize() will be
 called if StarPU-MPI has been initialized by starpu_mpi_init().
 
-\fn void starpu_mpi_comm_amounts_retrieve (size_t *comm_amounts)
+\fn void starpu_mpi_comm_amounts_retrieve(size_t *comm_amounts)
 \ingroup API_MPI_Support
 Retrieve the current amount of communications from the current node in
 the array \p comm_amounts which must have a size greater or equal to
@@ -50,33 +50,33 @@ the world size. Communications statistics must be enabled (see
 \anchor MPIPtpCommunication
 \ingroup API_MPI_Support
 
-\fn int starpu_mpi_send (starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm)
+\fn int starpu_mpi_send(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm)
 \ingroup API_MPI_Support
 Performs a standard-mode, blocking send of \p data_handle to the node
 \p dest using the message tag \p mpi_tag within the communicator \p
 comm.
 
-\fn int starpu_mpi_recv (starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, MPI_Status *status)
+\fn int starpu_mpi_recv(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, MPI_Status *status)
 \ingroup API_MPI_Support
 Performs a standard-mode, blocking receive in \p data_handle from the
 node \p source using the message tag \p mpi_tag within the
 communicator \p comm.
 
-\fn int starpu_mpi_isend (starpu_data_handle_t data_handle, starpu_mpi_req *req, int dest, int mpi_tag, MPI_Comm comm)
+\fn int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *req, int dest, int mpi_tag, MPI_Comm comm)
 \ingroup API_MPI_Support
 Posts a standard-mode, non blocking send of \p data_handle to the node
 \p dest using the message tag \p mpi_tag within the communicator \p
 comm. After the call, the pointer to the request \p req can be used to
 test or to wait for the completion of the communication.
 
-\fn int starpu_mpi_irecv (starpu_data_handle_t data_handle, starpu_mpi_req *req, int source, int mpi_tag, MPI_Comm comm)
+\fn int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *req, int source, int mpi_tag, MPI_Comm comm)
 \ingroup API_MPI_Support
 Posts a nonblocking receive in \p data_handle from the node \p source
 using the message tag \p mpi_tag within the communicator \p comm.
 After the call, the pointer to the request \p req can be used to test
 or to wait for the completion of the communication.
 
-\fn int starpu_mpi_isend_detached (starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
+\fn int starpu_mpi_isend_detached(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
 \ingroup API_MPI_Support
 Posts a standard-mode, non blocking send of \p data_handle to the node
 \p dest using the message tag \p mpi_tag within the communicator \p
@@ -87,7 +87,7 @@ communication completes, its resources are automatically released back
 to the system, there is no need to test or to wait for the completion
 of the request.
 
-\fn int starpu_mpi_irecv_detached (starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
+\fn int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
 \ingroup API_MPI_Support
 Posts a nonblocking receive in \p data_handle from the node \p source
 using the message tag \p mpi_tag within the communicator \p comm. On
@@ -98,34 +98,34 @@ communication completes, its resources are automatically released back
 to the system, there is no need to test or to wait for the completion
 of the request.
 
-\fn int starpu_mpi_wait (starpu_mpi_req *req, MPI_Status *status)
+\fn int starpu_mpi_wait(starpu_mpi_req *req, MPI_Status *status)
 \ingroup API_MPI_Support
 Returns when the operation identified by request \p req is complete.
 
-\fn int starpu_mpi_test (starpu_mpi_req *req, int *flag, MPI_Status *status)
+\fn int starpu_mpi_test(starpu_mpi_req *req, int *flag, MPI_Status *status)
 \ingroup API_MPI_Support
 If the operation identified by \p req is complete, set \p flag to 1.
 The \p status object is set to contain information on the completed
 operation.
 
-\fn int starpu_mpi_barrier (MPI_Comm comm)
+\fn int starpu_mpi_barrier(MPI_Comm comm)
 \ingroup API_MPI_Support
 Blocks the caller until all group members of the communicator \p comm
 have called it.
 
-\fn int starpu_mpi_isend_detached_unlock_tag (starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm, starpu_tag_t tag)
+\fn int starpu_mpi_isend_detached_unlock_tag(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm, starpu_tag_t tag)
 \ingroup API_MPI_Support
 Posts a standard-mode, non blocking send of \p data_handle to the node
 \p dest using the message tag \p mpi_tag within the communicator \p
 comm. On completion, \p tag is unlocked.
 
-\fn int starpu_mpi_irecv_detached_unlock_tag (starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, starpu_tag_t tag)
+\fn int starpu_mpi_irecv_detached_unlock_tag(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, starpu_tag_t tag)
 \ingroup API_MPI_Support
 Posts a nonblocking receive in \p data_handle from the node \p source
 using the message tag \p mpi_tag within the communicator \p comm. On
 completion, \p tag is unlocked.
 
-\fn int starpu_mpi_isend_array_detached_unlock_tag (unsigned array_size, starpu_data_handle_t *data_handle, int *dest, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag)
+\fn int starpu_mpi_isend_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *dest, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag)
 \ingroup API_MPI_Support
 Posts \p array_size standard-mode, non blocking send. Each post sends
 the n-th data of the array \p data_handle to the n-th node of the
@@ -133,7 +133,7 @@ array \p dest using the n-th message tag of the array \p mpi_tag
 within the n-th communicator of the array \p comm. On completion of
 all the requests, \p tag is unlocked.
 
-\fn int starpu_mpi_irecv_array_detached_unlock_tag (unsigned array_size, starpu_data_handle_t *data_handle, int *source, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag)
+\fn int starpu_mpi_irecv_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *source, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag)
 \ingroup API_MPI_Support
 Posts \p array_size nonblocking receive. Each post receives in the n-th
 data of the array \p data_handle from the n-th node of the array \p
@@ -144,14 +144,14 @@ requests, \p tag is unlocked.
 @name Communication Cache
 \ingroup API_MPI_Support
 
-\fn void starpu_mpi_cache_flush (MPI_Comm comm, starpu_data_handle_t data_handle)
+\fn void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle)
 \ingroup API_MPI_Support
 Clear the send and receive communication cache for the data
 \p data_handle. The function has to be called synchronously by all the
 MPI nodes. The function does nothing if the cache mechanism is
 disabled (see \ref STARPU_MPI_CACHE).
 
-\fn void starpu_mpi_cache_flush_all_data (MPI_Comm comm)
+\fn void starpu_mpi_cache_flush_all_data(MPI_Comm comm)
 \ingroup API_MPI_Support
 Clear the send and receive communication cache for all data. The
 function has to be called synchronously by all the MPI nodes. The
@@ -162,21 +162,21 @@ function does nothing if the cache mechanism is disabled (see
 \anchor MPIInsertTask
 \ingroup API_MPI_Support
 
-\fn int starpu_data_set_tag (starpu_data_handle_t handle, int tag)
+\fn int starpu_data_set_tag(starpu_data_handle_t handle, int tag)
 \ingroup API_MPI_Support
 Tell StarPU-MPI which MPI tag to use when exchanging the data.
 
-\fn int starpu_data_get_tag (starpu_data_handle_t handle)
+\fn int starpu_data_get_tag(starpu_data_handle_t handle)
 \ingroup API_MPI_Support
 Returns the MPI tag to be used when exchanging the data.
 
-\fn int starpu_data_set_rank (starpu_data_handle_t handle, int rank)
+\fn int starpu_data_set_rank(starpu_data_handle_t handle, int rank)
 \ingroup API_MPI_Support
 Tell StarPU-MPI which MPI node "owns" a given data, that is, the node
 which will always keep an up-to-date value, and will by default
 execute tasks which write to it.
 
-\fn int starpu_data_get_rank (starpu_data_handle_t handle)
+\fn int starpu_data_get_rank(starpu_data_handle_t handle)
 \ingroup API_MPI_Support
 Returns the last value set by starpu_data_set_rank().
 
@@ -192,7 +192,7 @@ this macro is used when calling starpu_mpi_insert_task(), and must be
 followed by a data handle to specify that the node owning the given
 data will execute the codelet.
 
-\fn int starpu_mpi_insert_task (MPI_Comm comm, struct starpu_codelet *codelet, ...)
+\fn int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 \ingroup API_MPI_Support
 Create and submit a task corresponding to codelet with the following
 arguments. The argument list must be zero-terminated.
@@ -230,13 +230,13 @@ The algorithm also includes a communication cache mechanism that
 allows not to send data twice to the same MPI node, unless the data
 has been modified. The cache can be disabled (see \ref STARPU_MPI_CACHE).
 
-\fn void starpu_mpi_get_data_on_node (MPI_Comm comm, starpu_data_handle_t data_handle, int node)
+\fn void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle, int node)
 \ingroup API_MPI_Support
 Transfer data \p data_handle to MPI node \p node, sending it from its
 owner if needed. At least the target node and the owner have to call
 the function.
 
-\fn void starpu_mpi_get_data_on_node_detached (MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg)
+\fn void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg)
 \ingroup API_MPI_Support
 Transfer data \p data_handle to MPI node \p node, sending it from its
 owner if needed. At least the target node and the owner have to call
@@ -247,12 +247,12 @@ the argument \p arg.
 \anchor MPICollectiveOperations
 \ingroup API_MPI_Support
 
-\fn void starpu_mpi_redux_data (MPI_Comm comm, starpu_data_handle_t data_handle)
+\fn void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
 \ingroup API_MPI_Support
 Perform a reduction on the given data. All nodes send the data to its
 owner node which will perform a reduction.
 
-\fn int starpu_mpi_scatter_detached (starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg)
+\fn int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg)
 \ingroup API_MPI_Support
 Scatter data among processes of the communicator based on the
 ownership of the data. For each data of the array \p data_handles, the
@@ -263,7 +263,7 @@ called with the argument \p sarg on the process \p root, the \p
 rcallback function is called with the argument \p rarg on any other
 process.
 
-\fn int starpu_mpi_gather_detached (starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg)
+\fn int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg)
 \ingroup API_MPI_Support
 Gather data from the different processes of the communicator onto the
 process \p root. Each process owning data handle in the array

+ 4 - 4
doc/doxygen/chapters/api/multiformat_data_interface.doxy

@@ -52,19 +52,19 @@ when working on a CPU, and a structure of arrays when working on a
 GPU. \p nobjects is the number of elements in the data. \p format_ops
 describes the format.
 
-\def STARPU_MULTIFORMAT_GET_CPU_PTR(void *interface)
+\def STARPU_MULTIFORMAT_GET_CPU_PTR(interface)
 \ingroup API_Multiformat_Data_Interface
 returns the local pointer to the data with CPU format.
 
-\def STARPU_MULTIFORMAT_GET_CUDA_PTR(void *interface)
+\def STARPU_MULTIFORMAT_GET_CUDA_PTR(interface)
 \ingroup API_Multiformat_Data_Interface
 returns the local pointer to the data with CUDA format.
 
-\def STARPU_MULTIFORMAT_GET_OPENCL_PTR(void *interface)
+\def STARPU_MULTIFORMAT_GET_OPENCL_PTR(interface)
 \ingroup API_Multiformat_Data_Interface
 returns the local pointer to the data with OpenCL format.
 
-\def STARPU_MULTIFORMAT_GET_NX (void *interface)
+\def STARPU_MULTIFORMAT_GET_NX(interface)
 \ingroup API_Multiformat_Data_Interface
 returns the number of elements in the data.
 

+ 8 - 8
doc/doxygen/chapters/api/opencl_extensions.doxy

@@ -82,11 +82,11 @@ starpu_opencl_program array by hand for more advanced use (e.g.
 different programs on the different OpenCL devices, for relocation
 purpose for instance).
 
-\fn int starpu_opencl_load_opencl_from_file(const char *source_file_name, struct starpu_opencl_program *opencl_programs, const char* build_options)
+\fn int starpu_opencl_load_opencl_from_file(const char *source_file_name, struct starpu_opencl_program *opencl_programs, const char *build_options)
 \ingroup API_OpenCL_Extensions
 This function compiles an OpenCL source code stored in a file.
 
-\fn int starpu_opencl_load_opencl_from_string(const char *opencl_program_source, struct starpu_opencl_program *opencl_programs, const char* build_options)
+\fn int starpu_opencl_load_opencl_from_string(const char *opencl_program_source, struct starpu_opencl_program *opencl_programs, const char *build_options)
 \ingroup API_OpenCL_Extensions
 This function compiles an OpenCL source code stored in a string.
 
@@ -107,7 +107,7 @@ has been located on the system, \p located_dir_name the directory
 where it has been located. Otherwise, they are both set to the empty
 string.
 
-\fn int starpu_opencl_compile_opencl_from_file(const char *source_file_name, const char * build_options)
+\fn int starpu_opencl_compile_opencl_from_file(const char *source_file_name, const char *build_options)
 \ingroup API_OpenCL_Extensions
 Compile the OpenCL kernel stored in the file \p source_file_name
 with the given options \p build_options and stores the result in the
@@ -116,7 +116,7 @@ directory <c>$STARPU_HOME/.starpu/opencl</c> with the same filename as
 and the filename is suffixed with the vendor id and the device id of
 the OpenCL device.
 
-\fn int starpu_opencl_compile_opencl_from_string(const char *opencl_program_source, const char *file_name, const char*build_options)
+\fn int starpu_opencl_compile_opencl_from_string(const char *opencl_program_source, const char *file_name, const char *build_options)
 \ingroup API_OpenCL_Extensions
 Compile the OpenCL kernel in the string \p opencl_program_source
 with the given options \p build_options and stores the result in the
@@ -158,7 +158,7 @@ consumed power).
 @name OpenCL utilities
 \ingroup API_OpenCL_Extensions
 
-\fn const char * starpu_opencl_error_string(cl_int status)
+\fn const char *starpu_opencl_error_string(cl_int status)
 \ingroup API_OpenCL_Extensions
 Return the error message in English corresponding to \p status, an OpenCL
 error code.
@@ -169,7 +169,7 @@ Given a valid error status, prints the corresponding error message on
 stdout, along with the given function name \p func, the given filename
 \p file, the given line number \p line and the given message \p msg.
 
-\def STARPU_OPENCL_DISPLAY_ERROR(cl_int status)
+\def STARPU_OPENCL_DISPLAY_ERROR(status)
 \ingroup API_OpenCL_Extensions
 Call the function starpu_opencl_display_error() with the given error
 \p status, the current function name, current file and line number,
@@ -179,13 +179,13 @@ and a empty message.
 \ingroup API_OpenCL_Extensions
 Call the function starpu_opencl_display_error() and abort.
 
-\def STARPU_OPENCL_REPORT_ERROR (cl_int status)
+\def STARPU_OPENCL_REPORT_ERROR(status)
 \ingroup API_OpenCL_Extensions
 Call the function starpu_opencl_report_error() with the given error \p
 status, with the current function name, current file and line number,
 and an empty message.
 
-\def STARPU_OPENCL_REPORT_ERROR_WITH_MSG(const char *msg, cl_int status)
+\def STARPU_OPENCL_REPORT_ERROR_WITH_MSG(msg, status)
 \ingroup API_OpenCL_Extensions
 Call the function starpu_opencl_report_error() with the given \p msg
 and the given error \p status, with the current function name, current

+ 1 - 1
doc/doxygen/chapters/api/parallel_tasks.doxy

@@ -42,7 +42,7 @@ Get the description of a combined worker
 Variant of starpu_worker_can_execute_task() compatible with combined
 workers
 
-\fn void starpu_parallel_task_barrier_init(struct starpu_task*task, int workerid)
+\fn void starpu_parallel_task_barrier_init(struct starpu_task *task, int workerid)
 \ingroup API_Parallel_Tasks
 Initialise the barrier for the parallel task, and dispatch the task
 between the different combined workers.

+ 5 - 5
doc/doxygen/chapters/api/performance_model.doxy

@@ -175,7 +175,7 @@ Used by ::STARPU_HISTORY_BASED and ::STARPU_NL_REGRESSION_BASED,
 records all execution history measures.
 \var starpu_perfmodel_per_arch::regression
 \private
-Used by ::STARPU_HISTORY_BASED and
+Used by ::STARPU_REGRESSION_BASED and
 ::STARPU_NL_REGRESSION_BASED, contains the estimated factors of the
 regression.
 
@@ -195,13 +195,13 @@ mean_n = 1/n sum
 \var starpu_perfmodel_history_entry::deviation
 n dev_n = sum2 - 1/n (sum)^2
 \var starpu_perfmodel_history_entry::sum
-num of samples
+sum of samples (in µs)
 \var starpu_perfmodel_history_entry::sum2
 sum of samples^2
 \var starpu_perfmodel_history_entry::nsample
-todo
+number of samples
 \var starpu_perfmodel_history_entry::footprint
-todo
+data footprint
 \var starpu_perfmodel_history_entry::size
 in bytes
 \var starpu_perfmodel_history_entry::flops
@@ -254,7 +254,7 @@ prints the affinity devices on \p f.
 \fn void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned cpuid, unsigned nimpl, double measured);
 \ingroup API_Performance_Model
 This feeds the performance model model with an explicit
-measurement measured, in addition to measurements done by StarPU
+measurement measured (in µs), in addition to measurements done by StarPU
 itself. This can be useful when the application already has an
 existing set of measurements done in good conditions, that StarPU
 could benefit from instead of doing on-line measurements. And example

+ 4 - 0
doc/doxygen/chapters/api/scc_extensions.doxy

@@ -13,6 +13,10 @@
 This macro is defined when StarPU has been installed with SCC support.
 It should be used in your code to detect the availability of SCC.
 
+\typedef starpu_scc_func_symbol_t
+\ingroup API_SCC_Extensions
+Type for SCC function symbols
+
 \fn int starpu_scc_register_kernel(starpu_scc_func_symbol_t *symbol, const char *func_name)
 \ingroup API_SCC_Extensions
 Initiate a lookup on each SCC device to find the address of the

+ 7 - 7
doc/doxygen/chapters/api/scheduling_context_hypervisor.doxy

@@ -114,7 +114,7 @@ performance counters to StarPU. By incrementing them, StarPU can help
 the hypervisor in the resizing decision making process. TODO maybe
 they should be hidden to the user
 
-\fn struct starpu_sched_ctx_performance_counters *sc_hypervisor_init(struct sc_hypervisor_policy * policy)
+\fn struct starpu_sched_ctx_performance_counters *sc_hypervisor_init(struct sc_hypervisor_policy *policy)
 \ingroup API_Scheduling_Context_Hypervisor
 Initializes the hypervisor to use the strategy provided as parameter
 and creates the performance counters (see starpu_sched_ctx_performance_counters).
@@ -148,7 +148,7 @@ flops the context will execute (needed for Gflops rate based strategy
 see \ref ResizingStrategies or any other custom strategy needing it, for
 the others we can pass 0.0)
 
-\fn void sc_hypervisor_unregister_ctx (unsigned sched_ctx)
+\fn void sc_hypervisor_unregister_ctx(unsigned sched_ctx)
 \ingroup API_Scheduling_Context_Hypervisor
 Unregister the context from the hypervisor.
 
@@ -268,11 +268,11 @@ struct sc_hypervisor_policy dummy_policy =
 \ingroup API_Scheduling_Context_Hypervisor
     Moves workers from one context to another
 
-\fn struct sc_hypervisor_policy_config * sc_hypervisor_get_config(unsigned sched_ctx);
+\fn struct sc_hypervisor_policy_config *sc_hypervisor_get_config(unsigned sched_ctx);
 \ingroup API_Scheduling_Context_Hypervisor
     Returns the configuration structure of a context
 
-\fn int * sc_hypervisor_get_sched_ctxs();
+\fn int *sc_hypervisor_get_sched_ctxs();
 \ingroup API_Scheduling_Context_Hypervisor
     Gets the contexts managed by the hypervisor
 
@@ -280,15 +280,15 @@ struct sc_hypervisor_policy dummy_policy =
 \ingroup API_Scheduling_Context_Hypervisor
     Gets the number of contexts managed by the hypervisor
 
-\fn struct sc_hypervisor_wrapper * sc_hypervisor_get_wrapper(unsigned sched_ctx);
+\fn struct sc_hypervisor_wrapper *sc_hypervisor_get_wrapper(unsigned sched_ctx);
 \ingroup API_Scheduling_Context_Hypervisor
     Returns the wrapper corresponding the context \p sched_ctx
 
-\fn double sc_hypervisor_get_elapsed_flops_per_sched_ctx(struct sc_hypervisor_wrapper * sc_w);
+\fn double sc_hypervisor_get_elapsed_flops_per_sched_ctx(struct sc_hypervisor_wrapper *sc_w);
 \ingroup API_Scheduling_Context_Hypervisor
     Returns the flops of a context elapsed from the last resize
 
-\fn char * sc_hypervisor_get_policy();
+\fn char *sc_hypervisor_get_policy();
 \ingroup API_Scheduling_Context_Hypervisor
     Returns the name of the resizing policy the hypervisor uses
 

+ 2 - 2
doc/doxygen/chapters/api/scheduling_contexts.doxy

@@ -212,7 +212,7 @@ policy of the given scheduler context.
 @name Scheduling Context Worker Collection
 \ingroup API_Scheduling_Contexts
 
-\fn struct starpu_worker_collection* starpu_sched_ctx_create_worker_collection(unsigned sched_ctx_id, enum starpu_worker_collection_type type)
+\fn struct starpu_worker_collection *starpu_sched_ctx_create_worker_collection(unsigned sched_ctx_id, enum starpu_worker_collection_type type)
 \ingroup API_Scheduling_Contexts
 Create a worker collection of the type indicated by the last parameter
 for the context specified through the first parameter.
@@ -221,7 +221,7 @@ for the context specified through the first parameter.
 \ingroup API_Scheduling_Contexts
 Delete the worker collection of the specified scheduling context
 
-\fn struct starpu_worker_collection* starpu_sched_ctx_get_worker_collection(unsigned sched_ctx_id)
+\fn struct starpu_worker_collection *starpu_sched_ctx_get_worker_collection(unsigned sched_ctx_id)
 \ingroup API_Scheduling_Contexts
 Return the worker collection managed by the indicated context
 

+ 4 - 4
doc/doxygen/chapters/api/scheduling_policy.doxy

@@ -58,7 +58,7 @@ starpu_init().
 \var starpu_sched_policy::policy_description
         Optional field. Human readable description of the policy.
 
-\fn struct starpu_sched_policy ** starpu_sched_get_predefined_policies()
+\fn struct starpu_sched_policy **starpu_sched_get_predefined_policies()
 \ingroup API_Scheduling_Policy
 Return a NULL-terminated array of all the predefined scheduling
 policies.
@@ -73,13 +73,13 @@ condition variable. For instance, in the case of a scheduling strategy
 with a single task queue, the same condition variable would be used to
 block and wake up all workers.
 
-\fn void starpu_sched_ctx_set_policy_data(unsigned sched_ctx_id, void * policy_data)
+\fn void starpu_sched_ctx_set_policy_data(unsigned sched_ctx_id, void *policy_data)
 \ingroup API_Scheduling_Policy
 Each scheduling policy uses some specific data (queues, variables,
 additional condition variables). It is memorized through a local
 structure. This function assigns it to a scheduling context.
 
-\fn void* starpu_sched_ctx_get_policy_data(unsigned sched_ctx_id)
+\fn void *starpu_sched_ctx_get_policy_data(unsigned sched_ctx_id)
 \ingroup API_Scheduling_Policy
 Returns the policy data previously assigned to a context
 
@@ -135,7 +135,7 @@ otherwise the task may fail to execute.
 \ingroup API_Scheduling_Policy
 Return the current date in micro-seconds.
 
-\fn uint32_t starpu_task_footprint(struct starpu_perfmodel *model, struct starpu_task * task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+\fn uint32_t starpu_task_footprint(struct starpu_perfmodel *model, struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
 \ingroup API_Scheduling_Policy
 Returns the footprint for a given task
 

+ 7 - 7
doc/doxygen/chapters/api/task_bundles.doxy

@@ -15,12 +15,12 @@ on the same worker whenever it’s possible. It must be considered as a
 hint given to the scheduler as there is no guarantee that they will be
 executed on the same worker.
 
-\fn void starpu_task_bundle_create (starpu_task_bundle_t *bundle)
+\fn void starpu_task_bundle_create(starpu_task_bundle_t *bundle)
 \ingroup API_Task_Bundles
 Factory function creating and initializing \p bundle, when the call
 returns, memory needed is allocated and \p bundle is ready to use.
 
-\fn int starpu_task_bundle_insert (starpu_task_bundle_t bundle, struct starpu_task *task)
+\fn int starpu_task_bundle_insert(starpu_task_bundle_t bundle, struct starpu_task *task)
 \ingroup API_Task_Bundles
 Insert \p task in \p bundle. Until \p task is removed from \p bundle
 its expected length and data transfer time will be considered along
@@ -30,7 +30,7 @@ On success, it returns 0. There are two cases of error : if \p bundle
 is already closed it returns <c>-EPERM</c>, if \p task was already
 submitted it returns <c>-EINVAL</c>.
 
-\fn int starpu_task_bundle_remove (starpu_task_bundle_t bundle, struct starpu_task *task)
+\fn int starpu_task_bundle_remove(starpu_task_bundle_t bundle, struct starpu_task *task)
 \ingroup API_Task_Bundles
 Remove \p task from \p bundle. Of course \p task must have been
 previously inserted in \p bundle. This function must not be called if
@@ -38,21 +38,21 @@ previously inserted in \p bundle. This function must not be called if
 so would result in undefined behaviour. On success, it returns 0. If
 \p bundle is already closed it returns <c>-ENOENT</c>.
 
-\fn void starpu_task_bundle_close (starpu_task_bundle_t bundle)
+\fn void starpu_task_bundle_close(starpu_task_bundle_t bundle)
 \ingroup API_Task_Bundles
 Inform the runtime that the user will not modify \p bundle anymore, it
 means no more inserting or removing task. Thus the runtime can destroy
 it when possible.
 
-\fn double starpu_task_bundle_expected_length (starpu_task_bundle_t bundle, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+\fn double starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, enum starpu_perfmodel_archtype arch, unsigned nimpl)
 \ingroup API_Task_Bundles
 Return the expected duration of \p bundle in micro-seconds.
 
-\fn double starpu_task_bundle_expected_power (starpu_task_bundle_t bundle, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+\fn double starpu_task_bundle_expected_power(starpu_task_bundle_t bundle, enum starpu_perfmodel_archtype arch, unsigned nimpl)
 \ingroup API_Task_Bundles
 Return the expected power consumption of \p bundle in J.
 
-\fn double starpu_task_bundle_expected_data_transfer_time (starpu_task_bundle_t bundle, unsigned memory_node)
+\fn double starpu_task_bundle_expected_data_transfer_time(starpu_task_bundle_t bundle, unsigned memory_node)
 \ingroup API_Task_Bundles
 Return the time (in micro-seconds) expected to transfer all data used within \p bundle.
 

+ 7 - 7
doc/doxygen/chapters/api/task_lists.doxy

@@ -28,11 +28,11 @@ Push \p task at the front of \p list
 \ingroup API_Task_Lists
 Push \p task at the back of \p list
 
-\fn struct starpu_task * starpu_task_list_front(struct starpu_task_list *list)
+\fn struct starpu_task *starpu_task_list_front(struct starpu_task_list *list)
 \ingroup API_Task_Lists
 Get the front of \p list (without removing it)
 
-\fn struct starpu_task * starpu_task_list_back(struct starpu_task_list *list)
+\fn struct starpu_task *starpu_task_list_back(struct starpu_task_list *list)
 \ingroup API_Task_Lists
 Get the back of \p list (without removing it)
 
@@ -44,23 +44,23 @@ Test if \p list is empty
 \ingroup API_Task_Lists
 Remove \p task from \p list
 
-\fn struct starpu_task * starpu_task_list_pop_front(struct starpu_task_list *list)
+\fn struct starpu_task *starpu_task_list_pop_front(struct starpu_task_list *list)
 \ingroup API_Task_Lists
 Remove the element at the front of \p list
 
-\fn struct starpu_task * starpu_task_list_pop_back(struct starpu_task_list *list)
+\fn struct starpu_task *starpu_task_list_pop_back(struct starpu_task_list *list)
 \ingroup API_Task_Lists
 Remove the element at the back of \p list
 
-\fn struct starpu_task * starpu_task_list_begin(struct starpu_task_list *list)
+\fn struct starpu_task *starpu_task_list_begin(struct starpu_task_list *list)
 \ingroup API_Task_Lists
 Get the first task of \p list.
 
-\fn struct starpu_task * starpu_task_list_end(struct starpu_task_list *list)
+\fn struct starpu_task *starpu_task_list_end(struct starpu_task_list *list)
 \ingroup API_Task_Lists
 Get the end of \p list.
 
-\fn struct starpu_task * starpu_task_list_next(struct starpu_task *task)
+\fn struct starpu_task *starpu_task_list_next(struct starpu_task *task)
 \ingroup API_Task_Lists
 Get the next task of \p list. This is not erase-safe.
 

+ 44 - 44
doc/doxygen/chapters/api/top.doxy

@@ -9,62 +9,62 @@
 /*! \defgroup API_StarPUTop_Interface StarPU-Top Interface
 
 \enum starpu_top_data_type
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 StarPU-Top Data type
 \var starpu_top_data_type::STARPU_TOP_DATA_BOOLEAN
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 \var starpu_top_data_type::STARPU_TOP_DATA_INTEGER
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 \var starpu_top_data_type::STARPU_TOP_DATA_FLOAT
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 
 \enum starpu_top_param_type
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 StarPU-Top Parameter type
 \var starpu_top_param_type::STARPU_TOP_PARAM_BOOLEAN
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 \var starpu_top_param_type::STARPU_TOP_PARAM_INTEGER
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 \var starpu_top_param_type::STARPU_TOP_PARAM_FLOAT
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 \var starpu_top_param_type::STARPU_TOP_PARAM_ENUM
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 
 \enum starpu_top_message_type
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 StarPU-Top Message type
 \var starpu_top_message_type::TOP_TYPE_GO
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 \var starpu_top_message_type::TOP_TYPE_SET
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 \var starpu_top_message_type::TOP_TYPE_CONTINUE
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 \var starpu_top_message_type::TOP_TYPE_ENABLE
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 \var starpu_top_message_type::TOP_TYPE_DISABLE
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 \var starpu_top_message_type::TOP_TYPE_DEBUG
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 \var starpu_top_message_type::TOP_TYPE_UNKNOW
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 
 \struct starpu_top_data
 todo
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 \var starpu_top_data::id
 todo
 \var starpu_top_data::name
@@ -86,7 +86,7 @@ todo
 
 \struct starpu_top_param
 todo
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 \var starpu_top_param::id
 todo
 \var starpu_top_param::name
@@ -113,98 +113,98 @@ todo
 todo
 
 @name Functions to call before the initialisation
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 
-\fn struct starpu_top_data *starpu_top_add_data_boolean(const char* data_name, int active)
-\ingroup API_StarPU-Top_Interface
+\fn struct starpu_top_data *starpu_top_add_data_boolean(const char *data_name, int active)
+\ingroup API_StarPUTop_Interface
 This function registers a data named \p data_name of type boolean.
 If \p active=0, the value will NOT be displayed to the user by default.
 Any other value will make the value displayed by default.
 
-\fn struct starpu_top_data * starpu_top_add_data_integer(const char* data_name, int minimum_value, int maximum_value, int active)
-\ingroup API_StarPU-Top_Interface
+\fn struct starpu_top_data *starpu_top_add_data_integer(const char *data_name, int minimum_value, int maximum_value, int active)
+\ingroup API_StarPUTop_Interface
 This function registers a data named \p data_name of type integer. The
 minimum and maximum values will be useful to define the scale in the UI.
 If \p active=0, the value will NOT be displayed to the user by default.
 Any other value will make the value displayed by default.
 
-\fn struct starpu_top_data* starpu_top_add_data_float(const char* data_name, double minimum_value, double maximum_value, int active)
-\ingroup API_StarPU-Top_Interface
+\fn struct starpu_top_data *starpu_top_add_data_float(const char *data_name, double minimum_value, double maximum_value, int active)
+\ingroup API_StarPUTop_Interface
 This function registers a data named \p data_name of type float. The
 minimum and maximum values will be useful to define the scale in the UI.
 If \p active=0, the value will NOT be displayed to the user by default.
 Any other value will make the value displayed by default.
 
-\fn struct starpu_top_param* starpu_top_register_parameter_boolean(const char* param_name, int* parameter_field, void (*callback)(struct starpu_top_param*))
-\ingroup API_StarPU-Top_Interface
+\fn struct starpu_top_param *starpu_top_register_parameter_boolean(const char *param_name, int *parameter_field, void (*callback)(struct starpu_top_param*))
+\ingroup API_StarPUTop_Interface
 This function registers a parameter named \p param_name, of type
 boolean. The \p callback function will be called when the parameter is
 modified by the UI, and can be null.
 
-\fn struct starpu_top_param* starpu_top_register_parameter_float(const char* param_name, double* parameter_field, double minimum_value, double maximum_value, void (*callback)(struct starpu_top_param*))
-\ingroup API_StarPU-Top_Interface
+\fn struct starpu_top_param *starpu_top_register_parameter_float(const char *param_name, double *parameter_field, double minimum_value, double maximum_value, void (*callback)(struct starpu_top_param*))
+\ingroup API_StarPUTop_Interface
 This function registers a parameter named \p param_name, of type
 float. Minimum and maximum values will be used to prevent the user from
 setting an incorrect value. The \p callback function will be called when
 the parameter is modified by the UI, and can be null.
 
-\fn struct starpu_top_param* starpu_top_register_parameter_integer(const char* param_name, int* parameter_field, int minimum_value, int maximum_value, void (*callback)(struct starpu_top_param*))
-\ingroup API_StarPU-Top_Interface
+\fn struct starpu_top_param *starpu_top_register_parameter_integer(const char *param_name, int *parameter_field, int minimum_value, int maximum_value, void (*callback)(struct starpu_top_param*))
+\ingroup API_StarPUTop_Interface
 This function registers a parameter named \p param_name, of type integer.
 Minimum and maximum values will be used to prevent the user from
 setting an incorrect value. The \p callback function will be called when
 the parameter is modified by the UI, and can be null.
 
-\fn struct starpu_top_param* starpu_top_register_parameter_enum(const char* param_name, int* parameter_field, char** values, int nb_values, void (*callback)(struct starpu_top_param*))
-\ingroup API_StarPU-Top_Interface
+\fn struct starpu_top_param *starpu_top_register_parameter_enum(const char *param_name, int *parameter_field, char **values, int nb_values, void (*callback)(struct starpu_top_param*))
+\ingroup API_StarPUTop_Interface
 This function registers a parameter named \p param_name, of type enum.
 Minimum and maximum values will be used to prevent the user from
 setting an incorrect value. The \p callback function will be called when
 the parameter is modified by the UI, and can be null.
 
 @name Initialisation
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 
 \fn void starpu_top_init_and_wait(const char *server_name)
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 This function must be called when all parameters and data have been
 registered AND initialised (for parameters). This function will wait
 for a TOP to connect, send initialisation sentences, and wait for the
 GO message.
 
 @name To call after initialisation
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 
 \fn void starpu_top_update_parameter(const struct starpu_top_param *param)
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 This function should be called after every modification of a parameter
 from something other than starpu_top. This function notifies the UI that
 the configuration has changed.
 
 \fn void starpu_top_update_data_boolean(const struct starpu_top_data *data, int value)
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 This function updates the value of the starpu_top_data on UI.
 
 \fn void starpu_top_update_data_integer(const struct starpu_top_data *data, int value)
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 This function updates the value of the starpu_top_data on UI.
 
 \fn void starpu_top_update_data_float(const struct starpu_top_data *data, double value)
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 This function updates the value of the starpu_top_data on UI.
 
 \fn void starpu_top_task_prevision(struct starpu_task *task, int devid, unsigned long long start, unsigned long long end)
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 This function notifies the UI that the task has been planned to run from \p start to \p end, on the given computation core.
 
 \fn void starpu_top_debug_log(const char *message)
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 This function is useful in debug mode. The StarPU developer doesn't
 need to check if the debug mode is active. This is checked by
 starpu_top itself. It just sends a message to be displayed by the UI.
 
 \fn void starpu_top_debug_lock(const char *message)
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 This function is useful in debug mode. The StarPU developer doesn't
 need to check if the debug mode is active. This is checked by
 starpu_top itself. It sends a message and waits for a continue message

+ 1 - 1
doc/doxygen/chapters/api/workers.doxy

@@ -99,7 +99,7 @@ The returned value should be at most \ref STARPU_MAXSCCDEVS.
 This function returns the number of OpenCL devices controlled by
 StarPU. The returned value should be at most \ref STARPU_MAXOPENCLDEVS.
 
-\fn int starpu_worker_get_id (void)
+\fn int starpu_worker_get_id(void)
 \ingroup API_Workers_Properties
 This function returns the identifier of the current worker, i.e
 the one associated to the calling thread. The returned value is either

+ 30 - 133
doc/doxygen/chapters/basic_examples.doxy

@@ -12,7 +12,7 @@
 
 This section shows how to implement a simple program that submits a task
 to StarPU using the StarPU C extension (\ref cExtensions). The complete example, and additional examples,
-is available in the <c>gcc-plugin/examples</c> directory of the StarPU
+is available in the directory <c>gcc-plugin/examples</c> of the StarPU
 distribution. A similar example showing how to directly use the StarPU's API is shown
 in \ref HelloWorldUsingStarPUAPI.
 
@@ -24,7 +24,7 @@ has a single implementation for CPU:
 
 \snippet hello_pragma.c To be included
 
-The code can then be compiled and linked with GCC and the <c>-fplugin</c> flag:
+The code can then be compiled and linked with GCC and the flag <c>-fplugin</c>:
 
 \verbatim
 $ gcc `pkg-config starpu-1.2 --cflags` hello-starpu.c \
@@ -92,9 +92,9 @@ compiler implicitly do it as examplified above.
 The field starpu_codelet::nbuffers specifies the number of data buffers that are
 manipulated by the codelet: here the codelet does not access or modify any data
 that is controlled by our data management library. Note that the argument
-passed to the codelet (the field starpu_task::cl_arg) does not count
-as a buffer since it is not managed by our data management library,
-but just contain trivial parameters.
+passed to the codelet (the parameter <c>cl_arg</c> of the function
+<c>cpu_func</c>) does not count as a buffer since it is not managed by
+our data management library, but just contains trivial parameters.
 
 \internal
 TODO need a crossref to the proper description of "where" see bla for more ...
@@ -168,7 +168,7 @@ int main(int argc, char **argv)
 \endcode
 
 Before submitting any tasks to StarPU, starpu_init() must be called. The
-<c>NULL</c> argument specifies that we use default configuration. Tasks cannot
+<c>NULL</c> argument specifies that we use the default configuration. Tasks cannot
 be submitted after the termination of StarPU by a call to
 starpu_shutdown().
 
@@ -194,12 +194,13 @@ computational kernel that multiplies its input vector by a constant,
 the constant could be specified by the means of this buffer, instead
 of registering it as a StarPU data. It must however be noted that
 StarPU avoids making copy whenever possible and rather passes the
-pointer as such, so the buffer which is pointed at must kept allocated
+pointer as such, so the buffer which is pointed at must be kept allocated
 until the task terminates, and if several tasks are submitted with
 various parameters, each of them must be given a pointer to their
-buffer.	
+own buffer.
 
-Once a task has been executed, an optional callback function is be called.
+Once a task has been executed, an optional callback function
+starpu_task::callback_func is called when defined.
 While the computational kernel could be offloaded on various architectures, the
 callback function is always executed on a CPU. The pointer
 starpu_task::callback_arg is passed as an argument of the callback
@@ -211,7 +212,7 @@ void (*callback_function)(void *);
 
 If the field starpu_task::synchronous is non-zero, task submission
 will be synchronous: the function starpu_task_submit() will not return
-until the task was executed. Note that the function starpu_shutdown()
+until the task has been executed. Note that the function starpu_shutdown()
 does not guarantee that asynchronous tasks have been executed before
 it returns, starpu_task_wait_for_all() can be used to that effect, or
 data can be unregistered (starpu_data_unregister()), which will
@@ -237,12 +238,12 @@ we show how StarPU tasks can manipulate data.
 
 We will first show how to use the C language extensions provided by
 the GCC plug-in (\ref cExtensions). The complete example, and
-additional examples, is available in the <c>gcc-plugin/examples</c>
-directory of the StarPU distribution. These extensions map directly
+additional examples, is available in the directory <c>gcc-plugin/examples</c>
+of the StarPU distribution. These extensions map directly
 to StarPU's main concepts: tasks, task implementations for CPU,
 OpenCL, or CUDA, and registered data buffers. The standard C version
-that uses StarPU's standard C programming interface is given in the
-next section (\ref VectorScalingUsingStarPUAPI).
+that uses StarPU's standard C programming interface is given in \ref
+VectorScalingUsingStarPUAPI.
 
 First of all, the vector-scaling task and its simple CPU implementation
 has to be defined:
@@ -268,7 +269,7 @@ implemented:
 
 \snippet hello_pragma2.c To be included
 
-The <c>main</c> function above does several things:
+The function <c>main</c> above does several things:
 
 <ul>
 <li>
@@ -287,22 +288,20 @@ StarPU to transfer that memory region between GPUs and the main memory.
 Removing this <c>pragma</c> is an error.
 </li>
 <li>
-It invokes the <c>vector_scal</c> task.  The invocation looks the same
+It invokes the task <c>vector_scal</c>.  The invocation looks the same
 as a standard C function call.  However, it is an asynchronous
 invocation, meaning that the actual call is performed in parallel with
 the caller's continuation.
 </li>
 <li>
-It waits for the termination of the <c>vector_scal</c>
-asynchronous call.
+It waits for the termination of the asynchronous call <c>vector_scal</c>.
 </li>
 <li>
 Finally, StarPU is shut down.
 </li>
 </ul>
 
-The program can be compiled and linked with GCC and the <c>-fplugin</c>
-flag:
+The program can be compiled and linked with GCC and the flag <c>-fplugin</c>:
 
 \verbatim
 $ gcc `pkg-config starpu-1.2 --cflags` vector_scal.c \
@@ -317,7 +316,7 @@ And voilà!
 Now, this is all fine and great, but you certainly want to take
 advantage of these newfangled GPUs that your lab just bought, don't you?
 
-So, let's add an OpenCL implementation of the <c>vector_scal</c> task.
+So, let's add an OpenCL implementation of the task <c>vector_scal</c>.
 We assume that the OpenCL kernel is available in a file,
 <c>vector_scal_opencl_kernel.cl</c>, not shown here.  The OpenCL task
 implementation is similar to that used with the standard C API
@@ -374,14 +373,14 @@ vector_scal_opencl (unsigned size, float vector[size], float factor)
 \endcode
 
 The OpenCL kernel itself must be loaded from <c>main</c>, sometime after
-the <c>initialize</c> pragma:
+the pragma <c>initialize</c>:
 
 \code{.c}
 starpu_opencl_load_opencl_from_file ("vector_scal_opencl_kernel.cl",
                                        &cl_programs, "");
 \endcode
 
-And that's it.  The <c>vector_scal</c> task now has an additional
+And that's it.  The task <c>vector_scal</c> now has an additional
 implementation, for OpenCL, which StarPU's scheduler may choose to use
 at run-time.  Unfortunately, the <c>vector_scal_opencl</c> above still
 has to go through the common OpenCL boilerplate; in the future,
@@ -404,40 +403,13 @@ The actual implementation of the CUDA task goes into a separate
 compilation unit, in a <c>.cu</c> file.  It is very close to the
 implementation when using StarPU's standard C API (\ref DefinitionOfTheCUDAKernel).
 
-\code{.c}
-/* CUDA implementation of the `vector_scal' task, to be compiled with `nvcc'. */
-
-#include <starpu.h>
-#include <stdlib.h>
-
-static __global__ void
-vector_mult_cuda (unsigned n, float *val, float factor)
-{
-  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if (i < n)
-    val[i] *= factor;
-}
-
-/* Definition of the task implementation declared in the C file. */
-extern "C" void
-vector_scal_cuda (size_t size, float vector[], float factor)
-{
-  unsigned threads_per_block = 64;
-  unsigned nblocks = (size + threads_per_block - 1) / threads_per_block;
-
-  vector_mult_cuda <<< nblocks, threads_per_block, 0,
-    starpu_cuda_get_local_stream () >>> (size, vector, factor);
+\snippet scal_pragma.cu To be included
 
-  cudaStreamSynchronize (starpu_cuda_get_local_stream ());
-}
-\endcode
-
-The complete source code, in the <c>gcc-plugin/examples/vector_scal</c>
-directory of the StarPU distribution, also shows how an SSE-specialized
+The complete source code, in the directory <c>gcc-plugin/examples/vector_scal</c>
+of the StarPU distribution, also shows how an SSE-specialized
 CPU task implementation can be added.
 
-For more details on the C extensions provided by StarPU's GCC plug-in,
+For more details on the C extensions provided by StarPU's GCC plug-in, see
 \ref cExtensions.
 
 \section VectorScalingUsingStarPUAPI Vector Scaling Using StarPU's API
@@ -479,7 +451,7 @@ starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector, NX,
 The first argument, called the <b>data handle</b>, is an opaque pointer which
 designates the array in StarPU. This is also the structure which is used to
 describe which data is used by a task. The second argument is the node number
-where the data originally resides. Here it is 0 since the <c>vector array</c> is in
+where the data originally resides. Here it is 0 since the array <c>vector</c> is in
 the main memory. Then comes the pointer <c>vector</c> where the data can be found in main memory,
 the number of elements in the vector and the size of each element.
 The following shows how to construct a StarPU task that will manipulate the
@@ -569,36 +541,9 @@ The CUDA implementation can be written as follows. It needs to be compiled with
 a CUDA compiler such as nvcc, the NVIDIA CUDA compiler driver. It must be noted
 that the vector pointer returned by ::STARPU_VECTOR_GET_PTR is here a
 pointer in GPU memory, so that it can be passed as such to the
-<c>vector_mult_cuda</c> kernel call.
-
-\code{.c}
-#include <starpu.h>
+kernel call <c>vector_mult_cuda</c>.
 
-static __global__ void vector_mult_cuda(unsigned n, float *val,
-                                        float factor)
-{
-    unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
-    if (i < n)
-        val[i] *= factor;
-}
-
-extern "C" void scal_cuda_func(void *buffers[], void *_args)
-{
-    float *factor = (float *)_args;
-
-    /* length of the vector */
-    unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
-    /* CUDA copy of the vector pointer */
-    float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
-    unsigned threads_per_block = 64;
-    unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
-
-    vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>
-                    (n, val, *factor);
-
-    cudaStreamSynchronize(starpu_cuda_get_local_stream());
-}
-\endcode
+\snippet vector_scal_cuda.cu To be included
 
 \subsection DefinitionOfTheOpenCLKernel Definition of the OpenCL Kernel
 
@@ -620,55 +565,7 @@ which returns a <c>cl_mem</c> (which is not a device pointer, but an OpenCL
 handle), which can be passed as such to the OpenCL kernel. The difference is
 important when using partitioning, see \ref PartitioningData.
 
-\code{.c}
-#include <starpu.h>
-
-extern struct starpu_opencl_program programs;
-
-void scal_opencl_func(void *buffers[], void *_args)
-{
-    float *factor = _args;
-    int id, devid, err;     /* OpenCL specific code */
-    cl_kernel kernel;       /* OpenCL specific code */
-    cl_command_queue queue; /* OpenCL specific code */
-    cl_event event;         /* OpenCL specific code */
-
-    /* length of the vector */
-    unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
-    /* OpenCL copy of the vector pointer */
-    cl_mem val = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
-
-    { /* OpenCL specific code */
-        id = starpu_worker_get_id();
-        devid = starpu_worker_get_devid(id);
-
-	err = starpu_opencl_load_kernel(&kernel, &queue, &programs,
-	                       "vector_mult_opencl", devid);   /* Name of the codelet defined above */
-        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
-
-        err = clSetKernelArg(kernel, 0, sizeof(n), &n);
-        err |= clSetKernelArg(kernel, 1, sizeof(val), &val);
-        err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
-        if (err) STARPU_OPENCL_REPORT_ERROR(err);
-    }
-
-    {  /* OpenCL specific code */
-        size_t global=n;
-        size_t local=1;
-        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
-        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
-    }
-
-    {  /* OpenCL specific code */
-        clFinish(queue);
-        starpu_opencl_collect_stats(event);
-        clReleaseEvent(event);
-
-        starpu_opencl_release_kernel(kernel);
-    }
-}
-\endcode
-
+\snippet vector_scal_opencl.c To be included
 
 \subsection DefinitionOfTheMainCode Definition of the Main Code
 

+ 8 - 8
doc/doxygen/chapters/building.doxy

@@ -134,8 +134,8 @@ $ make install
 \endverbatim
 
 Libtool interface versioning information are included in
-libraries names (libstarpu-1.2.so, libstarpumpi-1.2.so and
-libstarpufft-1.2.so).
+libraries names (<c>libstarpu-1.2.so</c>, <c>libstarpumpi-1.2.so</c> and
+<c>libstarpufft-1.2.so</c>).
 
 \section SettingUpYourOwnCode Setting up Your Own Code
 
@@ -145,10 +145,10 @@ StarPU provides a pkg-config executable to obtain relevant compiler
 and linker flags.
 Compiling and linking an application against StarPU may require to use
 specific flags or libraries (for instance <c>CUDA</c> or <c>libspe2</c>).
-To this end, it is possible to use the <c>pkg-config</c> tool.
+To this end, it is possible to use the tool <c>pkg-config</c>.
 
 If StarPU was not installed at some standard location, the path of StarPU's
-library must be specified in the <c>PKG_CONFIG_PATH</c> environment variable so
+library must be specified in the environment variable <c>PKG_CONFIG_PATH</c> so
 that <c>pkg-config</c> can find it. For example if StarPU was installed in
 <c>$prefix_dir</c>:
 
@@ -175,10 +175,10 @@ Make sure that <c>pkg-config --libs starpu-1.2</c> actually produces some output
 before going further: <c>PKG_CONFIG_PATH</c> has to point to the place where
 <c>starpu-1.2.pc</c> was installed during <c>make install</c>.
 
-Also pass the <c>--static</c> option if the application is to be
+Also pass the option <c>--static</c> if the application is to be
 linked statically.
 
-It is also necessary to set the variable <c>LD_LIBRARY_PATH</c> to
+It is also necessary to set the environment variable <c>LD_LIBRARY_PATH</c> to
 locate dynamic libraries at runtime.
 
 \verbatim
@@ -283,10 +283,10 @@ multiplication using BLAS and cuBLAS. They output the obtained GFlops.
 
 \subsection CholeskyFactorization Cholesky Factorization
 
-<c>cholesky\*</c> perform a Cholesky factorization (single precision). They use different dependency primitives.
+<c>cholesky/*</c> perform a Cholesky factorization (single precision). They use different dependency primitives.
 
 \subsection LUFactorization LU Factorization
 
-<c>lu\*</c> perform an LU factorization. They use different dependency primitives.
+<c>lu/*</c> perform an LU factorization. They use different dependency primitives.
 
 */

+ 45 - 0
doc/doxygen/chapters/code/scal_pragma.cu

@@ -0,0 +1,45 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+//! [To be included]
+/* CUDA implementation of the `vector_scal' task, to be compiled with `nvcc'. */
+
+#include <starpu.h>
+#include <stdlib.h>
+
+static __global__ void
+vector_mult_cuda (unsigned n, float *val, float factor)
+{
+  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (i < n)
+    val[i] *= factor;
+}
+
+/* Definition of the task implementation declared in the C file. */
+extern "C" void
+vector_scal_cuda (size_t size, float vector[], float factor)
+{
+  unsigned threads_per_block = 64;
+  unsigned nblocks = (size + threads_per_block - 1) / threads_per_block;
+
+  vector_mult_cuda <<< nblocks, threads_per_block, 0,
+    starpu_cuda_get_local_stream () >>> (size, vector, factor);
+
+  cudaStreamSynchronize (starpu_cuda_get_local_stream ());
+}
+//! [To be included]

+ 24 - 18
doc/doxygen/chapters/code/vector_scal_opencl.c

@@ -24,29 +24,33 @@ extern struct starpu_opencl_program programs;
 void scal_opencl_func(void *buffers[], void *_args)
 {
     float *factor = _args;
-    int id, devid, err;
-    cl_kernel kernel;
-    cl_command_queue queue;
-    cl_event event;
+    int id, devid, err;                   /* OpenCL specific code */
+    cl_kernel kernel;                     /* OpenCL specific code */
+    cl_command_queue queue;               /* OpenCL specific code */
+    cl_event event;                       /* OpenCL specific code */
 
     /* length of the vector */
     unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
     /* OpenCL copy of the vector pointer */
     cl_mem val = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
 
-    id = starpu_worker_get_id();
-    devid = starpu_worker_get_devid(id);
+    {  /* OpenCL specific code */
+	 id = starpu_worker_get_id();
+	 devid = starpu_worker_get_devid(id);
 
-    err = starpu_opencl_load_kernel(&kernel, &queue, &programs, "vector_mult_opencl",
-                                    devid);
-    if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+	 err = starpu_opencl_load_kernel(&kernel, &queue,
+					 &programs,
+					 "vector_mult_opencl", /* Name of the codelet */
+					 devid);
+	 if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
-    err = clSetKernelArg(kernel, 0, sizeof(n), &n);
-    err |= clSetKernelArg(kernel, 1, sizeof(val), &val);
-    err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
-    if (err) STARPU_OPENCL_REPORT_ERROR(err);
+	 err = clSetKernelArg(kernel, 0, sizeof(n), &n);
+	 err |= clSetKernelArg(kernel, 1, sizeof(val), &val);
+	 err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
+	 if (err) STARPU_OPENCL_REPORT_ERROR(err);
+    }
 
-    {
+    {   /* OpenCL specific code */
         size_t global=n;
         size_t local;
         size_t s;
@@ -63,10 +67,12 @@ void scal_opencl_func(void *buffers[], void *_args)
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
     }
 
-    clFinish(queue);
-    starpu_opencl_collect_stats(event);
-    clReleaseEvent(event);
+    {  /* OpenCL specific code */
+	 clFinish(queue);
+	 starpu_opencl_collect_stats(event);
+	 clReleaseEvent(event);
 
-    starpu_opencl_release_kernel(kernel);
+	 starpu_opencl_release_kernel(kernel);
+    }
 }
 //! [To be included]

+ 14 - 3
doc/doxygen/chapters/environment_variables.doxy

@@ -150,15 +150,15 @@ and \ref STARPU_MAX_WORKERSIZE can be used to change this default.
 <dd>
 \anchor STARPU_MIN_WORKERSIZE
 \addindex __env__STARPU_MIN_WORKERSIZE
-When \ref STARPU_SINGLE_COMBINED_WORKER is set, \ref STARPU_MIN_WORKERSIZE
-permits to specify the minimum size of the combined workers (instead of the default 1)
+\ref STARPU_MIN_WORKERSIZE
+permits to specify the minimum size of the combined workers (instead of the default 2)
 </dd>
 
 <dt>STARPU_MAX_WORKERSIZE</dt>
 <dd>
 \anchor STARPU_MAX_WORKERSIZE
 \addindex __env__STARPU_MAX_WORKERSIZE
-When \ref STARPU_SINGLE_COMBINED_WORKER is set, \ref STARPU_MAX_WORKERSIZE
+\ref STARPU_MAX_WORKERSIZE
 permits to specify the maximum size of the combined workers (instead of the
 number of CPU workers in the system)
 </dd>
@@ -326,6 +326,17 @@ SOCL_OCL_LIB_OPENCL is defined. It should contain the location
 of the file <c>libOpenCL.so</c> of the OCL ICD implementation.
 </dd>
 
+<dt>OCL_ICD_VENDORS</dt>
+<dd>
+\anchor OCL_ICD_VENDORS
+\addindex __env__OCL_ICD_VENDORS
+When using SOCL with OpenCL ICD
+(https://forge.imag.fr/projects/ocl-icd/), this variable may be used
+to point to the directory where ICD files are installed. The default
+directory is <c>/etc/OpenCL/vendors</c>. StarPU installs ICD
+files in the directory <c>$prefix/share/starpu/opencl/vendors</c>.
+</dd>
+
 <dt>STARPU_COMM_STATS</dt>
 <dd>
 \anchor STARPU_COMM_STATS

+ 3 - 3
doc/doxygen/chapters/fft_support.doxy

@@ -9,7 +9,7 @@
 /*! \page FFTSupport FFT Support
 
 StarPU provides <c>libstarpufft</c>, a library whose design is very similar to
-both fftw and cufft, the difference being that it takes benefit from both CPUs
+both <c>fftw</c> and <c>cufft</c>, the difference being that it takes benefit from both CPUs
 and GPUs. It should however be noted that GPUs do not have the same precision as
 CPUs, so the results may differ by a negligible amount.
 
@@ -33,7 +33,7 @@ The documentation below is given with names for double precision, replace
 
 Only complex numbers are supported at the moment.
 
-The application has to call starpu_init() before calling starpufft functions.
+The application has to call starpu_init() before calling <c>starpufft</c> functions.
 
 Either main memory pointers or data handles can be provided.
 
@@ -66,6 +66,6 @@ $ pkg-config --cflags starpufft-1.2  # options for the compiler
 $ pkg-config --libs starpufft-1.2    # options for the linker
 \endverbatim
 
-Also pass the <c>--static</c> option if the application is to be linked statically.
+Also pass the option <c>--static</c> if the application is to be linked statically.
 
 */

+ 50 - 0
doc/doxygen/chapters/files.doxy

@@ -0,0 +1,50 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+ * See the file version.doxy for copying conditions.
+*/
+
+/*! \page Files Files
+
+\file starpu_deprecated_api.h
+\file starpu.h
+\file starpu_data_filters.h
+\file starpu_data_interfaces.h
+\file starpu_worker.h
+\file starpu_task.h
+\file starpu_task_bundle.h
+\file starpu_task_list.h
+\file starpu_task_util.h
+\file starpu_data.h
+\file starpu_perfmodel.h
+\file starpu_util.h
+\file starpu_fxt.h
+\file starpu_cuda.h
+\file starpu_opencl.h
+\file starpu_sink.h
+\file starpu_mic.h
+\file starpu_scc.h
+\file starpu_expert.h
+\file starpu_profiling.h
+\file starpu_bound.h
+\file starpu_scheduler.h
+\file starpu_sched_ctx.h
+\file starpu_top.h
+\file starpu_hash.h
+\file starpu_rand.h
+\file starpu_cublas.h
+\file starpu_driver.h
+\file starpu_stdlib.h
+\file starpu_thread.h
+\file starpu_thread_util.h
+\file starpu_mpi.h
+\file sc_hypervisor.h
+\file sc_hypervisor_config.h
+\file sc_hypervisor_lp.h
+\file sc_hypervisor_monitoring.h
+\file sc_hypervisor_policy.h
+\file starpu_config.h
+
+*/

+ 1 - 0
doc/doxygen/chapters/introduction.doxy

@@ -227,6 +227,7 @@ The documentation chapters include
 <li> \ref ExecutionConfigurationThroughEnvironmentVariables
 <li> \ref CompilationConfiguration
 <li> \ref ModuleDocumentation
+<li> \ref FileDocumentation
 <li> \ref deprecated
 </ul>
 <li> Part: Appendix

+ 2 - 2
doc/doxygen/chapters/mpi_support.doxy

@@ -31,7 +31,7 @@ $ pkg-config --cflags starpumpi-1.2  # options for the compiler
 $ pkg-config --libs starpumpi-1.2    # options for the linker
 \endverbatim
 
-You also need pass the <c>--static</c> option if the application is to
+You also need to pass the option <c>--static</c> if the application is to
 be linked statically.
 
 \code{.c}
@@ -257,7 +257,7 @@ int my_distrib(int x, int y, int nb_nodes) {
 
 Now the data can be registered within StarPU. Data which are not
 owned but will be needed for computations can be registered through
-the lazy allocation mechanism, i.e. with a <c>home_node</c> set to -1.
+the lazy allocation mechanism, i.e. with a <c>home_node</c> set to <c>-1</c>.
 StarPU will automatically allocate the memory when it is used for the
 first time.
 

+ 12 - 12
doc/doxygen/chapters/optimize_performance.doxy

@@ -37,7 +37,7 @@ starpu_data_set_wt_mask(img_handle, 1<<0);
 \endcode
 
 will for instance request to always automatically transfer a replicate into the
-main memory (node 0), as bit 0 of the write-through bitmask is being set.
+main memory (node <c>0</c>), as bit <c>0</c> of the write-through bitmask is being set.
 
 \code{.c}
 starpu_data_set_wt_mask(img_handle, ~0U);
@@ -108,7 +108,7 @@ possibility according to task size, one can run
 speedup of independent tasks of very small sizes.
 
 The choice of scheduler also has impact over the overhead: for instance, the
-<c>dmda</c> scheduler takes time to make a decision, while <c>eager</c> does
+scheduler <c>dmda</c> takes time to make a decision, while <c>eager</c> does
 not. <c>tasks_size_overhead.sh</c> can again be used to get a grasp at how much
 impact that has on the target machine.
 
@@ -132,7 +132,7 @@ priority information to StarPU.
 
 \section TaskSchedulingPolicy Task Scheduling Policy
 
-By default, StarPU uses the <c>eager</c> simple greedy scheduler. This is
+By default, StarPU uses the simple greedy scheduler <c>eager</c>. This is
 because it provides correct load balance even if the application codelets do not
 have performance models. If your application codelets have performance models
 (\ref PerformanceModelExample), you should change the scheduler thanks
@@ -276,14 +276,14 @@ and in Joules for the energy consumption models.
 
 Distributing tasks to balance the load induces data transfer penalty. StarPU
 thus needs to find a balance between both. The target function that the
-<c>dmda</c> scheduler of StarPU
+scheduler <c>dmda</c> of StarPU
 tries to minimize is <c>alpha * T_execution + beta * T_data_transfer</c>, where
 <c>T_execution</c> is the estimated execution time of the codelet (usually
 accurate), and <c>T_data_transfer</c> is the estimated data transfer time. The
 latter is estimated based on bus calibration before execution start,
 i.e. with an idle machine, thus without contention. You can force bus
 re-calibration by running the tool <c>starpu_calibrate_bus</c>. The
-beta parameter defaults to 1, but it can be worth trying to tweak it
+beta parameter defaults to <c>1</c>, but it can be worth trying to tweak it
 by using <c>export STARPU_SCHED_BETA=2</c> for instance, since during
 real application execution, contention makes transfer times bigger.
 This is of course imprecise, but in practice, a rough estimation
@@ -291,7 +291,7 @@ already gives the good results that a precise estimation would give.
 
 \section DataPrefetch Data Prefetch
 
-The <c>heft</c>, <c>dmda</c> and <c>pheft</c> scheduling policies
+The scheduling policies <c>heft</c>, <c>dmda</c> and <c>pheft</c>
 perform data prefetch (see \ref STARPU_PREFETCH):
 as soon as a scheduling decision is taken for a task, requests are issued to
 transfer its required data to the target processing unit, if needeed, so that
@@ -310,9 +310,9 @@ the handle and the desired target memory node.
 \section Power-basedScheduling Power-based Scheduling
 
 If the application can provide some power performance model (through
-the <c>power_model</c> field of the codelet structure), StarPU will
+the field starpu_codelet::power_model), StarPU will
 take it into account when distributing tasks. The target function that
-the <c>dmda</c> scheduler minimizes becomes <c>alpha * T_execution +
+the scheduler <c>dmda</c> minimizes becomes <c>alpha * T_execution +
 beta * T_data_transfer + gamma * Consumption</c> , where <c>Consumption</c>
 is the estimated task consumption in Joules. To tune this parameter, use
 <c>export STARPU_SCHED_GAMMA=3000</c> for instance, to express that each Joule
@@ -333,7 +333,7 @@ On-line task consumption measurement is currently only supported through the
 <c>CL_PROFILING_POWER_CONSUMED</c> OpenCL extension, implemented in the MoviSim
 simulator. Applications can however provide explicit measurements by
 using the function starpu_perfmodel_update_history() (exemplified in \ref PerformanceModelExample
-with the <c>power_model</c> performance model. Fine-grain
+with the <c>power_model</c> performance model). Fine-grain
 measurement is often not feasible with the feedback provided by the hardware, so
 the user can for instance run a given task a thousand times, measure the global
 consumption for that series of tasks, divide it by a thousand, repeat for
@@ -446,9 +446,9 @@ $ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
 TEST PASSED
 \endverbatim
 
-Note that we force to use the dmda scheduler to generate performance
-models for the application. The application may need to be run several
-times before the model is calibrated.
+Note that we force to use the scheduler <c>dmda</c> to generate
+performance models for the application. The application may need to be
+run several times before the model is calibrated.
 
 \subsection Simulation Simulation
 

+ 15 - 15
doc/doxygen/chapters/performance_feedback.doxy

@@ -16,7 +16,7 @@ nice visual task debugging. To do so, build Temanejo's <c>libayudame.so</c>,
 install <c>Ayudame.h</c> to e.g. <c>/usr/local/include</c>, apply the
 <c>tools/patch-ayudame</c> to it to fix C build, re-<c>./configure</c>, make
 sure that it found it, rebuild StarPU.  Run the Temanejo GUI, give it the path
-to your application, any options you want to pass it, the path to libayudame.so.
+to your application, any options you want to pass it, the path to <c>libayudame.so</c>.
 
 Make sure to specify at least the same number of CPUs in the dialog box as your
 machine has, otherwise an error will happen during execution. Future versions
@@ -35,7 +35,7 @@ call starpu_profiling_status_set() with the parameter
 is already enabled or not by calling starpu_profiling_status_get().
 Enabling monitoring also reinitialize all previously collected
 feedback. The environment variable \ref STARPU_PROFILING can also be
-set to 1 to achieve the same effect.
+set to <c>1</c> to achieve the same effect.
 
 Likewise, performance monitoring is stopped by calling
 starpu_profiling_status_set() with the parameter
@@ -247,7 +247,7 @@ Or you can simply point the <c>PKG_CONFIG_PATH</c> to
 \ref with-fxt "--with-fxt" to <c>./configure</c>
 
 When FxT is enabled, a trace is generated when StarPU is terminated by calling
-starpu_shutdown()). The trace is a binary file whose name has the form
+starpu_shutdown(). The trace is a binary file whose name has the form
 <c>prof_file_XXX_YYY</c> where <c>XXX</c> is the user name, and
 <c>YYY</c> is the pid of the process that used StarPU. This file is saved in the
 <c>/tmp/</c> directory by default, or by the directory specified by
@@ -269,7 +269,7 @@ application shutdown.
 This will create a file <c>paje.trace</c> in the current directory that
 can be inspected with the <a href="http://vite.gforge.inria.fr/">ViTE trace
 visualizing open-source tool</a>.  It is possible to open the
-<c>paje.trace</c> file with ViTE by using the following command:
+file <c>paje.trace</c> with ViTE by using the following command:
 
 \verbatim
 $ vite paje.trace
@@ -322,7 +322,7 @@ generate an activity trace by calling:
 $ starpu_fxt_tool -i filename
 \endverbatim
 
-This will create an <c>activity.data</c> file in the current
+This will create a file <c>activity.data</c> in the current
 directory. A profile of the application showing the activity of StarPU
 during the execution of the program can be generated:
 
@@ -341,7 +341,7 @@ efficiently. The black sections indicate that the processing unit was blocked
 because there was no task to process: this may indicate a lack of parallelism
 which may be alleviated by creating more tasks when it is possible.
 
-The second part of the <c>activity.eps</c> picture is a graph showing the
+The second part of the picture <c>activity.eps</c> is a graph showing the
 evolution of the number of tasks available in the system during the execution.
 Ready tasks are shown in black, and tasks that are submitted but not
 schedulable yet are shown in grey.
@@ -360,8 +360,8 @@ file: <starpu_slu_lu_model_22.hannibal>
 file: <starpu_slu_lu_model_12.hannibal>
 \endverbatim
 
-Here, the codelets of the lu example are available. We can examine the
-performance of the 22 kernel (in micro-seconds), which is history-based:
+Here, the codelets of the example <c>lu</c> are available. We can examine the
+performance of the kernel <c>22</c> (in micro-seconds), which is history-based:
 
 \verbatim
 $ starpu_perfmodel_display -s starpu_slu_lu_model_22
@@ -414,7 +414,7 @@ starpu_perfmodel_load_symbol(). The source code of the tool
 
 The tool <c>starpu_perfmodel_plot</c> can be used to draw performance
 models. It writes a <c>.gp</c> file in the current directory, to be
-run in the <c>gnuplot</c> tool, which shows the corresponding curve.
+run with the tool <c>gnuplot</c>, which shows the corresponding curve.
 
 When the field starpu_task::flops is set, <c>starpu_perfmodel_plot</c> can
 directly draw a GFlops curve, by simply adding the <c>-f</c> option:
@@ -448,13 +448,13 @@ $ starpu_perfmodel_plot -s non_linear_memset_regression_based -i /tmp/prof_file_
 It will produce a <c>.gp</c> file which contains both the performance model
 curves, and the profiling measurements.
 
-If you have the <c>R</c> statistical tool installed, you can additionally use
+If you have the statistical tool <c>R</c> installed, you can additionally use
 
 \verbatim
 $ starpu_codelet_histo_profile distrib.data
 \endverbatim
 
-Which will create one pdf file per codelet and per input size, showing a
+Which will create one <c>.pdf</c> file per codelet and per input size, showing a
 histogram of the codelet execution time distribution.
 
 \section TheoreticalLowerBoundOnExecutionTime Theoretical Lower Bound On Execution Time
@@ -475,13 +475,13 @@ use this.
 \section MemoryFeedback Memory Feedback
 
 It is possible to enable memory statistics. To do so, you need to pass
-the option \ref enable-memory-stats "--enable-memory-stats" when running configure. It is then
-possible to call the function starpu_display_memory_stats() to
+the option \ref enable-memory-stats "--enable-memory-stats" when running <c>configure</c>. It is then
+possible to call the function starpu_data_display_memory_stats() to
 display statistics about the current data handles registered within StarPU.
 
 Moreover, statistics will be displayed at the end of the execution on
 data handles which have not been cleared out. This can be disabled by
-setting the environment variable \ref STARPU_MEMORY_STATS to 0.
+setting the environment variable \ref STARPU_MEMORY_STATS to <c>0</c>.
 
 For example, if you do not unregister data at the end of the complex
 example, you will get something similar to:
@@ -552,7 +552,7 @@ of the application. To enable them, you need to pass the option
 starpu_shutdown() various statistics will be displayed:
 execution, MSI cache statistics, allocation cache statistics, and data
 transfer statistics. The display can be disabled by setting the
-environment variable \ref STARPU_STATS to 0.
+environment variable \ref STARPU_STATS to <c>0</c>.
 
 \verbatim
 $ ./examples/cholesky/cholesky_tag

+ 52 - 0
doc/doxygen/chapters/socl_opencl_extensions.doxy

@@ -18,4 +18,56 @@ the context to which the command queue is attached.
 
 Note: this is still an area under development and subject to change.
 
+When compiling StarPU, SOCL will be enabled if a valid OpenCL
+implementation is found on your system. To be able to run the SOCL
+test suite, the environment variable \ref SOCL_OCL_LIB_OPENCL needs to
+be defined to the location of the file <c>libOpenCL.so</c> of the OCL
+ICD implementation. You should for example add the following line in
+your file <c>.bashrc</c>
+
+\verbatim
+export SOCL_OCL_LIB_OPENCL=/usr/lib/x86_64-linux-gnu/libOpenCL.so
+\endverbatim
+
+You can then run the test suite in the directory <c>socl/examples</c>.
+
+\verbatim
+$ make check
+...
+PASS: basic/basic
+PASS: testmap/testmap
+PASS: clinfo/clinfo
+PASS: matmul/matmul
+PASS: mansched/mansched
+==================
+All 5 tests passed
+==================
+\endverbatim
+
+The environment variable \ref OCL_ICD_VENDORS has to point to the directory
+where the ICD files are installed. When compiling StarPU, the files
+are in the directory <c>socl/vendors</c>. With an installed version of
+StarPU, the files are installed in the directory
+<c>$prefix/share/starpu/opencl/vendors</c>.
+
+To run the tests by hand, you can for example call:
+
+\verbatim
+$ LD_PRELOAD=$SOCL_OCL_LIB_OPENCL OCL_ICD_VENDORS=socl/vendors/ socl/examples/clinfo/clinfo
+Number of platforms:	2
+  Plaform Profile:	FULL_PROFILE
+  Plaform Version:	OpenCL 1.1 CUDA 4.2.1
+  Plaform Name:		NVIDIA CUDA
+  Plaform Vendor:	NVIDIA Corporation
+  Plaform Extensions:	cl_khr_byte_addressable_store cl_khr_icd cl_khr_gl_sharing cl_nv_compiler_options cl_nv_device_attribute_query cl_nv_pragma_unroll 
+
+  Plaform Profile:	FULL_PROFILE
+  Plaform Version:	OpenCL 1.0 SOCL Edition (0.1.0)
+  Plaform Name:		SOCL Platform
+  Plaform Vendor:	INRIA
+  Plaform Extensions:	cl_khr_icd
+....
+$
+\endverbatim
+
 */

+ 14 - 0
doc/doxygen/chapters/tips_and_tricks.doxy

@@ -95,4 +95,18 @@ Or add the following line in the file <c>/etc/sysctl.conf</c>
 security.models.extensions.user_set_cpu_affinity=1
 \endverbatim
 
+\section UsingStarPUWithMKL Using StarPU With MKL 11 (Intel Composer XE 2013)
+
+Some users had issues on Linux with MKL 11 and StarPU (versions 1.1rc1
+and 1.0.5), using 1 thread for MKL and doing all the
+parallelism using StarPU (no multithreaded tasks), setting the
+environment variable MKL_NUM_THREADS to 1, and using the threaded MKL library,
+with iomp5.
+
+Using this configuration, StarPU uses only 1 core, no matter the value of
+\ref STARPU_NCPU. The problem is actually a thread pinning issue with MKL.
+
+The solution is to set the environment variable KMP_AFFINITY to <c>disabled</c>
+(http://software.intel.com/sites/products/documentation/studio/composer/en-us/2011Update/compiler_c/optaps/common/optaps_openmp_thread_affinity.htm).
+
 */

+ 20 - 0
doc/doxygen/dev/checkDoc.sh

@@ -0,0 +1,20 @@
+#!/bin/bash
+
+x=$(grep ingroup chapters/api/*|awk -F':' '{print $2}'| awk 'NF != 2')
+if test -n "$x" ; then
+    echo Errors on group definitions
+    echo $x
+fi
+
+echo
+echo "Defined groups"
+grep ingroup chapters/api/*|awk -F':' '{print $2}'| awk 'NF == 2'|sort|uniq
+echo
+
+for f in ../../build/doc/doxygen/latex/*tex ; do
+    x=$(grep $(basename $f .tex) refman.tex)
+    if test -z "$x" ; then
+	echo Error. $f not included in refman.tex
+    fi
+done
+

+ 38 - 0
doc/doxygen/dev/starpu_check_documented.py

@@ -0,0 +1,38 @@
+#!/usr/bin/python
+
+import os
+
+class bcolors:
+    FAILURE = '\033[91m'
+    NORMAL = '\033[0m'
+
+def loadFunctionsAndDatatypes(flist, dtlist, fname):
+    f = open(fname, 'r')
+    for line in f:
+        mline = line[:-1]
+        if mline.count("\\fn"):
+            if mline.count("fft") == 0:
+                func = mline.replace("\\fn ", "")
+                flist.append(list([func, fname]))
+        if mline.count("\\struct ") or mline.count("\\def ") or mline.count("\\typedef ") or mline.count("\\enum "):
+            datatype = mline.replace("\\struct ", "").replace("\\def ", "").replace("\\typedef ", "").replace("\\enum ","")
+            dtlist.append(list([datatype, fname]))
+    f.close()
+
+functions = []
+datatypes = []
+
+for docfile in os.listdir('chapters/api'):
+    if docfile.count(".doxy"):
+        loadFunctionsAndDatatypes(functions, datatypes, "chapters/api/"+docfile)
+
+for function in functions:
+    x = os.system("fgrep -l \"" + function[0] + "\" ../../include/*.h ../../mpi/include/*.h ../../starpufft/*h ../../sc_hypervisor/include/*.h > /dev/null")
+    if x != 0:
+        print "Function <" + bcolors.FAILURE + function[0] + bcolors.NORMAL + "> documented in <" + function[1] + "> does not exist in StarPU's API"
+
+for datatype in datatypes:
+    x = os.system("fgrep -l \"" + datatype[0] + "\" ../../include/*.h ../../mpi/include/*.h ../../starpufft/*h ../../sc_hypervisor/include/*.h > /dev/null")
+    if x != 0:
+        print "Datatype <" + bcolors.FAILURE + datatype[0] + bcolors.NORMAL + "> documented in <" + datatype[1] + "> does not exist in StarPU's API"
+

+ 78 - 0
doc/doxygen/dev/starpu_check_undocumented.sh

@@ -0,0 +1,78 @@
+#!/bin/bash
+# Note: expects Coccinelle's spatch command in the PATH
+# See: http://coccinelle.lip6.fr/
+
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2011, 2012, 2013 Centre National de la Recherche Scientifique
+# Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+stcolor=$(tput sgr0)
+redcolor=$(tput setaf 1)
+greencolor=$(tput setaf 2)
+
+H_FILES=$(find ../../include ../../mpi/include -name '*.h')
+
+functions=$(spatch -very_quiet -sp_file ./dev/starpu_funcs.cocci $H_FILES)
+for func in $functions ; do
+	fname=$(echo $func|awk -F ',' '{print $1}')
+	location=$(echo $func|awk -F ',' '{print $2}')
+	x=$(grep "$fname(" chapters/api/*.doxy | grep "\\fn")
+	if test "$x" == "" ; then
+		echo "function ${redcolor}${fname}${stcolor} at location ${redcolor}$location${stcolor} is not (or incorrectly) documented"
+#	else
+#		echo "function ${greencolor}${fname}${stcolor} at location $location is correctly documented"
+	fi
+done
+
+echo
+
+structs=$(grep "struct starpu" $H_FILES | grep -v "[;|,|(|)]" | awk '{print $2}')
+for struct in $structs ; do
+    x=$(grep -F "\\struct $struct" chapters/api/*.doxy)
+    if test "$x" == "" ; then
+	echo "struct ${redcolor}${struct}${stcolor} is not (or incorrectly) documented"
+    fi
+done
+
+echo
+
+enums=$(grep "enum starpu" $H_FILES | grep -v "[;|,|(|)]" | awk '{print $2}')
+for enum in $enums ; do
+    x=$(grep -F "\\enum $enum" chapters/api/*.doxy)
+    if test "$x" == "" ; then
+	echo "enum ${redcolor}${enum}${stcolor} is not (or incorrectly) documented"
+    fi
+done
+
+echo
+
+macros=$(grep "define\b" $H_FILES |grep -v deprecated|grep "#" | grep -v "__" | sed 's/#[ ]*/#/g' | awk '{print $2}' | awk -F'(' '{print $1}' | sort|uniq)
+for macro in $macros ; do
+    x=$(grep -F "\\def $macro" chapters/api/*.doxy)
+    if test "$x" == "" ; then
+	echo "macro ${redcolor}${macro}${stcolor} is not (or incorrectly) documented"
+    fi
+done
+
+echo
+
+variables=$(grep --exclude-dir=.svn -rs -E "(getenv|get_env)" src/| tr ' ' '\012'|grep -E "(getenv|get_env)" | grep "\"" | sed 's/.*("//' | sed 's/").*//'|sort|uniq)
+for variable in $variables ; do
+    x=$(grep "$variable" chapters/environment_variables.doxy | grep "\\anchor")
+    if test "$x" == "" ; then
+	echo "variable ${redcolor}${variable}${stcolor} is not (or incorrectly) documented"
+    fi
+done
+

tools/dev/starpu_funcs.cocci → doc/doxygen/dev/starpu_funcs.cocci


+ 33 - 2
doc/doxygen/doxygen-config.cfg.in

@@ -18,8 +18,38 @@
 
 INPUT                  = @top_srcdir@/doc/doxygen/chapters \
 		       	 @top_srcdir@/doc/doxygen/chapters/api \
-                         @top_builddir@/include/starpu_config.h \
-			 @top_srcdir@/include/ \
+                         @top_builddir@/doc/doxygen/starpu_config.h \
+	 		 @top_srcdir@/include/starpu_bound.h \
+			 @top_srcdir@/include/starpu_cublas.h \
+			 @top_srcdir@/include/starpu_cuda.h \
+			 @top_srcdir@/include/starpu_data_filters.h \
+			 @top_srcdir@/include/starpu_data.h \
+			 @top_srcdir@/include/starpu_data_interfaces.h \
+			 @top_srcdir@/include/starpu_deprecated_api.h \
+			 @top_srcdir@/include/starpu_driver.h \
+			 @top_srcdir@/include/starpu_expert.h \
+			 @top_srcdir@/include/starpu_fxt.h \
+			 @top_srcdir@/include/starpu.h \
+			 @top_srcdir@/include/starpu_hash.h \
+			 @top_srcdir@/include/starpu_mic.h \
+			 @top_srcdir@/include/starpu_opencl.h \
+			 @top_srcdir@/include/starpu_perfmodel.h \
+			 @top_srcdir@/include/starpu_profiling.h \
+			 @top_srcdir@/include/starpu_rand.h \
+			 @top_srcdir@/include/starpu_scc.h \
+			 @top_srcdir@/include/starpu_sched_ctx.h \
+			 @top_srcdir@/include/starpu_scheduler.h \
+			 @top_srcdir@/include/starpu_sink.h \
+			 @top_srcdir@/include/starpu_stdlib.h \
+			 @top_srcdir@/include/starpu_task_bundle.h \
+			 @top_srcdir@/include/starpu_task.h \
+			 @top_srcdir@/include/starpu_task_list.h \
+			 @top_srcdir@/include/starpu_task_util.h \
+			 @top_srcdir@/include/starpu_thread.h \
+			 @top_srcdir@/include/starpu_thread_util.h \
+			 @top_srcdir@/include/starpu_top.h \
+			 @top_srcdir@/include/starpu_util.h \
+			 @top_srcdir@/include/starpu_worker.h \
 			 @top_srcdir@/mpi/include/ \
 			 @top_srcdir@/starpufft/starpufft.h \
 			 @top_srcdir@/sc_hypervisor/include
@@ -31,3 +61,4 @@ EXAMPLE_PATH           = @top_srcdir@/doc/doxygen \
 INPUT_FILTER           = @top_builddir@/doc/doxygen/doxygen_filter.sh
 
 LATEX_HEADER           = @top_srcdir@/doc/doxygen/refman.tex
+

+ 2 - 2
doc/doxygen/doxygen.cfg

@@ -143,7 +143,7 @@ INLINE_INHERITED_MEMB  = NO
 # path before files name in the file list and in the header files. If set
 # to NO the shortest path that makes the file name unique will be used.
 
-FULL_PATH_NAMES        = YES
+FULL_PATH_NAMES        = NO
 
 # If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
 # can be used to strip a user-defined part of the path. Stripping is
@@ -581,7 +581,7 @@ ENABLED_SECTIONS       =
 # documentation can be controlled using \showinitializer or \hideinitializer
 # command in the documentation regardless of this setting.
 
-MAX_INITIALIZER_LINES  = 30
+MAX_INITIALIZER_LINES  = 0
 
 # Set the SHOW_USED_FILES tag to NO to disable the list of files generated
 # at the bottom of the documentation of classes and structs. If set to YES the

+ 46 - 0
doc/doxygen/refman.tex

@@ -216,6 +216,52 @@ Documentation License”.
 \input{group__API__Scheduling__Policy}
 \input{group__API__Scheduling__Context__Hypervisor}
 
+\chapter{File Index}
+\input{files}
+
+\chapter{File Documentation}
+\label{FileDocumentation}
+\hypertarget{FileDocumentation}{}
+
+\input{starpu_8h}
+\input{starpu__bound_8h}
+\input{starpu__config_8h}
+\input{starpu__cublas_8h}
+\input{starpu__cuda_8h}
+\input{starpu__data_8h}
+\input{starpu__data__filters_8h}
+\input{starpu__data__interfaces_8h}
+\input{starpu__deprecated__api_8h}
+\input{starpu__driver_8h}
+\input{starpu__expert_8h}
+\input{starpu__fxt_8h}
+\input{starpu__hash_8h}
+\input{starpu__mic_8h}
+\input{starpu__opencl_8h}
+\input{starpu__perfmodel_8h}
+\input{starpu__profiling_8h}
+\input{starpu__rand_8h}
+\input{starpu__scc_8h}
+\input{starpu__sched__ctx_8h}
+\input{starpu__scheduler_8h}
+\input{starpu__sink_8h}
+\input{starpu__stdlib_8h}
+\input{starpu__task_8h}
+\input{starpu__task__bundle_8h}
+\input{starpu__task__list_8h}
+\input{starpu__task__util_8h}
+\input{starpu__thread_8h}
+\input{starpu__thread__util_8h}
+\input{starpu__top_8h}
+\input{starpu__util_8h}
+\input{starpu__worker_8h}
+\input{starpu__mpi_8h}
+\input{sc__hypervisor_8h}
+\input{sc__hypervisor__config_8h}
+\input{sc__hypervisor__lp_8h}
+\input{sc__hypervisor__monitoring_8h}
+\input{sc__hypervisor__policy_8h}
+
 \chapter{Deprecated List}
 \label{deprecated}
 \hypertarget{deprecated}{}

+ 1 - 1
doc/texinfo/chapters/api.texi

@@ -2813,7 +2813,7 @@ Used by @code{STARPU_HISTORY_BASED} and @code{STARPU_NL_REGRESSION_BASED},
 records all execution history measures.
 
 @item @code{struct starpu_perfmodel_regression_model regression}
-Used by @code{STARPU_HISTORY_REGRESION_BASED} and
+Used by @code{STARPU_REGRESSION_BASED} and
 @code{STARPU_NL_REGRESSION_BASED}, contains the estimated factors of the
 regression.
 

tools/dev/starpu_check_documented.py → doc/texinfo/dev/starpu_check_documented.py


tools/dev/starpu_check_undocumented.sh → doc/texinfo/dev/starpu_check_undocumented.sh


+ 28 - 0
doc/texinfo/dev/starpu_funcs.cocci

@@ -0,0 +1,28 @@
+// StarPU --- Runtime system for heterogeneous multicore architectures.
+//
+// Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+//
+// StarPU is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation; either version 2.1 of the License, or (at
+// your option) any later version.
+//
+// StarPU is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+//
+// See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+@starpufunc@
+position p;
+type t;
+identifier f =~ "starpu";
+@@
+
+t f@p( ... );
+
+@ script:python @
+p << starpufunc.p;
+f << starpufunc.f;
+@@
+print "%s,%s:%s" % (f,p[0].file,p[0].line)

+ 3 - 0
include/starpu_data.h

@@ -60,6 +60,9 @@ int starpu_data_acquire(starpu_data_handle_t handle, enum starpu_data_access_mod
 int starpu_data_acquire_on_node(starpu_data_handle_t handle, unsigned node, enum starpu_data_access_mode mode);
 int starpu_data_acquire_cb(starpu_data_handle_t handle, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg);
 int starpu_data_acquire_on_node_cb(starpu_data_handle_t handle, unsigned node, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg);
+int starpu_data_acquire_cb_sequential_consistency(starpu_data_handle_t handle, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg, int sequential_consistency);
+int starpu_data_acquire_on_node_cb_sequential_consistency(starpu_data_handle_t handle, unsigned node, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg, int sequential_consistency);
+
 #ifdef __GCC__
 #  define STARPU_DATA_ACQUIRE_CB(handle, mode, code) do \
 	{ \						\

+ 2 - 2
include/starpu_data_interfaces.h

@@ -304,13 +304,13 @@ size_t starpu_csr_get_elemsize(starpu_data_handle_t handle);
 #define STARPU_CSR_GET_NNZ(interface)	(((struct starpu_csr_interface *)(interface))->nnz)
 #define STARPU_CSR_GET_NROW(interface)	(((struct starpu_csr_interface *)(interface))->nrow)
 #define STARPU_CSR_GET_NZVAL(interface)	(((struct starpu_csr_interface *)(interface))->nzval)
-#define STARPU_CSR_GET_NZVAL_DEV_HANDLE \
+#define STARPU_CSR_GET_NZVAL_DEV_HANDLE(interface)		\
 	(((struct starpu_csr_interface *)(interface))->nnz)
 #define STARPU_CSR_GET_COLIND(interface)	(((struct starpu_csr_interface *)(interface))->colind)
 #define STARPU_CSR_GET_COLIND_DEV_HANDLE(interface) \
 	(((struct starpu_csr_interface *)(interface))->colind)
 #define STARPU_CSR_GET_ROWPTR(interface)	(((struct starpu_csr_interface *)(interface))->rowptr)
-#define STARPU_CSR_GET_ROWPTR_DEV_HANDLE \
+#define STARPU_CSR_GET_ROWPTR_DEV_HANDLE(interface)		\
 	(((struct starpu_csr_interface *)(interface))->rowptr)
 #define STARPU_CSR_GET_OFFSET 0
 #define STARPU_CSR_GET_FIRSTENTRY(interface)	(((struct starpu_csr_interface *)(interface))->firstentry)

+ 3 - 3
include/starpu_opencl.h

@@ -61,12 +61,12 @@ void starpu_opencl_get_current_queue(cl_command_queue *queue);
 
 void starpu_opencl_load_program_source(const char *source_file_name, char *located_file_name, char *located_dir_name, char *opencl_program_source);
 int starpu_opencl_compile_opencl_from_file(const char *source_file_name, const char *build_options);
-int starpu_opencl_compile_opencl_from_string(const char *opencl_program_source, const char *file_name, const char* build_options);
+int starpu_opencl_compile_opencl_from_string(const char *opencl_program_source, const char *file_name, const char *build_options);
 
 int starpu_opencl_load_binary_opencl(const char *kernel_id, struct starpu_opencl_program *opencl_programs);
 
-int starpu_opencl_load_opencl_from_file(const char *source_file_name, struct starpu_opencl_program *opencl_programs, const char* build_options);
-int starpu_opencl_load_opencl_from_string(const char *opencl_program_source, struct starpu_opencl_program *opencl_programs, const char* build_options);
+int starpu_opencl_load_opencl_from_file(const char *source_file_name, struct starpu_opencl_program *opencl_programs, const char *build_options);
+int starpu_opencl_load_opencl_from_string(const char *opencl_program_source, struct starpu_opencl_program *opencl_programs, const char *build_options);
 int starpu_opencl_unload_opencl(struct starpu_opencl_program *opencl_programs);
 
 int starpu_opencl_load_kernel(cl_kernel *kernel, cl_command_queue *queue, struct starpu_opencl_program *opencl_programs, const char *kernel_name, int devid);

+ 4 - 4
include/starpu_sched_ctx.h

@@ -67,14 +67,14 @@ unsigned starpu_sched_ctx_check_if_hypervisor_exists(void);
 
 void starpu_sched_ctx_set_policy_data(unsigned sched_ctx_id, void *policy_data);
 
-void* starpu_sched_ctx_get_policy_data(unsigned sched_ctx_id);
+void *starpu_sched_ctx_get_policy_data(unsigned sched_ctx_id);
 
 
-struct starpu_worker_collection* starpu_sched_ctx_create_worker_collection(unsigned sched_ctx_id, enum starpu_worker_collection_type type);
+struct starpu_worker_collection *starpu_sched_ctx_create_worker_collection(unsigned sched_ctx_id, enum starpu_worker_collection_type type);
 
 void starpu_sched_ctx_delete_worker_collection(unsigned sched_ctx_id);
 
-struct starpu_worker_collection* starpu_sched_ctx_get_worker_collection(unsigned sched_ctx_id);
+struct starpu_worker_collection *starpu_sched_ctx_get_worker_collection(unsigned sched_ctx_id);
 
 unsigned starpu_sched_ctx_get_nworkers(unsigned sched_ctx_id);
 
@@ -112,7 +112,7 @@ int starpu_sched_ctx_set_max_priority(unsigned sched_ctx_id, int max_prio);
 #define STARPU_DEFAULT_PRIO	0
 
 /* execute any parallel code on the workers of the sched_ctx (workers are blocked) */
-void* starpu_sched_ctx_exec_parallel_code(void* (*func)(void*), void* param, unsigned sched_ctx_id);
+void *starpu_sched_ctx_exec_parallel_code(void* (*func)(void*), void *param, unsigned sched_ctx_id);
 
 #ifdef __cplusplus
 }

+ 1 - 1
include/starpu_task.h

@@ -239,7 +239,7 @@ void starpu_codelet_display_stats(struct starpu_codelet *cl);
 
 struct starpu_task *starpu_task_get_current(void);
 
-void starpu_parallel_task_barrier_init(struct starpu_task* task, int workerid);
+void starpu_parallel_task_barrier_init(struct starpu_task *task, int workerid);
 
 struct starpu_task *starpu_task_dup(struct starpu_task *task);
 

+ 1 - 1
include/starpu_task_util.h

@@ -29,7 +29,7 @@ extern "C"
 {
 #endif
 
-void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t *deps,	void (*callback)(void *), void *callback_arg);
+void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t *deps, void (*callback)(void *), void *callback_arg);
 
 #define STARPU_VALUE		 (1<<19)
 #define STARPU_CALLBACK		 (1<<20)

+ 11 - 39
include/starpu_top.h

@@ -82,50 +82,22 @@ enum starpu_top_message_type
 	TOP_TYPE_UNKNOW
 };
 
-struct starpu_top_data *starpu_top_add_data_boolean(const char *data_name,
-						    int active);
-struct starpu_top_data *starpu_top_add_data_integer(const char *data_name,
-						     int minimum_value,
-						     int maximum_value,
-						     int active);
-struct starpu_top_data *starpu_top_add_data_float(const char *data_name,
-						  double minimum_value,
-						  double maximum_value,
-						  int active);
-struct starpu_top_param *starpu_top_register_parameter_boolean(const char *param_name,
-							       int *parameter_field,
-							       void (*callback)(struct starpu_top_param*));
-struct starpu_top_param *starpu_top_register_parameter_integer(const char *param_name,
-							       int *parameter_field,
-							       int minimum_value,
-							       int maximum_value,
-							       void (*callback)(struct starpu_top_param*));
-struct starpu_top_param *starpu_top_register_parameter_float(const char *param_name,
-							     double *parameter_field,
-							     double minimum_value,
-							     double maximum_value,
-							     void (*callback)(struct starpu_top_param*));
-struct starpu_top_param *starpu_top_register_parameter_enum(const char *param_name,
-							    int *parameter_field,
-							    char **values,
-							    int nb_values,
-							    void (*callback)(struct starpu_top_param*));
-
-
+struct starpu_top_data *starpu_top_add_data_boolean(const char *data_name, int active);
+struct starpu_top_data *starpu_top_add_data_integer(const char *data_name, int minimum_value, int maximum_value, int active);
+struct starpu_top_data *starpu_top_add_data_float(const char *data_name, double minimum_value, double maximum_value, int active);
 
+struct starpu_top_param *starpu_top_register_parameter_boolean(const char *param_name, int *parameter_field, void (*callback)(struct starpu_top_param*));
+struct starpu_top_param *starpu_top_register_parameter_integer(const char *param_name, int *parameter_field, int minimum_value, int maximum_value, void (*callback)(struct starpu_top_param*));
+struct starpu_top_param *starpu_top_register_parameter_float(const char *param_name, double *parameter_field, double minimum_value, double maximum_value, void (*callback)(struct starpu_top_param*));
+struct starpu_top_param *starpu_top_register_parameter_enum(const char *param_name, int *parameter_field, char **values, int nb_values, void (*callback)(struct starpu_top_param*));
 
 void starpu_top_init_and_wait(const char *server_name);
 
 void starpu_top_update_parameter(const struct starpu_top_param *param);
-void starpu_top_update_data_boolean(const struct starpu_top_data *data,
-				    int value);
-void starpu_top_update_data_integer(const struct starpu_top_data *data,
-				    int value);
-void starpu_top_update_data_float(const struct starpu_top_data *data,
-				  double value);
-void starpu_top_task_prevision(struct starpu_task *task,
-			       int devid, unsigned long long start,
-			       unsigned long long end);
+void starpu_top_update_data_boolean(const struct starpu_top_data *data, int value);
+void starpu_top_update_data_integer(const struct starpu_top_data *data, int value);
+void starpu_top_update_data_float(const struct starpu_top_data *data, double value);
+void starpu_top_task_prevision(struct starpu_task *task, int devid, unsigned long long start, unsigned long long end);
 
 void starpu_top_debug_log(const char *message);
 void starpu_top_debug_lock(const char *message);

+ 1 - 0
mic-configure

@@ -37,6 +37,7 @@ do
 	params="--enable-mic --with-coi-dir=$coi_dir --prefix=$prefix/$arch"
 
 	if test x$arch = xmic ; then
+		# TODO: fix hwloc detection to look for another pkg-config place, and not just believe in the host version of hwloc.pc...
 		params="$params --without-hwloc --with-coi-lib-dir=$coi_dir/device-linux-release/lib --host=$mic_host"
 	else
 		params="$params --with-coi-lib-dir=$coi_dir/host-linux-release/lib"

+ 203 - 104
mpi/src/starpu_mpi.c

@@ -57,18 +57,13 @@ static int posted_requests = 0, newer_requests, barrier_running = 0;
 
 #define _STARPU_MPI_INC_POSTED_REQUESTS(value) { STARPU_PTHREAD_MUTEX_LOCK(&mutex_posted_requests); posted_requests += value; STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_posted_requests); }
 
-struct _starpu_mpi_envelope
-{
-	ssize_t psize;
-	int mpi_tag;
-};
-
 struct _starpu_mpi_copy_handle
 {
 	starpu_data_handle_t handle;
 	struct _starpu_mpi_envelope *env;
 	int mpi_tag;
 	UT_hash_handle hh;
+	struct _starpu_mpi_req *req;
 };
 
  /********************************************************/
@@ -176,135 +171,170 @@ static void delete_chandle(struct _starpu_mpi_copy_handle *chandle)
 	}
 }
 
-/********************************************************/
-/*                                                      */
-/*  Send/Receive functionalities                        */
-/*                                                      */
-/********************************************************/
-
-static struct _starpu_mpi_req *_starpu_mpi_isend_irecv_common(starpu_data_handle_t data_handle,
-							      int srcdst, int mpi_tag, MPI_Comm comm,
-							      unsigned detached, void (*callback)(void *), void *arg,
-							      enum _starpu_mpi_request_type request_type, void (*func)(struct _starpu_mpi_req *),
-							      enum starpu_data_access_mode mode)
+static void _starpu_mpi_request_init(struct _starpu_mpi_req *req)
 {
+	/* Initialize the request structure */
+	req->data_handle = NULL;
 
-	_STARPU_MPI_LOG_IN();
-	struct _starpu_mpi_req *req = calloc(1, sizeof(struct _starpu_mpi_req));
-	STARPU_ASSERT_MSG(req, "Invalid request");
+	req->datatype = NULL;
+	req->ptr = NULL;
+	req->count = -1;
+	req->user_datatype = -1;
 
-	_STARPU_MPI_INC_POSTED_REQUESTS(1);
+	req->srcdst = -1;
+	req->mpi_tag = -1;
+	req->comm = 0;
 
-	/* Initialize the request structure */
-	req->submitted = 0;
-	req->completed = 0;
+	req->func = NULL;
+
+	req->status = NULL;
+	req->request = NULL;
+	req->flag = NULL;
+
+	req->ret = -1;
 	STARPU_PTHREAD_MUTEX_INIT(&req->req_mutex, NULL);
 	STARPU_PTHREAD_COND_INIT(&req->req_cond, NULL);
+	STARPU_PTHREAD_MUTEX_INIT(&req->posted_mutex, NULL);
+	STARPU_PTHREAD_COND_INIT(&req->posted_cond, NULL);
 
-	req->request_type = request_type;
-	req->user_datatype = -1;
-	req->count = -1;
-	req->data_handle = data_handle;
-	req->srcdst = srcdst;
-	req->mpi_tag = mpi_tag;
-	req->comm = comm;
+	req->request_type = UNKNOWN_REQ;
 
-	req->detached = detached;
-	req->callback = callback;
-	req->callback_arg = arg;
+	req->submitted = 0;
+	req->completed = 0;
+	req->posted = 0;
 
-	req->func = func;
+	req->other_request = NULL;
 
-	/* Asynchronously request StarPU to fetch the data in main memory: when
-	 * it is available in main memory, _starpu_mpi_submit_new_mpi_request(req) is called and
-	 * the request is actually submitted */
-	starpu_data_acquire_cb(data_handle, mode, _starpu_mpi_submit_new_mpi_request, (void *)req);
+	req->detached = -1;
+	req->callback = NULL;
+	req->callback_arg = NULL;
 
-	_STARPU_MPI_LOG_OUT();
-	return req;
-}
+	req->size_req = NULL;
+	req->internal_req = NULL;
+	req->is_internal_req = 0;
+	req->envelope = NULL;
+ }
 
-/********************************************************/
-/*                                                      */
-/*  Send functionalities                                */
-/*                                                      */
-/********************************************************/
+ /********************************************************/
+ /*                                                      */
+ /*  Send/Receive functionalities                        */
+ /*                                                      */
+ /********************************************************/
 
-static void _starpu_mpi_isend_data_func(struct _starpu_mpi_req *req)
-{
-	_STARPU_MPI_LOG_IN();
+ static struct _starpu_mpi_req *_starpu_mpi_isend_irecv_common(starpu_data_handle_t data_handle,
+							       int srcdst, int mpi_tag, MPI_Comm comm,
+							       unsigned detached, void (*callback)(void *), void *arg,
+							       enum _starpu_mpi_request_type request_type, void (*func)(struct _starpu_mpi_req *),
+							       enum starpu_data_access_mode mode)
+ {
+
+	 _STARPU_MPI_LOG_IN();
+	 struct _starpu_mpi_req *req = malloc(sizeof(struct _starpu_mpi_req));
+	 STARPU_ASSERT_MSG(req, "Invalid request");
+
+	 _STARPU_MPI_INC_POSTED_REQUESTS(1);
+
+	 /* Initialize the request structure */
+	 _starpu_mpi_request_init(req);
+	 req->request_type = request_type;
+	 req->data_handle = data_handle;
+	 req->srcdst = srcdst;
+	 req->mpi_tag = mpi_tag;
+	 req->comm = comm;
+	 req->detached = detached;
+	 req->callback = callback;
+	 req->callback_arg = arg;
+	 req->func = func;
+
+	 /* Asynchronously request StarPU to fetch the data in main memory: when
+	  * it is available in main memory, _starpu_mpi_submit_new_mpi_request(req) is called and
+	  * the request is actually submitted */
+	 starpu_data_acquire_cb(data_handle, mode, _starpu_mpi_submit_new_mpi_request, (void *)req);
+
+	 _STARPU_MPI_LOG_OUT();
+	 return req;
+ }
 
-	STARPU_ASSERT_MSG(req->ptr, "Pointer containing data to send is invalid");
+ /********************************************************/
+ /*                                                      */
+ /*  Send functionalities                                */
+ /*                                                      */
+ /********************************************************/
 
-	_STARPU_MPI_DEBUG(2, "post MPI isend request %p type %s tag %d src %d data %p datasize %ld ptr %p datatype '%s' count %d user_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, starpu_data_get_size(req->data_handle), req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
+ static void _starpu_mpi_isend_data_func(struct _starpu_mpi_req *req)
+ {
+	 _STARPU_MPI_LOG_IN();
 
-	_starpu_mpi_comm_amounts_inc(req->comm, req->srcdst, req->datatype, req->count);
+	 STARPU_ASSERT_MSG(req->ptr, "Pointer containing data to send is invalid");
 
-	TRACE_MPI_ISEND_SUBMIT_BEGIN(req->srcdst, req->mpi_tag, 0);
+	 _STARPU_MPI_DEBUG(2, "post MPI isend request %p type %s tag %d src %d data %p datasize %ld ptr %p datatype '%s' count %d user_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, starpu_data_get_size(req->data_handle), req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
 
-	req->ret = MPI_Isend(req->ptr, req->count, req->datatype, req->srcdst, _starpu_mpi_tag, req->comm, &req->request);
-	STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Isend returning %d", req->ret);
+	 _starpu_mpi_comm_amounts_inc(req->comm, req->srcdst, req->datatype, req->count);
 
-	TRACE_MPI_ISEND_SUBMIT_END(req->srcdst, req->mpi_tag, 0);
+	 TRACE_MPI_ISEND_SUBMIT_BEGIN(req->srcdst, req->mpi_tag, 0);
 
-	/* somebody is perhaps waiting for the MPI request to be posted */
-	STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
-	req->submitted = 1;
-	STARPU_PTHREAD_COND_BROADCAST(&req->req_cond);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
+	 req->ret = MPI_Isend(req->ptr, req->count, req->datatype, req->srcdst, _starpu_mpi_tag, req->comm, &req->request);
+	 STARPU_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Isend returning %d", req->ret);
 
-	_starpu_mpi_handle_detached_request(req);
+	 TRACE_MPI_ISEND_SUBMIT_END(req->srcdst, req->mpi_tag, 0);
 
-	_STARPU_MPI_LOG_OUT();
-}
+	 /* somebody is perhaps waiting for the MPI request to be posted */
+	 STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
+	 req->submitted = 1;
+	 STARPU_PTHREAD_COND_BROADCAST(&req->req_cond);
+	 STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
 
-static void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req)
-{
-	_starpu_mpi_handle_allocate_datatype(req->data_handle, &req->datatype, &req->user_datatype);
+	 _starpu_mpi_handle_detached_request(req);
+
+	 _STARPU_MPI_LOG_OUT();
+ }
 
-	struct _starpu_mpi_envelope* env = calloc(1,sizeof(struct _starpu_mpi_envelope));
+ static void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req)
+ {
+	_starpu_mpi_handle_allocate_datatype(req->data_handle, &req->datatype, &req->user_datatype);
 
-	env->mpi_tag = req->mpi_tag;
+	req->envelope = calloc(1,sizeof(struct _starpu_mpi_envelope));
+	req->envelope->mpi_tag = req->mpi_tag;
 
 	if (req->user_datatype == 0)
 	{
 		req->count = 1;
 		req->ptr = starpu_data_get_local_ptr(req->data_handle);
 
-		env->psize = (ssize_t)req->count;
+		req->envelope->psize = (ssize_t)req->count;
 
 		_STARPU_MPI_DEBUG(1, "Post MPI isend count (%ld) datatype_size %ld request to %d with tag %d\n",req->count,starpu_data_get_size(req->data_handle),req->srcdst, _starpu_mpi_tag);
-		MPI_Isend(env, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->srcdst, _starpu_mpi_tag, req->comm, &req->size_req);
+		MPI_Isend(req->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->srcdst, _starpu_mpi_tag, req->comm, &req->size_req);
 	}
 	else
 	{
 		int ret;
 
  		// Do not pack the data, just try to find out the size
-		starpu_data_pack(req->data_handle, NULL, &(env->psize));
+		starpu_data_pack(req->data_handle, NULL, &(req->envelope->psize));
 
-		if (env->psize != -1)
+		if (req->envelope->psize != -1)
  		{
  			// We already know the size of the data, let's send it to overlap with the packing of the data
-			_STARPU_MPI_DEBUG(1, "Sending size %ld (%ld %s) with tag %d to node %d (first call to pack)\n", env->psize, sizeof(req->count), _starpu_mpi_datatype(MPI_BYTE), _starpu_mpi_tag, req->srcdst);
-			req->count = env->psize;
-			ret = MPI_Isend(env, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->srcdst, _starpu_mpi_tag, req->comm, &req->size_req);
+			_STARPU_MPI_DEBUG(1, "Sending size %ld (%ld %s) with tag %d to node %d (first call to pack)\n", req->envelope->psize, sizeof(req->count), _starpu_mpi_datatype(MPI_BYTE), _starpu_mpi_tag, req->srcdst);
+			req->count = req->envelope->psize;
+			ret = MPI_Isend(req->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->srcdst, _starpu_mpi_tag, req->comm, &req->size_req);
 			STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "when sending size, MPI_Isend returning %d", ret);
  		}
 
  		// Pack the data
  		starpu_data_pack(req->data_handle, &req->ptr, &req->count);
-		if (env->psize == -1)
+		if (req->envelope->psize == -1)
  		{
  			// We know the size now, let's send it
-			_STARPU_MPI_DEBUG(1, "Sending size %ld (%ld %s) with tag %d to node %d (second call to pack)\n", env->psize, sizeof(req->count), _starpu_mpi_datatype(MPI_BYTE), _starpu_mpi_tag, req->srcdst);
-			ret = MPI_Isend(env, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->srcdst, _starpu_mpi_tag, req->comm, &req->size_req);
+			_STARPU_MPI_DEBUG(1, "Sending size %ld (%ld %s) with tag %d to node %d (second call to pack)\n", req->envelope->psize, sizeof(req->count), _starpu_mpi_datatype(MPI_BYTE), _starpu_mpi_tag, req->srcdst);
+			ret = MPI_Isend(req->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->srcdst, _starpu_mpi_tag, req->comm, &req->size_req);
 			STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "when sending size, MPI_Isend returning %d", ret);
  		}
  		else
  		{
  			// We check the size returned with the 2 calls to pack is the same
-			STARPU_ASSERT_MSG(req->count == env->psize, "Calls to pack_data returned different sizes %ld != %ld", req->count, env->psize);
+			STARPU_ASSERT_MSG(req->count == req->envelope->psize, "Calls to pack_data returned different sizes %ld != %ld", req->count, req->envelope->psize);
  		}
 		// We can send the data now
 	}
@@ -400,6 +430,13 @@ int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_re
 	_STARPU_MPI_LOG_IN();
 	STARPU_ASSERT_MSG(public_req, "starpu_mpi_irecv needs a valid starpu_mpi_req");
 
+	// If no tag is defined for the data handle yet, we set it to
+	// the one given for this communication.
+	// A tag is necessary for the internal mpi engine.
+	int tag = starpu_data_get_tag(data_handle);
+	if (tag == -1)
+		starpu_data_set_tag(data_handle, mpi_tag);
+
 	struct _starpu_mpi_req *req;
 	req = _starpu_mpi_irecv_common(data_handle, source, mpi_tag, comm, 0, NULL, NULL);
 
@@ -413,7 +450,16 @@ int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_re
 int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
 {
 	_STARPU_MPI_LOG_IN();
+
+	// If no tag is defined for the data handle yet, we set it to
+	// the one given for this communication.
+	// A tag is necessary for the internal mpi engine.
+	int tag = starpu_data_get_tag(data_handle);
+	if (tag == -1)
+		starpu_data_set_tag(data_handle, mpi_tag);
+
 	_starpu_mpi_irecv_common(data_handle, source, mpi_tag, comm, 1, callback, arg);
+
 	_STARPU_MPI_LOG_OUT();
 	return 0;
 }
@@ -421,8 +467,15 @@ int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, int
 int starpu_mpi_recv(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, MPI_Status *status)
 {
 	starpu_mpi_req req;
-
 	_STARPU_MPI_LOG_IN();
+
+	// If no tag is defined for the data handle yet, we set it to
+	// the one given for this communication.
+	// A tag is necessary for the internal mpi engine.
+	int tag = starpu_data_get_tag(data_handle);
+	if (tag == -1)
+		starpu_data_set_tag(data_handle, mpi_tag);
+
 	starpu_mpi_irecv(data_handle, &req, source, mpi_tag, comm);
 	starpu_mpi_wait(&req, status);
 
@@ -457,8 +510,11 @@ int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 {
 	_STARPU_MPI_LOG_IN();
 	int ret;
-	struct _starpu_mpi_req *waiting_req = calloc(1, sizeof(struct _starpu_mpi_req));
+
+	struct _starpu_mpi_req *waiting_req = malloc(sizeof(struct _starpu_mpi_req));
+	_starpu_mpi_request_init(waiting_req);
 	STARPU_ASSERT_MSG(waiting_req, "Allocation failed");
+
 	struct _starpu_mpi_req *req = *public_req;
 
 	_STARPU_MPI_INC_POSTED_REQUESTS(1);
@@ -549,9 +605,9 @@ int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 
 	if (submitted)
 	{
-		struct _starpu_mpi_req *testing_req = calloc(1, sizeof(struct _starpu_mpi_req));
+		struct _starpu_mpi_req *testing_req = malloc(sizeof(struct _starpu_mpi_req));
 		STARPU_ASSERT_MSG(testing_req, "allocation failed");
-		//		memset(testing_req, 0, sizeof(struct _starpu_mpi_req));
+		_starpu_mpi_request_init(testing_req);
 
 		/* Initialize the request structure */
 		STARPU_PTHREAD_MUTEX_INIT(&(testing_req->req_mutex), NULL);
@@ -615,8 +671,9 @@ int starpu_mpi_barrier(MPI_Comm comm)
 {
 	_STARPU_MPI_LOG_IN();
 	int ret;
-	struct _starpu_mpi_req *barrier_req = calloc(1, sizeof(struct _starpu_mpi_req));
+	struct _starpu_mpi_req *barrier_req = malloc(sizeof(struct _starpu_mpi_req));
 	STARPU_ASSERT_MSG(barrier_req, "allocation failed");
+	_starpu_mpi_request_init(barrier_req);
 
 	/* First wait for *both* all tasks and MPI requests to finish, in case
 	 * some tasks generate MPI requests, MPI requests generate tasks, etc.
@@ -681,6 +738,7 @@ static char *_starpu_mpi_request_type(enum _starpu_mpi_request_type request_type
 		case WAIT_REQ: return "WAIT_REQ";
 		case TEST_REQ: return "TEST_REQ";
 		case BARRIER_REQ: return "BARRIER_REQ";
+		case UNKNOWN_REQ: return "UNSET_REQ";
 		default: return "unknown request type";
 		}
 }
@@ -725,12 +783,25 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 			}
 			else
 			{
+				_STARPU_MPI_DEBUG(3, "NOT deleting chandle %p from hashmap (tag %d %d)\n", chandle, req->mpi_tag, starpu_data_get_tag(req->data_handle));
 				_starpu_mpi_handle_free_datatype(req->data_handle, &req->datatype);
 			}
 		}
 		starpu_data_release(req->data_handle);
 	}
 
+	if (req->envelope)
+	{
+		free(req->envelope);
+		req->envelope = NULL;
+	}
+
+	if (req->internal_req)
+	{
+		free(req->internal_req);
+		req->internal_req = NULL;
+	}
+
 	/* Execute the specified callback, if any */
 	if (req->callback)
 		req->callback(req->callback_arg);
@@ -755,6 +826,11 @@ static void _starpu_mpi_copy_cb(void* arg)
 {
 	struct _starpu_mpi_copy_cb_args *args = arg;
 
+	// We store in the application request the internal MPI
+	// request so that it can be used by starpu_mpi_wait
+	args->req->request = args->req->internal_req->request;
+	args->req->submitted = 1;
+
 	struct starpu_data_interface_ops *itf = starpu_data_get_interface_ops(args->copy_handle);
 	void* itf_src = starpu_data_get_interface_on_node(args->copy_handle,0);
 	void* itf_dst = starpu_data_get_interface_on_node(args->data_handle,0);
@@ -777,7 +853,11 @@ static void _starpu_mpi_copy_cb(void* arg)
 	starpu_data_unregister_submit(args->copy_handle);
 
 	_STARPU_MPI_DEBUG(3, "Done, handling request %p termination of the already received request\n",args->req);
-	_starpu_mpi_handle_request_termination(args->req);
+	if (args->req->detached)
+		_starpu_mpi_handle_request_termination(args->req);
+	// else: if the request is not detached, its termination will
+	// be handled when starpu_mpi_wait is called
+
 
 	free(args);
 }
@@ -789,6 +869,8 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 
 	_STARPU_MPI_INC_POSTED_REQUESTS(-1);
 
+	_STARPU_MPI_DEBUG(3, "calling _starpu_mpi_submit_new_mpi_request with req %p tag %d and type %s\n", req, req->mpi_tag, _starpu_mpi_request_type(req->request_type));
+
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 
 	if (req->request_type == RECV_REQ)
@@ -804,6 +886,8 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 		{
 			_STARPU_MPI_DEBUG(3, "The RECV request %p with tag %d has already been received, copying previously received data into handle's pointer..\n", req, req->mpi_tag);
 
+			req->internal_req = chandle->req;
+
 			struct _starpu_mpi_copy_cb_args *cb_args = malloc(sizeof(struct _starpu_mpi_copy_cb_args));
 			cb_args->data_handle = req->data_handle;
 			cb_args->copy_handle = chandle->handle;
@@ -835,9 +919,16 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 					STARPU_ASSERT_MSG(req->ptr, "cannot allocate message of size %ld\n", req->count);
 				}
 
+				_STARPU_MPI_DEBUG(3, "Pushing internal starpu_mpi_irecv request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
 				_starpu_mpi_req_list_push_front(new_requests, req);
 
-				_STARPU_MPI_DEBUG(3, "Pushing internal starpu_mpi_irecv request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
+				/* inform the starpu mpi thread that the request has been pushed in the new_requests list */
+				STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+				STARPU_PTHREAD_MUTEX_LOCK(&req->posted_mutex);
+				req->posted = 1;
+				STARPU_PTHREAD_COND_BROADCAST(&req->posted_cond);
+				STARPU_PTHREAD_MUTEX_UNLOCK(&req->posted_mutex);
+				STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 			}
 			/* Case : a classic receive request with no send received earlier than expected.
 			 * We just add the pending receive request to the requests' hashmap. */
@@ -931,7 +1022,8 @@ static void _starpu_mpi_test_detached_requests(void)
 		if (flag)
 		{
 			_starpu_mpi_req_list_erase(detached_requests, req);
-			free(req);
+			if (!req->is_internal_req)
+				free(req);
 		}
 
 	}
@@ -1041,13 +1133,11 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
  	struct _starpu_mpi_envelope *recv_env = calloc(1,sizeof(struct _starpu_mpi_envelope));
 
- 	MPI_Request header_req;
  	int header_req_submitted = 0;
 
 	while (running || posted_requests || !(_starpu_mpi_req_list_empty(new_requests)) || !(_starpu_mpi_req_list_empty(detached_requests)))
 	{
 		/* shall we block ? */
-		_STARPU_MPI_DEBUG(3, "HASH_COUNT(_starpu_mpi_req_hashmap) = %d\n",HASH_COUNT(_starpu_mpi_req_hashmap));
 		unsigned block = _starpu_mpi_req_list_empty(new_requests) && (HASH_COUNT(_starpu_mpi_req_hashmap) == 0);
 
 #ifndef STARPU_MPI_ACTIVITY
@@ -1085,11 +1175,11 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
 		/* If there is no currently submitted header_req submitted to catch envelopes from senders, and there is some pending receive
 		 * requests in our side, we resubmit a header request. */
-		if ((HASH_COUNT(_starpu_mpi_req_hashmap) > 0) && (header_req_submitted == 0) && (HASH_COUNT(_starpu_mpi_copy_handle_hashmap) == 0))
+		MPI_Request header_req;
+		if ((HASH_COUNT(_starpu_mpi_req_hashmap) > 0) && (header_req_submitted == 0))// && (HASH_COUNT(_starpu_mpi_copy_handle_hashmap) == 0))
 		{
+			_STARPU_MPI_DEBUG(3, "Posting a receive to get a data envelop\n");
 			MPI_Irecv(recv_env, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, MPI_ANY_SOURCE, _starpu_mpi_tag, MPI_COMM_WORLD, &header_req);
-
-			_STARPU_MPI_DEBUG(3, "Submit of header_req OK!\n");
 			header_req_submitted = 1;
 		}
 
@@ -1102,7 +1192,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 		{
 			int flag,res;
 			MPI_Status status;
-			_STARPU_MPI_DEBUG(3, "Test of header_req\n");
+			_STARPU_MPI_DEBUG(4, "Test of header_req\n");
 
 			/* test whether an envelope has arrived. */
 			res = MPI_Test(&header_req, &flag, &status);
@@ -1110,9 +1200,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
 			if (flag)
 			{
-				_STARPU_MPI_DEBUG(3, "header_req received !\n");
-
-				_STARPU_MPI_DEBUG(3, "Searching for request with tag %d, size %ld ..\n",recv_env->mpi_tag, recv_env->psize);
+				_STARPU_MPI_DEBUG(3, "Searching for request with tag %d (size %ld)\n", recv_env->mpi_tag, recv_env->psize);
 
 				struct _starpu_mpi_req *found_req = find_req(recv_env->mpi_tag);
 
@@ -1127,7 +1215,9 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
 					while(!(data_handle))
 					{
+						STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 						data_handle = starpu_data_get_data_handle_from_tag(recv_env->mpi_tag);
+						STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 					}
 					STARPU_ASSERT(data_handle);
 
@@ -1139,12 +1229,21 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 					starpu_data_register_same(&chandle->handle, data_handle);
 					add_chandle(chandle);
 
-					_STARPU_MPI_DEBUG(3, "Posting internal starpu_irecv_detached on copy_handle with tag %d from src %d ..\n", chandle->mpi_tag, status.MPI_SOURCE);
-
-					res = starpu_mpi_irecv_detached(chandle->handle,status.MPI_SOURCE,chandle->mpi_tag,MPI_COMM_WORLD,NULL,NULL);
-					STARPU_ASSERT(res == MPI_SUCCESS);
+					_STARPU_MPI_DEBUG(3, "Posting internal detached irecv on copy_handle with tag %d from src %d ..\n", chandle->mpi_tag, status.MPI_SOURCE);
+					chandle->req = _starpu_mpi_irecv_common(chandle->handle, status.MPI_SOURCE, chandle->mpi_tag, MPI_COMM_WORLD, 1, NULL, NULL);
+					chandle->req->is_internal_req = 1;
 
-					_STARPU_MPI_DEBUG(3, "Success of starpu_irecv_detached on copy_handle with tag %d from src %d ..\n", chandle->mpi_tag, status.MPI_SOURCE);
+					// We wait until the request is pushed in the
+					// new_requests list; that ensures that the next loop
+					// will call _starpu_mpi_handle_new_request
+					// on the request and post the corresponding mpi_irecv,
+					// otherwise we might read the data as an envelope
+					STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+					STARPU_PTHREAD_MUTEX_LOCK(&(chandle->req->posted_mutex));
+					while (!(chandle->req->posted))
+					     STARPU_PTHREAD_COND_WAIT(&(chandle->req->posted_cond), &(chandle->req->posted_mutex));
+					STARPU_PTHREAD_MUTEX_UNLOCK(&(chandle->req->posted_mutex));
+					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 				}
 				/* Case : a matching receive has been found for the incoming data, we handle the correct allocation of the pointer associated to
 				 * the data handle, then submit the corresponding receive with _starpu_mpi_handle_new_request. */
@@ -1181,7 +1280,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 			}
 			else
 			{
-				_STARPU_MPI_DEBUG(3, "Nothing received, continue ..\n");
+				_STARPU_MPI_DEBUG(4, "Nothing received, continue ..\n");
 			}
 		}
 	}

+ 1 - 0
mpi/src/starpu_mpi_datatype.c

@@ -227,5 +227,6 @@ char *_starpu_mpi_datatype(MPI_Datatype datatype)
      if (datatype == MPI_INTEGER4) return "MPI_INTEGER4";
      if (datatype == MPI_INTEGER8) return "MPI_INTEGER8";
      if (datatype == MPI_PACKED) return "MPI_PACKED";
+     if (datatype == 0) return "Unknown datatype";
      return "User defined MPI Datatype";
 }

+ 9 - 3
mpi/src/starpu_mpi_private.c

@@ -18,12 +18,18 @@
 #include <starpu_mpi_private.h>
 
 int _debug_rank=-1;
-int _debug_level=0;
+int _debug_level_min=0;
+int _debug_level_max=0;
 int _starpu_mpi_tag = 42;
 
-void _starpu_mpi_set_debug_level(int level)
+void _starpu_mpi_set_debug_level_min(int level)
 {
-	_debug_level = level;
+	_debug_level_min = level;
+}
+
+void _starpu_mpi_set_debug_level_max(int level)
+{
+	_debug_level_max = level;
 }
 
 int starpu_mpi_get_communication_tag(void)

+ 28 - 8
mpi/src/starpu_mpi_private.h

@@ -31,18 +31,20 @@ extern "C" {
 
 #ifdef STARPU_VERBOSE
 extern int _debug_rank;
-extern int _debug_level;
-void _starpu_mpi_set_debug_level(int level);
+extern int _debug_level_min;
+extern int _debug_level_max;
+void _starpu_mpi_set_debug_level_min(int level);
+void _starpu_mpi_set_debug_level_max(int level);
 #endif
 
 #ifdef STARPU_VERBOSE
 #  define _STARPU_MPI_DEBUG(level, fmt, ...) \
 	do \
 	{								\
-		if (!getenv("STARPU_SILENT") && level <= _debug_level)	\
+		if (!getenv("STARPU_SILENT") && _debug_level_min <= level && level <= _debug_level_max)	\
 		{							\
 			if (_debug_rank == -1) MPI_Comm_rank(MPI_COMM_WORLD, &_debug_rank); \
-			fprintf(stderr, "%*s[%d][starpu_mpi][%s] " fmt , (_debug_rank+1)*4, "", _debug_rank, __starpu_func__ ,## __VA_ARGS__); \
+			fprintf(stderr, "%*s[%d][starpu_mpi][%s:%d] " fmt , (_debug_rank+1)*4, "", _debug_rank, __starpu_func__ , __LINE__,## __VA_ARGS__); \
 			fflush(stderr); \
 		}			\
 	} while(0);
@@ -52,17 +54,17 @@ void _starpu_mpi_set_debug_level(int level);
 
 #define _STARPU_MPI_DISP(fmt, ...) do { if (!getenv("STARPU_SILENT")) { \
 	       				     if (_debug_rank == -1) MPI_Comm_rank(MPI_COMM_WORLD, &_debug_rank); \
-                                             fprintf(stderr, "%*s[%d][starpu_mpi][%s] " fmt , (_debug_rank+1)*4, "", _debug_rank, __starpu_func__ ,## __VA_ARGS__); \
+                                             fprintf(stderr, "%*s[%d][starpu_mpi][%s:%d] " fmt , (_debug_rank+1)*4, "", _debug_rank, __starpu_func__ , __LINE__ ,## __VA_ARGS__); \
                                              fflush(stderr); }} while(0);
 
 #ifdef STARPU_VERBOSE0
 #  define _STARPU_MPI_LOG_IN()             do { if (!getenv("STARPU_SILENT")) { \
                                                if (_debug_rank == -1) MPI_Comm_rank(MPI_COMM_WORLD, &_debug_rank);                        \
-                                               fprintf(stderr, "%*s[%d][starpu_mpi][%s] -->\n", (_debug_rank+1)*4, "", _debug_rank, __starpu_func__ ); \
+                                               fprintf(stderr, "%*s[%d][starpu_mpi][%s:%d] -->\n", (_debug_rank+1)*4, "", _debug_rank, __starpu_func__ , __LINE__); \
                                                fflush(stderr); }} while(0)
 #  define _STARPU_MPI_LOG_OUT()            do { if (!getenv("STARPU_SILENT")) { \
                                                if (_debug_rank == -1) MPI_Comm_rank(MPI_COMM_WORLD, &_debug_rank);                        \
-                                               fprintf(stderr, "%*s[%d][starpu_mpi][%s] <--\n", (_debug_rank+1)*4, "", _debug_rank, __starpu_func__ ); \
+                                               fprintf(stderr, "%*s[%d][starpu_mpi][%s:%d] <--\n", (_debug_rank+1)*4, "", _debug_rank, __starpu_func__, __LINE__ ); \
                                                fflush(stderr); }} while(0)
 #else
 #  define _STARPU_MPI_LOG_IN()
@@ -78,9 +80,18 @@ enum _starpu_mpi_request_type
 	WAIT_REQ=2,
 	TEST_REQ=3,
 	BARRIER_REQ=4,
-	PROBE_REQ=5
+	PROBE_REQ=5,
+	UNKNOWN_REQ=6,
 };
 
+struct _starpu_mpi_envelope
+{
+	ssize_t psize;
+	int mpi_tag;
+};
+
+struct _starpu_mpi_req;
+
 LIST_TYPE(_starpu_mpi_req,
 	/* description of the data at StarPU level */
 	starpu_data_handle_t data_handle;
@@ -106,10 +117,14 @@ LIST_TYPE(_starpu_mpi_req,
 	starpu_pthread_mutex_t req_mutex;
 	starpu_pthread_cond_t req_cond;
 
+	starpu_pthread_mutex_t posted_mutex;
+	starpu_pthread_cond_t posted_cond;
+
 	enum _starpu_mpi_request_type request_type; /* 0 send, 1 recv */
 
 	unsigned submitted;
 	unsigned completed;
+	unsigned posted;
 
 	UT_hash_handle hh;
 
@@ -124,6 +139,11 @@ LIST_TYPE(_starpu_mpi_req,
 
         /* in the case of user-defined datatypes, we need to send the size of the data */
 	MPI_Request size_req;
+
+        struct _starpu_mpi_envelope* envelope;
+
+	int is_internal_req;
+	struct _starpu_mpi_req *internal_req;
 );
 
 #ifdef __cplusplus

+ 12 - 0
mpi/tests/Makefile.am

@@ -80,10 +80,13 @@ starpu_mpi_TESTS =				\
 	pingpong				\
 	mpi_test				\
 	mpi_isend				\
+	mpi_earlyrecv				\
+	mpi_earlyrecv2				\
 	mpi_irecv				\
 	mpi_isend_detached			\
 	mpi_irecv_detached			\
 	mpi_detached_tag			\
+	mpi_redux				\
 	ring					\
 	ring_async				\
 	ring_async_implicit			\
@@ -104,10 +107,13 @@ noinst_PROGRAMS =				\
 	pingpong				\
 	mpi_test				\
 	mpi_isend				\
+	mpi_earlyrecv				\
+	mpi_earlyrecv2				\
 	mpi_irecv				\
 	mpi_isend_detached			\
 	mpi_irecv_detached			\
 	mpi_detached_tag			\
+	mpi_redux				\
 	ring					\
 	ring_async				\
 	ring_async_implicit			\
@@ -126,6 +132,10 @@ noinst_PROGRAMS =				\
 
 mpi_isend_LDADD =					\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+mpi_earlyrecv_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+mpi_earlyrecv2_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 mpi_irecv_LDADD =					\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 mpi_isend_detached_LDADD =			\
@@ -134,6 +144,8 @@ mpi_irecv_detached_LDADD =			\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 mpi_detached_tag_LDADD =				\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+mpi_redux_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 pingpong_LDADD =					\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 mpi_test_LDADD =					\

+ 102 - 0
mpi/tests/mpi_earlyrecv.c

@@ -0,0 +1,102 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+#include <unistd.h>
+
+//#define NB 1000
+#define NB 10
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size, i, nb_requests;
+	starpu_data_handle_t tab_handle[NB];
+	starpu_mpi_req request[NB];
+
+	MPI_Init(NULL, NULL);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size%2 != 0)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need a even number of processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	for(i=0 ; i<NB ; i++)
+	{
+		starpu_variable_data_register(&tab_handle[i], 0, (uintptr_t)&rank, sizeof(int));
+		starpu_data_set_tag(tab_handle[i], i);
+		request[i] = NULL;
+	}
+
+	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
+
+	fprintf(stderr, "rank %d exchanging with rank %d\n", rank, other_rank);
+
+	if (rank%2)
+	{
+		starpu_mpi_isend(tab_handle[0], &request[0], other_rank, 0, MPI_COMM_WORLD);
+		starpu_mpi_recv(tab_handle[2], other_rank, 2, MPI_COMM_WORLD, NULL);
+		starpu_mpi_isend(tab_handle[1], &request[1], other_rank, 1, MPI_COMM_WORLD);
+		nb_requests = 2;
+	}
+	else
+	{
+		starpu_mpi_irecv(tab_handle[0], &request[0], other_rank, 0, MPI_COMM_WORLD);
+		starpu_mpi_irecv(tab_handle[1], &request[1], other_rank, 1, MPI_COMM_WORLD);
+		starpu_mpi_isend(tab_handle[2], &request[2], other_rank, 2, MPI_COMM_WORLD);
+		nb_requests = 3;
+	}
+
+	int finished=0;
+	while (!finished)
+	{
+		for(i=0 ; i<nb_requests ; i++)
+		{
+			if (request[i])
+			{
+				int flag;
+				MPI_Status status;
+				starpu_mpi_test(&request[i], &flag, &status);
+				if (flag)
+					fprintf(stderr, "request[%d] = %d %p\n", i, flag, request[i]);
+			}
+		}
+		finished = request[0] == NULL;
+		for(i=1 ; i<nb_requests ; i++) finished = finished && request[i] == NULL;
+	}
+
+	for(i=0 ; i<NB ; i++)
+		starpu_data_unregister(tab_handle[i]);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 94 - 0
mpi/tests/mpi_earlyrecv2.c

@@ -0,0 +1,94 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+#include <unistd.h>
+
+//#define NB 1000
+#define NB 10
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size, i;
+	starpu_data_handle_t tab_handle[NB];
+
+	MPI_Init(NULL, NULL);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size%2 != 0)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need a even number of processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	for(i=0 ; i<NB ; i++)
+	{
+		starpu_variable_data_register(&tab_handle[i], 0, (uintptr_t)&rank, sizeof(int));
+		starpu_data_set_tag(tab_handle[i], i);
+	}
+
+	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
+
+	if (rank%2)
+	{
+		starpu_mpi_send(tab_handle[0], other_rank, 0, MPI_COMM_WORLD);
+		starpu_mpi_send(tab_handle[NB-1], other_rank, NB-1, MPI_COMM_WORLD);
+		for(i=1 ; i<NB-1 ; i++)
+		{
+			starpu_mpi_send(tab_handle[i], other_rank, i, MPI_COMM_WORLD);
+		}
+	}
+	else
+	{
+		starpu_mpi_req req[NB];
+		memset(req, 0, NB*sizeof(starpu_mpi_req));
+
+		starpu_mpi_irecv(tab_handle[0], &req[0], other_rank, 0, MPI_COMM_WORLD);
+		STARPU_ASSERT(req[0] != NULL);
+		// We sleep to make sure that the data for the last tag (NB-1) will be received before the matching recv is posted
+		usleep(2000000);
+		for(i=1 ; i<NB ; i++)
+		{
+			starpu_mpi_irecv(tab_handle[i], &req[i], other_rank, i, MPI_COMM_WORLD);
+			STARPU_ASSERT(req[i] != NULL);
+		}
+		for(i=0 ; i<NB ; i++)
+		{
+			starpu_mpi_wait(&req[i], NULL);
+		}
+	}
+
+	for(i=0 ; i<NB ; i++)
+		starpu_data_unregister(tab_handle[i]);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 107 - 0
mpi/tests/mpi_redux.c

@@ -0,0 +1,107 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+static starpu_pthread_mutex_t mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
+static starpu_pthread_cond_t cond = STARPU_PTHREAD_COND_INITIALIZER;
+
+void callback(void *arg)
+{
+	unsigned *received = arg;
+
+	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	*received = *received + 1;
+	fprintf(stderr, "received = %d\n", *received);
+	STARPU_PTHREAD_COND_SIGNAL(&cond);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+}
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size, sum;
+	int value=0;
+	starpu_data_handle_t *handles;
+
+	MPI_Init(NULL, NULL);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	sum = ((size-1) * (size) / 2);
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	if (rank == 0)
+	{
+		int src;
+		int received = 1;
+
+		handles = malloc(size * sizeof(starpu_data_handle_t));
+
+		for(src=1 ; src<size ; src++)
+		{
+			starpu_variable_data_register(&handles[src], -1, (uintptr_t)NULL, sizeof(int));
+			starpu_mpi_irecv_detached(handles[src], src, 12+src, MPI_COMM_WORLD, callback, &received);
+		}
+
+		STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+		while (received != size)
+			STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+
+		for(src=1 ; src<size ; src++)
+		{
+			void *ptr = starpu_data_get_local_ptr(handles[src]);
+			value += *((int *)ptr);
+			starpu_data_unregister(handles[src]);
+		}
+
+		for(src=1 ; src<size ; src++)
+		{
+			starpu_variable_data_register(&handles[src], 0, (uintptr_t)&sum, sizeof(int));
+			starpu_mpi_send(handles[src], src, 12+src, MPI_COMM_WORLD);
+			starpu_data_unregister(handles[src]);
+		}
+	}
+	else
+	{
+		value = rank;
+		handles = malloc(sizeof(starpu_data_handle_t));
+		starpu_variable_data_register(&handles[0], 0, (uintptr_t)&value, sizeof(int));
+		starpu_mpi_send(handles[0], 0, 12+rank, MPI_COMM_WORLD);
+		starpu_data_unregister_submit(handles[0]);
+
+		starpu_variable_data_register(&handles[0], 0, (uintptr_t)&value, sizeof(int));
+		starpu_mpi_recv(handles[0], 0, 12+rank, MPI_COMM_WORLD, NULL);
+		starpu_data_unregister(handles[0]);
+	}
+
+	starpu_task_wait_for_all();
+	free(handles);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	STARPU_ASSERT_MSG(sum == value, "Sum of first %d integers is %d, not %d\n", size-1, sum, value);
+
+	return 0;
+}

+ 6 - 3
sc_hypervisor/examples/Makefile.am

@@ -18,12 +18,14 @@ LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(top_builddir)/sc_hypervisor
 AM_CPPFLAGS = -I$(top_srcdir)/include -I$(top_srcdir)/examples -I$(top_builddir)/include -I$(top_srcdir)/sc_hypervisor/include -I$(top_srcdir)/sc_hypervisor/examples
 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_GLPK_LDFLAGS)
 
-if !NO_BLAS_LIB
 noinst_PROGRAMS =				\
-	cholesky/cholesky_implicit  		\
 	app_driven_test/app_driven_test		\
 	lp_test/lp_test
 
+if !NO_BLAS_LIB
+noinst_PROGRAMS +=				\
+	cholesky/cholesky_implicit  		
+
 noinst_HEADERS = 				\
 	cholesky/cholesky.h			\
 	sched_ctx_utils/sched_ctx_utils.h
@@ -42,13 +44,14 @@ cholesky_cholesky_implicit_LDADD =		\
 	$(top_builddir)/sc_hypervisor/src/libsc_hypervisor.la \
 	$(STARPU_BLAS_LDFLAGS)
 
+endif
+
 app_driven_test_app_driven_test_SOURCES =		\
 	app_driven_test/app_driven_test.c
 
 app_driven_test_app_driven_test_LDADD =		\
 	$(top_builddir)/sc_hypervisor/src/libsc_hypervisor.la
 
-endif
 
 showcheck:
 	-cat $(TEST_LOGS) /dev/null

+ 3 - 0
sc_hypervisor/include/sc_hypervisor_monitoring.h

@@ -50,6 +50,9 @@ struct sc_hypervisor_wrapper
 	/* idle time of workers in this context */
 	double current_idle_time[STARPU_NMAXWORKERS];
 	
+	double idle_time[STARPU_NMAXWORKERS];
+	double idle_start_time[STARPU_NMAXWORKERS];
+	
 	/* list of workers that will leave this contexts (lazy resizing process) */
 	int worker_to_be_removed[STARPU_NMAXWORKERS];
 

+ 13 - 1
sc_hypervisor/include/sc_hypervisor_policy.h

@@ -1,4 +1,4 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
+/* StarPUf --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2012  INRIA
  *
@@ -27,6 +27,9 @@ extern "C"
 
 #define HYPERVISOR_REDIM_SAMPLE 0.02
 #define HYPERVISOR_START_REDIM_SAMPLE 0.1
+#define SC_NOTHING 0
+#define SC_IDLE 1
+#define SC_VELOCITY 2
 
 struct sc_hypervisor_policy_task_pool
 {
@@ -94,6 +97,15 @@ void sc_hypervisor_group_workers_by_type(int *workers, int nworkers, int ntypes_
 /* check if we trigger resizing or not */
 unsigned sc_hypervisor_criteria_fulfilled(unsigned sched_ctx, int worker);
 
+/* check if worker was idle long enough */
+unsigned sc_hypervisor_check_idle(unsigned sched_ctx, int worker);
+
+/* check if there is a velocity gap between contexts */
+unsigned sc_hypervisor_check_velocity_gap_btw_ctxs(void);
+
+/* check what triggers resizing (idle, velocity, etc.) */
+unsigned sc_hypervisor_get_resize_criteria();
+
 #ifdef __cplusplus
 }
 #endif

+ 1 - 0
sc_hypervisor/src/Makefile.am

@@ -25,6 +25,7 @@ libsc_hypervisor_la_SOURCES = 				\
 	sc_hypervisor.c					\
 	sc_config.c					\
 	policies_utils/policy_tools.c			\
+	policies_utils/speed.c				\
 	policies_utils/task_pool.c			\
 	policies_utils/lp_tools.c			\
 	policies_utils/lp_programs.c			\

+ 67 - 38
sc_hypervisor/src/hypervisor_policies/feft_lp_policy.c

@@ -15,51 +15,63 @@
  */
 
 #include "sc_hypervisor_lp.h"
+#include "sc_hypervisor_policy.h"
 #include <starpu_config.h>
 #include <sys/time.h>
 
 #ifdef STARPU_HAVE_GLPK_H
-static void feft_lp_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, uint32_t footprint)
+static void _try_resizing(void)
 {
-	if(sc_hypervisor_criteria_fulfilled(sched_ctx, worker))
-	{
-		int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();
+	/* for vite */
+	starpu_trace_user_event(2);
 
-		double nworkers[nsched_ctxs][2];
-
-		int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
-		if(ret != EBUSY)
-		{
-			int nw = 1;
+	int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();
+	double nworkers[nsched_ctxs][2];
+	int nw = 1;
 #ifdef STARPU_USE_CUDA
-			int ncuda = starpu_worker_get_count_by_type(STARPU_CUDA_WORKER);
-			nw = ncuda != 0 ? 2 : 1;
+	int ncuda = starpu_worker_get_count_by_type(STARPU_CUDA_WORKER);
+	nw = ncuda != 0 ? 2 : 1;
 #endif
-			int total_nw[nw];
-			sc_hypervisor_group_workers_by_type(NULL, -1, nw, total_nw);
-
-
-			struct timeval start_time;
-			struct timeval end_time;
-			gettimeofday(&start_time, NULL);
-
-			double vmax = sc_hypervisor_lp_get_nworkers_per_ctx(nsched_ctxs, nw, nworkers, total_nw);
-			gettimeofday(&end_time, NULL);
-
-			long diff_s = end_time.tv_sec  - start_time.tv_sec;
-			long diff_us = end_time.tv_usec  - start_time.tv_usec;
-
-			float timing = (float)(diff_s*1000000 + diff_us)/1000;
-
-			if(vmax != 0.0)
+	int total_nw[nw];
+	sc_hypervisor_group_workers_by_type(NULL, -1, nw, total_nw);
+	
+	
+	struct timeval start_time;
+	struct timeval end_time;
+	gettimeofday(&start_time, NULL);
+	
+	double vmax = sc_hypervisor_lp_get_nworkers_per_ctx(nsched_ctxs, nw, nworkers, total_nw);
+	gettimeofday(&end_time, NULL);
+	
+	long diff_s = end_time.tv_sec  - start_time.tv_sec;
+	long diff_us = end_time.tv_usec  - start_time.tv_usec;
+	
+	float timing = (float)(diff_s*1000000 + diff_us)/1000;
+	
+	if(vmax != 0.0)
+	{
+		int nworkers_rounded[nsched_ctxs][nw];
+		sc_hypervisor_lp_round_double_to_int(nsched_ctxs, nw, nworkers, nworkers_rounded);
+		sc_hypervisor_lp_redistribute_resources_in_ctxs(nsched_ctxs, nw, nworkers_rounded, nworkers);
+	}
+	
+}
+static void feft_lp_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, uint32_t footprint)
+{
+	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
+	if(ret != EBUSY)
+	{
+		unsigned criteria = sc_hypervisor_get_resize_criteria();
+		if(criteria != SC_NOTHING && criteria == SC_VELOCITY)
+		{
+			if(sc_hypervisor_check_velocity_gap_btw_ctxs())
 			{
-				int nworkers_rounded[nsched_ctxs][nw];
-				sc_hypervisor_lp_round_double_to_int(nsched_ctxs, nw, nworkers, nworkers_rounded);
-				sc_hypervisor_lp_redistribute_resources_in_ctxs(nsched_ctxs, nw, nworkers_rounded, nworkers);
+				_try_resizing();
 			}
-			starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 		}
+		starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 	}
+
 }
 static void feft_lp_size_ctxs(int *sched_ctxs, int ns, int *workers, int nworkers)
 {
@@ -99,15 +111,13 @@ static void feft_lp_size_ctxs(int *sched_ctxs, int ns, int *workers, int nworker
 /* 				printf("ctx %d/worker type %d: n = %d \n", i, 1, nworkers_per_type_rounded[i][1]); */
 /* #endif */
 /* 		} */
-		int *current_sched_ctxs = sched_ctxs == NULL ? sc_hypervisor_get_sched_ctxs() : 
-			sched_ctxs;
+		int *current_sched_ctxs = sched_ctxs == NULL ? sc_hypervisor_get_sched_ctxs() : sched_ctxs;
 
 		unsigned has_workers = 0;
 		int s;
 		for(s = 0; s < ns; s++)
 		{
-			int nworkers_ctx = sc_hypervisor_get_nworkers_ctx(current_sched_ctxs[s], 
-									     STARPU_ANY_WORKER);
+			int nworkers_ctx = sc_hypervisor_get_nworkers_ctx(current_sched_ctxs[s], STARPU_ANY_WORKER);
 			if(nworkers_ctx != 0)
 			{
 				has_workers = 1;
@@ -122,11 +132,30 @@ static void feft_lp_size_ctxs(int *sched_ctxs, int ns, int *workers, int nworker
 	starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 }
 
+static feft_lp_handle_idle_cycle(unsigned sched_ctx, int worker)
+{
+	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
+	if(ret != EBUSY)
+	{
+		unsigned criteria = sc_hypervisor_get_resize_criteria();
+		if(criteria != SC_NOTHING && criteria == SC_IDLE)
+		{
+			
+			if(sc_hypervisor_check_idle(sched_ctx, worker))
+			{
+				_try_resizing();
+//				sc_hypervisor_move_workers(sched_ctx, 3 - sched_ctx, &worker, 1, 1);
+			}
+		}
+		starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
+	}
+}
+
 struct sc_hypervisor_policy feft_lp_policy = {
 	.size_ctxs = feft_lp_size_ctxs,
 	.handle_poped_task = feft_lp_handle_poped_task,
 	.handle_pushed_task = NULL,
-	.handle_idle_cycle = NULL,
+	.handle_idle_cycle = feft_lp_handle_idle_cycle, //NULL,
 	.handle_idle_end = NULL,
 	.handle_post_exec_hook = NULL,
 	.handle_submitted_job = NULL,

+ 0 - 1
sc_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c

@@ -346,7 +346,6 @@ static void ispeed_lp_handle_poped_task(unsigned sched_ctx, int worker, struct s
 			for(i = 0; i < ns; i++)
 				flops_on_w[i] = (double*)malloc(nw*sizeof(double));
 
-			printf("ns = %d nw = %d\n", ns, nw);
 			unsigned found_sol = _compute_flops_distribution_over_ctxs(ns, nw,  w_in_s, flops_on_w, NULL, NULL);
 			/* if we did find at least one solution redistribute the resources */
 			if(found_sol)

+ 104 - 61
sc_hypervisor/src/hypervisor_policies/teft_lp_policy.c

@@ -135,12 +135,16 @@ static void size_if_required()
 		for(s = 0; s < nsched_ctxs; s++)
 		{
 			sc_w = sc_hypervisor_get_wrapper(sched_ctxs[s]);
-			if(sc_w->submitted_flops < sc_w->total_flops)
+//			if(sc_w->submitted_flops < sc_w->total_flops)
+			if((sc_w->submitted_flops + (0.1*sc_w->total_flops)) < sc_w->total_flops)
 				ready_to_size = 0;
 		}
 
 		if(ready_to_size)
+		{
 			_size_ctxs(sched_ctxs, nsched_ctxs, workers, nworkers);
+			sc_hypervisor_free_size_req();
+		}
 		starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 	}
 }
@@ -155,6 +159,69 @@ static void teft_lp_handle_submitted_job(struct starpu_codelet *cl, unsigned sch
 	size_if_required();
 }
 
+static void _try_resizing(void)
+{
+	starpu_trace_user_event(2);
+	int ns = sc_hypervisor_get_nsched_ctxs();
+	int nw = starpu_worker_get_count(); /* Number of different workers */
+	int nt = 0; /* Number of different kinds of tasks */
+	
+//			starpu_pthread_mutex_lock(&mutex);
+	
+	/* we don't take the mutex because a correct value of the number of tasks
+	   is not required, but we do copy the list in order to be sure
+	   that the linear program won't segfault if the list of
+	   submitted tasks changes during execution */
+	
+	struct sc_hypervisor_policy_task_pool *tp = NULL;
+	struct sc_hypervisor_policy_task_pool *tmp_task_pools = sc_hypervisor_policy_clone_task_pool(task_pools);
+	
+	for (tp = task_pools; tp; tp = tp->next)
+		nt++;
+	
+	
+	double w_in_s[ns][nw];
+//			double tasks_per_worker[nw][nt];
+	double **tasks_per_worker=(double**)malloc(nw*sizeof(double*));
+	int i;
+	for(i = 0; i < nw; i++)
+		tasks_per_worker[i] = (double*)malloc(nt*sizeof(double));
+	
+	struct teft_lp_data specific_data;
+	specific_data.nt = nt;
+	specific_data.tasks = tasks_per_worker;
+	specific_data.in_sched_ctxs = NULL;
+	specific_data.workers = NULL;
+	specific_data.tmp_task_pools = tmp_task_pools;
+	specific_data.size_ctxs = 0;
+
+			/* smallest possible tmax, difficult to obtain as we
+			   compute the nr of flops and not the tasks */
+	double possible_tmax = sc_hypervisor_lp_get_tmax(nw, NULL);
+	double smallest_tmax = possible_tmax / 3;
+	double tmax = possible_tmax * ns;
+	double tmin = smallest_tmax;
+	unsigned found_sol = sc_hypervisor_lp_execute_dichotomy(ns, nw, w_in_s, 1, (void*)&specific_data, 
+								tmin, tmax, smallest_tmax, _compute_workers_distrib);
+//			starpu_pthread_mutex_unlock(&mutex);
+	
+	/* if we did find at least one solution redistribute the resources */
+	if(found_sol)
+		sc_hypervisor_lp_place_resources_in_ctx(ns, nw, w_in_s, NULL, NULL, 0);
+	
+	struct sc_hypervisor_policy_task_pool *next = NULL;
+	struct sc_hypervisor_policy_task_pool *tmp_tp = tmp_task_pools;
+	while(tmp_task_pools)
+	{
+		next = tmp_tp->next;
+		free(tmp_tp);
+		tmp_tp = next;
+		tmp_task_pools = next;
+	}
+	for(i = 0; i < nw; i++)
+		free(tasks_per_worker[i]);
+	free(tasks_per_worker);
+}
 static void teft_lp_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, uint32_t footprint)
 {
 	struct sc_hypervisor_wrapper* sc_w = sc_hypervisor_get_wrapper(sched_ctx);
@@ -162,74 +229,22 @@ static void teft_lp_handle_poped_task(unsigned sched_ctx, int worker, struct sta
 	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
 	if(ret != EBUSY)
 	{
-		if(sc_w->submitted_flops < sc_w->total_flops)
+		if((sc_w->submitted_flops + (0.1*sc_w->total_flops)) < sc_w->total_flops)
 		{
 			starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 			return;
 		}
 
-		if(sc_hypervisor_criteria_fulfilled(sched_ctx, worker))
+		unsigned criteria = sc_hypervisor_get_resize_criteria();
+		if(criteria != SC_NOTHING && criteria == SC_VELOCITY)
 		{
-			int ns = sc_hypervisor_get_nsched_ctxs();
-			int nw = starpu_worker_get_count(); /* Number of different workers */
-			int nt = 0; /* Number of different kinds of tasks */
-
-//			starpu_pthread_mutex_lock(&mutex);
-
-			/* we don't take the mutex bc a correct value of the number of tasks is
-			   not required but we do a copy in order to be sure
-			   that the linear progr won't segfault if the list of 
-			   submitted task will change during the exec */
-
-			struct sc_hypervisor_policy_task_pool *tp = NULL;
-			struct sc_hypervisor_policy_task_pool *tmp_task_pools = sc_hypervisor_policy_clone_task_pool(task_pools);
-
-			for (tp = task_pools; tp; tp = tp->next)
-				nt++;
-
-
-			double w_in_s[ns][nw];
-//			double tasks_per_worker[nw][nt];
-			double **tasks_per_worker=(double**)malloc(nw*sizeof(double*));
-			int i;
-			for(i = 0; i < nw; i++)
-				tasks_per_worker[i] = (double*)malloc(nt*sizeof(double));
-
-			struct teft_lp_data specific_data;
-			specific_data.nt = nt;
-			specific_data.tasks = tasks_per_worker;
-			specific_data.in_sched_ctxs = NULL;
-			specific_data.workers = NULL;
-			specific_data.tmp_task_pools = tmp_task_pools;
-			specific_data.size_ctxs = 0;
-
-			/* smallest possible tmax, difficult to obtain as we
-			   compute the nr of flops and not the tasks */
-			double possible_tmax = sc_hypervisor_lp_get_tmax(nw, NULL);
-			double smallest_tmax = possible_tmax / 3;
-			double tmax = possible_tmax * ns;
-			double tmin = smallest_tmax;
-			unsigned found_sol = sc_hypervisor_lp_execute_dichotomy(ns, nw, w_in_s, 1, (void*)&specific_data, 
-								tmin, tmax, smallest_tmax, _compute_workers_distrib);
-//			starpu_pthread_mutex_unlock(&mutex);
-
-			/* if we did find at least one solution redistribute the resources */
-			if(found_sol)
-				sc_hypervisor_lp_place_resources_in_ctx(ns, nw, w_in_s, NULL, NULL, 0);
-
-			struct sc_hypervisor_policy_task_pool *next = NULL;
-			struct sc_hypervisor_policy_task_pool *tmp_tp = tmp_task_pools;
-			while(tmp_task_pools)
+			
+			if(sc_hypervisor_check_velocity_gap_btw_ctxs())
 			{
-				next = tmp_tp->next;
-				free(tmp_tp);
-				tmp_tp = next;
-				tmp_task_pools = next;
+				_try_resizing();
 			}
-			for(i = 0; i < nw; i++)
-				free(tasks_per_worker[i]);
-			free(tasks_per_worker);
 		}
+
 		starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 	}
 	/* too expensive to take this mutex and correct value of the number of tasks is not compulsory */
@@ -239,6 +254,34 @@ static void teft_lp_handle_poped_task(unsigned sched_ctx, int worker, struct sta
 
 }
 
+static int teft_lp_handle_idle_cycle(unsigned sched_ctx, int worker)
+{
+	struct sc_hypervisor_wrapper* sc_w = sc_hypervisor_get_wrapper(sched_ctx);
+
+	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
+	if(ret != EBUSY)
+	{
+		if((sc_w->submitted_flops + (0.1*sc_w->total_flops)) < sc_w->total_flops)
+		{
+			starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
+			return;
+		}
+
+
+		unsigned criteria = sc_hypervisor_get_resize_criteria();
+		if(criteria != SC_NOTHING && criteria == SC_IDLE)
+		{
+			
+			if(sc_hypervisor_check_idle(sched_ctx, worker))
+			{
+				_try_resizing();
+//				sc_hypervisor_move_workers(sched_ctx, 3 - sched_ctx, &worker, 1, 1);
+			}
+		}
+		starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
+	}
+	return 0;
+}
 
 static void teft_lp_size_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
 {
@@ -249,7 +292,7 @@ struct sc_hypervisor_policy teft_lp_policy = {
 	.size_ctxs = teft_lp_size_ctxs,
 	.handle_poped_task = teft_lp_handle_poped_task,
 	.handle_pushed_task = NULL,
-	.handle_idle_cycle = NULL,
+	.handle_idle_cycle = teft_lp_handle_idle_cycle,
 	.handle_idle_end = NULL,
 	.handle_post_exec_hook = NULL,
 	.handle_submitted_job = teft_lp_handle_submitted_job,

+ 18 - 2
sc_hypervisor/src/policies_utils/lp_tools.c

@@ -17,6 +17,7 @@
 #include <math.h>
 #include "sc_hypervisor_lp.h"
 #include "sc_hypervisor_policy.h"
+#include "sc_hypervisor_intern.h"
 #include <starpu_config.h>
 
 #ifdef STARPU_HAVE_GLPK_H
@@ -48,11 +49,26 @@ double sc_hypervisor_lp_get_nworkers_per_ctx(int nsched_ctxs, int ntypes_of_work
 #else
 		v[i][0] = sc_hypervisor_get_velocity(sc_w, STARPU_CPU_WORKER);
 #endif // STARPU_USE_CUDA
-		flops[i] = sc_w->remaining_flops/1000000000; //sc_w->total_flops/1000000000; /* in gflops*/
+		
+		flops[i] = sc_w->remaining_flops < 0.0 ? 0.0 : sc_w->remaining_flops/1000000000; //sc_w->total_flops/1000000000; /* in gflops*/
 //		printf("%d: flops %lf\n", sched_ctxs[i], flops[i]);
 	}
 
-	return 1/sc_hypervisor_lp_simulate_distrib_flops(nsched_ctxs, ntypes_of_workers, v, flops, res, total_nw);
+	double vmax = 1/sc_hypervisor_lp_simulate_distrib_flops(nsched_ctxs, ntypes_of_workers, v, flops, res, total_nw);
+	double optimal_v = 0.0;
+	for(i = 0; i < nsched_ctxs; i++)
+	{
+#ifdef STARPU_USE_CUDA
+		optimal_v = res[i][0] * v[i][0] + res[i][1]* v[i][1];
+#else
+		optimal_v = res[i][0] * v[i][0];
+#endif //STARPU_USE_CUDA
+//				printf("%d: set opt %lf\n", i, optimal_v[i]);
+		if(optimal_v != 0.0)
+			_set_optimal_v(i, optimal_v);
+	}
+
+	return vmax;
 #else//STARPU_HAVE_GLPK_H
 	return 0.0;
 #endif//STARPU_HAVE_GLPK_H

+ 120 - 178
sc_hypervisor/src/policies_utils/policy_tools.c

@@ -16,6 +16,7 @@
 
 #include "sc_hypervisor_policy.h"
 #include "sc_hypervisor_intern.h"
+#include "sc_hypervisor_lp.h"
 #include <math.h>
 
 static int _compute_priority(unsigned sched_ctx)
@@ -348,32 +349,6 @@ static double _get_ispeed_sample_for_sched_ctx(unsigned sched_ctx)
 	return ispeed_sample;
 }
 
-double sc_hypervisor_get_ctx_velocity(struct sc_hypervisor_wrapper* sc_w)
-{
-	struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sc_w->sched_ctx);
-        double elapsed_flops = sc_hypervisor_get_elapsed_flops_per_sched_ctx(sc_w);
-	double sample = _get_ispeed_sample_for_sched_ctx(sc_w->sched_ctx);
-
-/* 	double total_elapsed_flops = sc_hypervisor_get_total_elapsed_flops_per_sched_ctx(sc_w); */
-/* 	double prc = config->ispeed_ctx_sample != 0.0 ? elapsed_flops : elapsed_flops/sc_w->total_flops; */
-/* 	double redim_sample = config->ispeed_ctx_sample != 0.0 ? config->ispeed_ctx_sample :  */
-/* 		(elapsed_flops == total_elapsed_flops ? HYPERVISOR_START_REDIM_SAMPLE : HYPERVISOR_REDIM_SAMPLE); */
-//	printf("%d: prc %lf sample %lf\n", sc_w->sched_ctx, prc, redim_sample);
-
-/* 	double curr_time2 = starpu_timing_now(); */
-/* 	double elapsed_time2 = (curr_time2 - sc_w->start_time) / 1000000.0; /\* in seconds *\/ */
-/* 	if(elapsed_time2 > 5.0 && elapsed_flops < sample) */
-/* 		return (elapsed_flops/1000000000.0)/elapsed_time2;/\* in Gflops/s *\/ */
-
-	if(elapsed_flops >= sample)
-        {
-                double curr_time = starpu_timing_now();
-                double elapsed_time = (curr_time - sc_w->start_time) / 1000000.0; /* in seconds */
-                return (elapsed_flops/1000000000.0)/elapsed_time;/* in Gflops/s */
-        }
-	return -1.0;
-}
-
 double sc_hypervisor_get_slowest_ctx_exec_time(void)
 {
 	int *sched_ctxs = sc_hypervisor_get_sched_ctxs();
@@ -423,150 +398,6 @@ double sc_hypervisor_get_fastest_ctx_exec_time(void)
 	return fastest_time;
 }
 
-
-double sc_hypervisor_get_velocity_per_worker(struct sc_hypervisor_wrapper *sc_w, unsigned worker)
-{
-	if(!starpu_sched_ctx_contains_worker(worker, sc_w->sched_ctx))
-		return -1.0;
-
-        double elapsed_flops = sc_w->elapsed_flops[worker] / 1000000000.0; /*in gflops */
-	size_t elapsed_data_used = sc_w->elapsed_data[worker];
-	int elapsed_tasks = sc_w->elapsed_tasks[worker];
-	struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sc_w->sched_ctx);
-	double sample = config->ispeed_w_sample[worker] / 1000000000.0; /*in gflops */
-
-	double ctx_elapsed_flops = sc_hypervisor_get_elapsed_flops_per_sched_ctx(sc_w);
-	double ctx_sample = config->ispeed_ctx_sample;
-	if(ctx_elapsed_flops > ctx_sample && elapsed_flops == 0.0)
-		return 0.00000000000001;
-
-/*         if( elapsed_flops >= sample) */
-/*         { */
-/*                 double curr_time = starpu_timing_now(); */
-/*                 double elapsed_time = (curr_time - sc_w->start_time) / 1000000.0; /\* in seconds *\/ */
-/* 		sc_w->ref_velocity[worker] = (elapsed_flops/elapsed_time); /\* in Gflops/s *\/ */
-/*                 return sc_w->ref_velocity[worker]; */
-/*         } */
-
-/*         return -1.0; */
-
-        if( elapsed_flops != 0.0)
-        {
-                double curr_time = starpu_timing_now();
-		size_t elapsed_data_used = sc_w->elapsed_data[worker];
-                double elapsed_time = (curr_time - sc_w->start_time) / 1000000.0; /* in seconds */
- 		enum starpu_worker_archtype arch = starpu_worker_get_type(worker);
-		if(arch == STARPU_CUDA_WORKER)
-		{
-/* 			unsigned worker_in_ctx = starpu_sched_ctx_contains_worker(worker, sc_w->sched_ctx); */
-/* 			if(!worker_in_ctx) */
-/* 			{ */
-
-/* 				double transfer_velocity = starpu_get_bandwidth_RAM_CUDA(worker); */
-/* 				elapsed_time +=  (elapsed_data_used / transfer_velocity) / 1000000 ; */
-/* 			} */
-			double latency = starpu_get_latency_RAM_CUDA(worker);
-//			printf("%d/%d: latency %lf elapsed_time before %lf ntasks %d\n", worker, sc_w->sched_ctx, latency, elapsed_time, elapsed_tasks);
-			elapsed_time += (elapsed_tasks * latency)/1000000;
-//			printf("elapsed time after %lf \n", elapsed_time);
-		}
-			
-                double vel  = (elapsed_flops/elapsed_time);/* in Gflops/s */
-		sc_w->ref_velocity[worker] = sc_w->ref_velocity[worker] > 1.0 ? (sc_w->ref_velocity[worker] + vel) / 2 : vel; 
-                return vel;
-        }
-
-        return 0.00000000000001;
-
-
-}
-
-static double _get_best_elapsed_flops(struct sc_hypervisor_wrapper* sc_w, int *npus, enum starpu_worker_archtype req_arch)
-{
-	double ret_val = 0.0;
-	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sc_w->sched_ctx);
-        int worker;
-
-	struct starpu_sched_ctx_iterator it;
-	if(workers->init_iterator)
-                workers->init_iterator(workers, &it);
-
-        while(workers->has_next(workers, &it))
-	{
-                worker = workers->get_next(workers, &it);
-                enum starpu_worker_archtype arch = starpu_worker_get_type(worker);
-                if(arch == req_arch)
-                {
-			if(sc_w->elapsed_flops[worker] > ret_val)
-				ret_val = sc_w->elapsed_flops[worker];
-			(*npus)++;
-                }
-        }
-
-	return ret_val;
-}
-
-/* compute an average value of the cpu/cuda velocity */
-double sc_hypervisor_get_velocity_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_worker_archtype arch)
-{
-        int npus = 0;
-        double elapsed_flops = _get_best_elapsed_flops(sc_w, &npus, arch) / 1000000000.0 ; /* in gflops */
-	if(npus == 0)
-		return -1.0; 
-
-        if( elapsed_flops != 0.0)
-        {
-                double curr_time = starpu_timing_now();
-                double elapsed_time = (curr_time - sc_w->start_time) / 1000000.0; /* in seconds */
-		double velocity = (elapsed_flops/elapsed_time); /* in Gflops/s */
-                return velocity;
-        }
-
-        return -1.0;
-}
-
-
-/* check if there is a big velocity gap between the contexts */
-unsigned _check_velocity_gap_btw_ctxs()
-{
-	int *sched_ctxs = sc_hypervisor_get_sched_ctxs();
-	int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();
-	int i = 0, j = 0;
-	struct sc_hypervisor_wrapper* sc_w;
-	struct sc_hypervisor_wrapper* other_sc_w;
-
-	for(i = 0; i < nsched_ctxs; i++)
-	{
-		sc_w = sc_hypervisor_get_wrapper(sched_ctxs[i]);
-		double ctx_v = sc_hypervisor_get_ctx_velocity(sc_w);
-		if(ctx_v != -1.0)
-		{
-			for(j = 0; j < nsched_ctxs; j++)
-			{
-				if(sched_ctxs[i] != sched_ctxs[j])
-				{
-					unsigned nworkers = starpu_sched_ctx_get_nworkers(sched_ctxs[j]);
-					if(nworkers == 0) 
-						return 1;
-
-					other_sc_w = sc_hypervisor_get_wrapper(sched_ctxs[j]);
-					double other_ctx_v = sc_hypervisor_get_ctx_velocity(other_sc_w);
-					if(other_ctx_v != -1.0)
-					{
-						double gap = ctx_v < other_ctx_v ? other_ctx_v / ctx_v : ctx_v / other_ctx_v ;
-//						if(gap > 1.5)
-						if(gap > _get_max_velocity_gap())
-							return 1;
-					}
-				}
-			}
-		}
-
-	}
-	return 0;
-}
-
-
 void sc_hypervisor_group_workers_by_type(int *workers, int nworkers, int ntypes_of_workers, int total_nw[ntypes_of_workers])
 {
 	int current_nworkers = workers == NULL ? starpu_worker_get_count() : nworkers;
@@ -629,32 +460,143 @@ void sc_hypervisor_get_tasks_times(int nw, int nt, double times[nw][nt], int *wo
         }
 }
 
-static unsigned _check_idle(unsigned sched_ctx, int worker)
+unsigned sc_hypervisor_check_idle(unsigned sched_ctx, int worker)
 {
 	struct sc_hypervisor_wrapper* sc_w = sc_hypervisor_get_wrapper(sched_ctx);
 	struct sc_hypervisor_policy_config *config = sc_w->config;
 	if(config != NULL)
 	{
-		int j;
-		for(j = 0; j < STARPU_NMAXWORKERS; j++)
+		if(sc_w->current_idle_time[worker] > config->max_idle[worker])
 		{
-			if(sc_w->current_idle_time[j] > config->max_idle[j])
-				return 1;
+			sc_w->current_idle_time[worker] = 0.0;
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+/* check if there is a big velocity gap between the contexts */
+unsigned sc_hypervisor_check_velocity_gap_btw_ctxs(void)
+{
+	int *sched_ctxs = sc_hypervisor_get_sched_ctxs();
+	int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();
+	int i = 0, j = 0;
+	struct sc_hypervisor_wrapper* sc_w;
+	struct sc_hypervisor_wrapper* other_sc_w;
+
+	
+	double optimal_v[nsched_ctxs];
+	unsigned has_opt_v = 1;
+	for(i = 0; i < nsched_ctxs; i++)
+	{
+		optimal_v[i] = _get_optimal_v(i);
+		if(optimal_v[i] == 0.0)
+		{
+			has_opt_v = 0;
+			break;
+		}
+	}
+
+	if(!has_opt_v)
+	{
+		int nw = 1;
+#ifdef STARPU_USE_CUDA
+		int ncuda = starpu_worker_get_count_by_type(STARPU_CUDA_WORKER);
+		nw = ncuda != 0 ? 2 : 1;
+#endif	
+		double nworkers_per_type[nsched_ctxs][nw];
+		int total_nw[nw];
+		for(i = 0; i < nw; i++)
+		{
+			for(j = 0; j < nsched_ctxs; j++)
+				nworkers_per_type[j][i] = 0.0;
+			total_nw[i] = 0;
+		}
+		sc_hypervisor_group_workers_by_type(NULL, -1, nw, total_nw);
+		
+		double vmax = sc_hypervisor_lp_get_nworkers_per_ctx(nsched_ctxs, nw, nworkers_per_type, total_nw);
+		
+		if(vmax != 0.0)
+		{
+			for(i = 0; i < nsched_ctxs; i++)
+			{
+				sc_w = sc_hypervisor_get_wrapper(sched_ctxs[i]);
+				double v[nw];
+				v[0] = sc_hypervisor_get_velocity(sc_w, STARPU_CUDA_WORKER);
+				v[1] = sc_hypervisor_get_velocity(sc_w, STARPU_CPU_WORKER);
+				
+				optimal_v[i] = nworkers_per_type[i][0] * v[0] + nworkers_per_type[i][1]* v[1];
+				_set_optimal_v(i, optimal_v[i]);
+			}
+			has_opt_v = 1;
 		}
 	}
 
+	if(has_opt_v)
+	{
+		for(i = 0; i < nsched_ctxs; i++)
+		{
+			sc_w = sc_hypervisor_get_wrapper(sched_ctxs[i]);
+			
+			double ctx_v = sc_hypervisor_get_ctx_velocity(sc_w);
+			if(ctx_v == -1.0)
+				return 0;
+		}
+
+		for(i = 0; i < nsched_ctxs; i++)
+		{
+			sc_w = sc_hypervisor_get_wrapper(sched_ctxs[i]);
+			
+			double ctx_v = sc_hypervisor_get_ctx_velocity(sc_w);
+			if(ctx_v != -1.0 && ((ctx_v < 0.8*optimal_v[i]) || ctx_v > 1.2*optimal_v[i])) 
+				return 1;
+		}
+	}
+	else
+	{
+		for(i = 0; i < nsched_ctxs; i++)
+		{
+			sc_w = sc_hypervisor_get_wrapper(sched_ctxs[i]);
+			double ctx_v = sc_hypervisor_get_ctx_velocity(sc_w);
+			if(ctx_v != -1.0)
+			{
+				for(j = 0; j < nsched_ctxs; j++)
+				{
+					if(sched_ctxs[i] != sched_ctxs[j])
+					{
+						unsigned nworkers = starpu_sched_ctx_get_nworkers(sched_ctxs[j]);
+						if(nworkers == 0)
+							return 1;
+						
+						other_sc_w = sc_hypervisor_get_wrapper(sched_ctxs[j]);
+						double other_ctx_v = sc_hypervisor_get_ctx_velocity(other_sc_w);
+						if(other_ctx_v != -1.0)
+						{
+							double gap = ctx_v < other_ctx_v ? other_ctx_v / ctx_v : ctx_v / other_ctx_v;
+							double max_vel = _get_max_velocity_gap();
+							if(gap > max_vel-1 && gap < max_vel+1)
+								return 1;
+						}
+					}
+				}
+			}
+			
+		}
+	}
 	return 0;
 }
 
+
 unsigned sc_hypervisor_criteria_fulfilled(unsigned sched_ctx, int worker)
 {
-	unsigned criteria = _get_resize_criteria();
+	unsigned criteria = sc_hypervisor_get_resize_criteria();
 	if(criteria != SC_NOTHING)
 	{
 		if(criteria == SC_IDLE)
-			return _check_idle(sched_ctx, worker);
+			return sc_hypervisor_check_idle(sched_ctx, worker);
 		else
-			return _check_velocity_gap_btw_ctxs();
+			return sc_hypervisor_check_velocity_gap_btw_ctxs();
 	}
 	else
 		return 0;

+ 168 - 0
sc_hypervisor/src/policies_utils/speed.c

@@ -0,0 +1,168 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2013  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "sc_hypervisor_policy.h"
+#include "sc_hypervisor_intern.h"
+#include <math.h>
+
+
+double sc_hypervisor_get_ctx_velocity(struct sc_hypervisor_wrapper* sc_w)
+{
+	struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sc_w->sched_ctx);
+        double elapsed_flops = sc_hypervisor_get_elapsed_flops_per_sched_ctx(sc_w);
+	double sample = config->ispeed_ctx_sample;
+	
+
+	double total_elapsed_flops = sc_hypervisor_get_total_elapsed_flops_per_sched_ctx(sc_w);
+	double total_flops = sc_w->total_flops;
+
+	char *start_sample_prc_char = getenv("SC_HYPERVISOR_START_RESIZE");
+	double start_sample_prc = start_sample_prc_char ? atof(start_sample_prc_char) : 0.0;
+	double start_sample = start_sample_prc > 0.0 ? (start_sample_prc / 100) * total_flops : sample;
+	double redim_sample = elapsed_flops == total_elapsed_flops ? (start_sample > 0.0 ? start_sample : sample) : sample;
+
+	if(elapsed_flops >= redim_sample)
+        {
+                double curr_time = starpu_timing_now();
+                double elapsed_time = (curr_time - sc_w->start_time) / 1000000.0; /* in seconds */
+                return (elapsed_flops/1000000000.0)/elapsed_time;/* in Gflops/s */
+        }
+	return -1.0;
+}
+
+double sc_hypervisor_get_velocity_per_worker(struct sc_hypervisor_wrapper *sc_w, unsigned worker)
+{
+	if(!starpu_sched_ctx_contains_worker(worker, sc_w->sched_ctx))
+		return -1.0;
+
+        double elapsed_flops = sc_w->elapsed_flops[worker] / 1000000000.0; /*in gflops */
+
+	struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sc_w->sched_ctx);
+	double sample = config->ispeed_w_sample[worker] / 1000000000.0; /*in gflops */
+
+	double ctx_elapsed_flops = sc_hypervisor_get_elapsed_flops_per_sched_ctx(sc_w);
+	double ctx_sample = config->ispeed_ctx_sample;
+	if(ctx_elapsed_flops > ctx_sample && elapsed_flops == 0.0)
+		return 0.00000000000001;
+
+
+        if( elapsed_flops > sample)
+        {
+                double curr_time = starpu_timing_now();
+                double elapsed_time = (curr_time - sc_w->start_time) / 1000000.0; /* in seconds */
+		elapsed_time -= sc_w->idle_time[worker];
+		sc_w->idle_time[worker] = 0.0;
+
+/* 		size_t elapsed_data_used = sc_w->elapsed_data[worker]; */
+/*  		enum starpu_worker_archtype arch = starpu_worker_get_type(worker); */
+/* 		if(arch == STARPU_CUDA_WORKER) */
+/* 		{ */
+/* /\* 			unsigned worker_in_ctx = starpu_sched_ctx_contains_worker(worker, sc_w->sched_ctx); *\/ */
+/* /\* 			if(!worker_in_ctx) *\/ */
+/* /\* 			{ *\/ */
+
+/* /\* 				double transfer_velocity = starpu_get_bandwidth_RAM_CUDA(worker); *\/ */
+/* /\* 				elapsed_time +=  (elapsed_data_used / transfer_velocity) / 1000000 ; *\/ */
+/* /\* 			} *\/ */
+/* 			double latency = starpu_get_latency_RAM_CUDA(worker); */
+/* //			printf("%d/%d: latency %lf elapsed_time before %lf ntasks %d\n", worker, sc_w->sched_ctx, latency, elapsed_time, elapsed_tasks); */
+/* 			elapsed_time += (elapsed_tasks * latency)/1000000; */
+/* //			printf("elapsed time after %lf \n", elapsed_time); */
+/* 		} */
+			
+                double vel  = (elapsed_flops/elapsed_time);/* in Gflops/s */
+//		printf("%d in ctx %d: vel %lf\n", worker, sc_w->sched_ctx, vel);
+		sc_w->ref_velocity[worker] = sc_w->ref_velocity[worker] > 1.0 ? (sc_w->ref_velocity[worker] + vel) / 2 : vel; 
+                return vel;
+        }
+
+        return -1.0;
+
+
+}
+
+
+/* compute an average value of the cpu/cuda velocity */
+double sc_hypervisor_get_velocity_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_worker_archtype arch)
+{
+	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sc_w->sched_ctx);
+        int worker;
+
+	struct starpu_sched_ctx_iterator it;
+	if(workers->init_iterator)
+                workers->init_iterator(workers, &it);
+
+	double velocity = 0.0;
+	unsigned nworkers = 0;
+        while(workers->has_next(workers, &it))
+	{
+                worker = workers->get_next(workers, &it);
+                enum starpu_worker_archtype req_arch = starpu_worker_get_type(worker);
+                if(arch == req_arch)
+                {
+			double _vel = sc_hypervisor_get_velocity_per_worker(sc_w, worker);
+			if(_vel == -1.0) return -1.0;
+			velocity += _vel;
+			nworkers++;
+		}
+	}
+			
+
+        return (nworkers != 0 ? velocity / nworkers : -1.0);
+}
+
+/* compute an average value of the cpu/cuda old velocity */
+double sc_hypervisor_get_ref_velocity_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_worker_archtype arch)
+{
+	double ref_velocity = 0.0;
+	unsigned nw = 0;
+
+	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sc_w->sched_ctx);
+	int worker;
+
+	struct starpu_sched_ctx_iterator it;
+	if(workers->init_iterator)
+		workers->init_iterator(workers, &it);
+
+	while(workers->has_next(workers, &it))
+	{
+		worker = workers->get_next(workers, &it);
+                enum starpu_worker_archtype req_arch = starpu_worker_get_type(worker);
+                if(arch == req_arch)
+                {
+		
+			if(sc_w->ref_velocity[worker] > 1.0)
+			{
+				ref_velocity += sc_w->ref_velocity[worker];
+				nw++;
+			}
+		}
+	}
+	
+	return (nw != 0 ? ref_velocity / nw : -1.0);
+}
+
+double sc_hypervisor_get_velocity(struct sc_hypervisor_wrapper *sc_w, enum starpu_worker_archtype arch)
+{
+
+	double velocity = sc_hypervisor_get_velocity_per_worker_type(sc_w, arch);
+	if(velocity == -1.0)
+		velocity = sc_hypervisor_get_ref_velocity_per_worker_type(sc_w, arch);
+	if(velocity == -1.0)
+		velocity = arch == STARPU_CPU_WORKER ? 5.0 : 100.0;
+       
+	return velocity;
+}

+ 30 - 87
sc_hypervisor/src/sc_hypervisor.c

@@ -15,6 +15,7 @@
  */
 
 #include <sc_hypervisor_intern.h>
+#include <sc_hypervisor_policy.h>
 #include <common/uthash.h>
 #include <starpu_config.h>
 
@@ -112,7 +113,7 @@ static struct sc_hypervisor_policy *_select_hypervisor_policy(struct sc_hypervis
 	}
 	else
 	{
-		policy_name = getenv("HYPERVISOR_POLICY");
+		policy_name = getenv("SC_HYPERVISOR_POLICY");
 	}
 
 	if (policy_name)
@@ -133,10 +134,10 @@ struct starpu_sched_ctx_performance_counters* sc_hypervisor_init(struct sc_hyper
 {
 	hypervisor.min_tasks = 0;
 	hypervisor.nsched_ctxs = 0;
-	char* vel_gap = getenv("MAX_VELOCITY_GAP");
+	char* vel_gap = getenv("SC_HYPERVISOR_MAX_VELOCITY_GAP");
 	hypervisor.max_velocity_gap = vel_gap ? atof(vel_gap) : SC_VELOCITY_MAX_GAP_DEFAULT;
-	char* crit =  getenv("HYPERVISOR_TRIGGER_RESIZE");
-	hypervisor.resize_criteria = strcmp(crit,"idle") == 0 ? SC_IDLE : (strcmp(crit,"speed") == 0 ? SC_SPEED : SC_NOTHING);
+	char* crit =  getenv("SC_HYPERVISOR_TRIGGER_RESIZE");
+	hypervisor.resize_criteria = !crit ? SC_IDLE : strcmp(crit,"idle") == 0 ? SC_IDLE : (strcmp(crit,"speed") == 0 ? SC_VELOCITY : SC_NOTHING);
 
 	starpu_pthread_mutex_init(&act_hypervisor_mutex, NULL);
 	hypervisor.start_executing_time = starpu_timing_now();
@@ -161,11 +162,14 @@ struct starpu_sched_ctx_performance_counters* sc_hypervisor_init(struct sc_hyper
 		hypervisor.sched_ctx_w[i].resize_ack.nmoved_workers = 0;
 		hypervisor.sched_ctx_w[i].resize_ack.acked_workers = NULL;
 		starpu_pthread_mutex_init(&hypervisor.sched_ctx_w[i].mutex, NULL);
+		hypervisor.optimal_v[i] = 0.0;
 
 		int j;
 		for(j = 0; j < STARPU_NMAXWORKERS; j++)
 		{
 			hypervisor.sched_ctx_w[i].current_idle_time[j] = 0.0;
+			hypervisor.sched_ctx_w[i].idle_time[j] = 0.0;
+			hypervisor.sched_ctx_w[i].idle_start_time[j] = 0.0;
 			hypervisor.sched_ctx_w[i].pushed_tasks[j] = 0;
 			hypervisor.sched_ctx_w[i].poped_tasks[j] = 0;
 			hypervisor.sched_ctx_w[i].elapsed_flops[j] = 0.0;
@@ -215,7 +219,7 @@ void sc_hypervisor_start_resize(unsigned sched_ctx)
 
 static void _print_current_time()
 {
-	if(!getenv("HYPERVISOR_STOP_PRINT"))
+	if(!getenv("SC_HYPERVISOR_STOP_PRINT"))
 	{
 		double curr_time = starpu_timing_now();
 		double elapsed_time = (curr_time - hypervisor.start_executing_time) / 1000000.0; /* in seconds */
@@ -347,88 +351,17 @@ void sc_hypervisor_unregister_ctx(unsigned sched_ctx)
 	starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 }
 
-static double _get_best_total_elapsed_flops(struct sc_hypervisor_wrapper* sc_w, int *npus, enum starpu_worker_archtype req_arch)
-{
-	double ret_val = 0.0;
-	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sc_w->sched_ctx);
-        int worker;
-
-	struct starpu_sched_ctx_iterator it;
-	if(workers->init_iterator)
-                workers->init_iterator(workers, &it);
-
-        while(workers->has_next(workers, &it))
-	{
-                worker = workers->get_next(workers, &it);
-                enum starpu_worker_archtype arch = starpu_worker_get_type(worker);
-                if(arch == req_arch)
-                {
-			if(sc_w->total_elapsed_flops[worker] > ret_val)
-				ret_val = sc_w->total_elapsed_flops[worker];
-			(*npus)++;
-                }
-        }
-
-	return ret_val;
-}
 
 double _get_max_velocity_gap()
 {
 	return hypervisor.max_velocity_gap;
 }
 
-unsigned _get_resize_criteria()
+unsigned sc_hypervisor_get_resize_criteria()
 {
 	return hypervisor.resize_criteria;
 }
 
-/* compute an average value of the cpu/cuda velocity */
-double sc_hypervisorsc_hypervisor_get_velocity_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_worker_archtype arch)
-{
-        int npus = 0;
-        double elapsed_flops = _get_best_total_elapsed_flops(sc_w, &npus, arch) / 1000000000.0 ; /* in gflops */
-	if(npus == 0)
-		return -1.0; 
-
-        if( elapsed_flops != 0.0)
-        {
-                double curr_time = starpu_timing_now();
-                double elapsed_time = (curr_time - sc_w->real_start_time) / 1000000.0; /* in seconds */
-		double velocity = (elapsed_flops/elapsed_time); /* in Gflops/s */
-                return velocity;
-        }
-
-        return -1.0;
-}
-
-/* compute an average value of the cpu/cuda old velocity */
-double sc_hypervisor_get_ref_velocity_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_worker_archtype arch)
-{
-	double ref_velocity = 0.0;
-	unsigned nw = 0;
-
-	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sc_w->sched_ctx);
-	int worker;
-
-	struct starpu_sched_ctx_iterator it;
-	if(workers->init_iterator)
-		workers->init_iterator(workers, &it);
-
-	while(workers->has_next(workers, &it))
-	{
-		worker = workers->get_next(workers, &it);
-		if(sc_w->ref_velocity[worker] > 1.0)
-		{
-			ref_velocity += sc_w->ref_velocity[worker];
-			nw++;
-		}
-	}
-	
-	if(nw > 0)
-		return ref_velocity / nw;
-	return -1.0;
-}
-
 static int get_ntasks( int *tasks)
 {
 	int ntasks = 0;
@@ -507,7 +440,6 @@ double sc_hypervisor_get_total_elapsed_flops_per_sched_ctx(struct sc_hypervisor_
 	return ret_val;
 }
 
-
 void _reset_resize_sample_info(unsigned sender_sched_ctx, unsigned receiver_sched_ctx)
 {
 	/* info concerning only the gflops_rate strateg */
@@ -534,7 +466,7 @@ void sc_hypervisor_move_workers(unsigned sender_sched_ctx, unsigned receiver_sch
 		for(j = 0; j < nworkers_to_move; j++)
 			printf(" %d", workers_to_move[j]);
 		printf("\n");
-
+		starpu_trace_user_event(1);
 		hypervisor.allow_remove[receiver_sched_ctx] = 0;
 		starpu_sched_ctx_add_workers(workers_to_move, nworkers_to_move, receiver_sched_ctx);
 
@@ -792,6 +724,15 @@ static void notify_idle_end(unsigned sched_ctx, int worker)
 	if(hypervisor.resize[sched_ctx])
 		hypervisor.sched_ctx_w[sched_ctx].current_idle_time[worker] = 0.0;
 
+	struct sc_hypervisor_wrapper *sc_w = &hypervisor.sched_ctx_w[sched_ctx];
+
+	if(sc_w->idle_start_time[worker] != 0.0)
+	{
+		double end_time  = starpu_timing_now();
+		sc_w->idle_time[worker] += (end_time - sc_w->idle_start_time[worker]) / 1000000.0; /* in seconds */ 
+		sc_w->idle_start_time[worker] = 0.0;
+	}
+
 	if(hypervisor.policy.handle_idle_end)
 		hypervisor.policy.handle_idle_end(sched_ctx, worker);
 
@@ -804,6 +745,10 @@ static void notify_idle_cycle(unsigned sched_ctx, int worker, double idle_time)
 	{
 		struct sc_hypervisor_wrapper *sc_w = &hypervisor.sched_ctx_w[sched_ctx];
 		sc_w->current_idle_time[worker] += idle_time;
+
+		if(sc_w->idle_start_time[worker] == 0.0)
+			sc_w->idle_start_time[worker] = starpu_timing_now();
+
 		if(hypervisor.policy.handle_idle_cycle)
 		{
 			hypervisor.policy.handle_idle_cycle(sched_ctx, worker);
@@ -993,14 +938,12 @@ void sc_hypervisor_free_size_req(void)
 	}
 }
 
-double sc_hypervisor_get_velocity(struct sc_hypervisor_wrapper *sc_w, enum starpu_worker_archtype arch)
+double _get_optimal_v(unsigned sched_ctx)
 {
+	return hypervisor.optimal_v[sched_ctx];
+}
 
-	double velocity = sc_hypervisorsc_hypervisor_get_velocity_per_worker_type(sc_w, arch);
-	if(velocity == -1.0)
-		velocity = sc_hypervisor_get_ref_velocity_per_worker_type(sc_w, arch);
-	if(velocity == -1.0)
-		velocity = arch == STARPU_CPU_WORKER ? 5.0 : 100.0;
-       
-	return velocity;
+void _set_optimal_v(unsigned sched_ctx, double optimal_v)
+{
+	hypervisor.optimal_v[sched_ctx] = optimal_v;
 }

+ 6 - 4
sc_hypervisor/src/sc_hypervisor_intern.h

@@ -18,9 +18,7 @@
 #include <common/uthash.h>
 
 #define SC_VELOCITY_MAX_GAP_DEFAULT 50
-#define SC_NOTHING 0
-#define SC_IDLE 1
-#define SC_SPEED 2
+
 struct size_request
 {
 	int *workers;
@@ -85,6 +83,9 @@ struct sc_hypervisor
 	
 	/* criteria to trigger resizing */
 	unsigned resize_criteria;
+
+	/* value of the speed to compare the speed of the context to */
+	double optimal_v[STARPU_NMAX_SCHED_CTXS];
 };
 
 struct sc_hypervisor_adjustment
@@ -102,4 +103,5 @@ void _remove_config(unsigned sched_ctx);
 
 double _get_max_velocity_gap();
 
-unsigned _get_resize_criteria();
+double _get_optimal_v(unsigned sched_ctx);
+void _set_optimal_v(unsigned sched_ctx, double optimal_v);

+ 3 - 0
socl/Makefile.am

@@ -17,6 +17,9 @@ SUBDIRS = src examples
 
 EXTRA_DIST = README
 
+SOCL_vendorsdir = @datarootdir@/starpu/opencl/vendors
+dist_SOCL_vendors_DATA = @SOCL_VENDORS@
+
 showcheck:
 	for i in $(SUBDIRS) ; do \
 		make -C $$i showcheck ; \

+ 1 - 0
socl/vendors/install/socl.icd.in

@@ -0,0 +1 @@
+@prefix@/lib/libsocl-@STARPU_EFFECTIVE_VERSION@.so

+ 1 - 1
src/core/combined_workers.c

@@ -34,7 +34,7 @@ static int compar_int(const void *pa, const void *pb)
 	int a = *((int *)pa);
 	int b = *((int *)pb);
 
-	return a > b;
+	return a - b;
 }
 
 static void sort_workerid_array(int nworkers, int workerid_array[])

+ 1 - 7
src/core/dependencies/implicit_data_deps.c

@@ -481,12 +481,8 @@ void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle)
 	struct _starpu_task_wrapper_list *post_sync_tasks = NULL;
 	unsigned do_submit_tasks = 0;
 
-	STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
-
-	if (handle->sequential_consistency)
+	if (handle->post_sync_tasks_cnt > 0)
 	{
-		STARPU_ASSERT(handle->post_sync_tasks_cnt > 0);
-
 		if (--handle->post_sync_tasks_cnt == 0)
 		{
 			/* unlock all tasks : we need not hold the lock while unlocking all these tasks */
@@ -496,8 +492,6 @@ void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle)
 		}
 	}
 
-	STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
-
 	if (do_submit_tasks)
 	{
 		struct _starpu_task_wrapper_list *link = post_sync_tasks;

+ 1 - 1
src/core/perfmodel/perfmodel_history.c

@@ -416,7 +416,7 @@ static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel *model, un
 	/* Dump the history into the model file in case it is necessary */
 	if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
 	{
-		fprintf(f, "# hash\t\tsize\t\tflops\t\tmean\t\tdev\t\tsum\t\tsum2\t\tn\n");
+		fprintf(f, "# hash\t\tsize\t\tflops\t\tmean (us)\t\tdev (us)\t\tsum\t\tsum2\t\tn\n");
 		ptr = per_arch_model->list;
 		while (ptr)
 		{

+ 1 - 1
src/core/perfmodel/perfmodel_print.c

@@ -28,7 +28,7 @@ void _starpu_perfmodel_print_history_based(struct starpu_perfmodel_per_arch *per
 	ptr = per_arch_model->list;
 
 	if (!parameter && ptr)
-		fprintf(output, "# hash\t\tsize\t\tflops\t\tmean\t\tstddev\t\tn\n");
+		fprintf(output, "# hash\t\tsize\t\tflops\t\tmean (us)\t\tstddev (us)\t\tn\n");
 
 	while (ptr)
 	{

+ 27 - 24
src/core/sched_policy.c

@@ -540,36 +540,39 @@ struct starpu_task *_starpu_create_conversion_task_for_arch(starpu_data_handle_t
 
 struct _starpu_sched_ctx* _get_next_sched_ctx_to_pop_into(struct _starpu_worker *worker)
 {
-	struct _starpu_sched_ctx *sched_ctx, *good_sched_ctx = NULL;
-	unsigned smallest_counter =  worker->nsched_ctxs;
-	unsigned i;
-	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
-	{
-		sched_ctx = worker->sched_ctx[i];
-
-		if(sched_ctx != NULL && sched_ctx->id != STARPU_NMAX_SCHED_CTXS && worker->removed_from_ctx[sched_ctx->id])
-			return sched_ctx;
-		if(sched_ctx != NULL && sched_ctx->id != STARPU_NMAX_SCHED_CTXS &&
-		   sched_ctx->pop_counter[worker->workerid] < worker->nsched_ctxs &&
-		   smallest_counter > sched_ctx->pop_counter[worker->workerid])
-		{
-			good_sched_ctx = sched_ctx;
-			smallest_counter = sched_ctx->pop_counter[worker->workerid];
-		}
-	}
-
-	if(good_sched_ctx == NULL)
+	while(1)
 	{
+		struct _starpu_sched_ctx *sched_ctx, *good_sched_ctx = NULL;
+		unsigned smallest_counter =  worker->nsched_ctxs;
+		unsigned i;
 		for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
 		{
 			sched_ctx = worker->sched_ctx[i];
-			if(sched_ctx != NULL && sched_ctx->id != STARPU_NMAX_SCHED_CTXS)
-				sched_ctx->pop_counter[worker->workerid] = 0;
+			
+			if(sched_ctx != NULL && sched_ctx->id != STARPU_NMAX_SCHED_CTXS && worker->removed_from_ctx[sched_ctx->id])
+				return sched_ctx;
+			if(sched_ctx != NULL && sched_ctx->id != STARPU_NMAX_SCHED_CTXS &&
+			   sched_ctx->pop_counter[worker->workerid] < worker->nsched_ctxs &&
+			   smallest_counter > sched_ctx->pop_counter[worker->workerid])
+			{
+				good_sched_ctx = sched_ctx;
+				smallest_counter = sched_ctx->pop_counter[worker->workerid];
+			}
 		}
-
-		return _get_next_sched_ctx_to_pop_into(worker);
+		
+		if(good_sched_ctx == NULL)
+		{
+			for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
+			{
+				sched_ctx = worker->sched_ctx[i];
+				if(sched_ctx != NULL && sched_ctx->id != STARPU_NMAX_SCHED_CTXS)
+					sched_ctx->pop_counter[worker->workerid] = 0;
+			}
+			
+			continue;
+		}
+		return good_sched_ctx;
 	}
-	return good_sched_ctx;
 }
 
 struct starpu_task *_starpu_pop_task(struct _starpu_worker *worker)

+ 3 - 1
src/core/workers.c

@@ -213,6 +213,7 @@ static int _starpu_can_use_nth_implementation(enum starpu_worker_archtype arch,
 	case STARPU_ANY_WORKER:
 	{
 		int cpu_func_enabled=1, cuda_func_enabled=1, opencl_func_enabled=1;
+		/* TODO: MIC/SCC */
 
 #if defined(STARPU_USE_CPU) || defined(STARPU_SIMGRID)
 		starpu_cpu_func_t cpu_func = _starpu_task_get_cpu_nth_implementation(cl, nimpl);
@@ -301,8 +302,9 @@ int starpu_combined_worker_can_execute_task(unsigned workerid, struct starpu_tas
 
 			/* Is the worker larger than requested ? */
 			int worker_size = (int)config.combined_workers[workerid - nworkers].worker_size;
+			int worker0 = config.combined_workers[workerid - nworkers].combined_workerid[0];
 			return !!((worker_size <= task->cl->max_parallelism) &&
-				_starpu_can_use_nth_implementation(config.workers[workerid].arch, task->cl, nimpl));
+				_starpu_can_use_nth_implementation(config.workers[worker0].arch, task->cl, nimpl));
 		}
 		else
 		{

+ 95 - 79
src/datawizard/memalloc.c

@@ -20,22 +20,24 @@
 #include <datawizard/footprint.h>
 #include <starpu.h>
 
-/* This per-node RW-locks protect mc_list and memchunk_cache entries */
-/* Note: handle header lock is always taken before this */
-static starpu_pthread_rwlock_t mc_rwlock[STARPU_MAXNODES];
-
 /* This per-node spinlock protect lru_list */
 static struct _starpu_spinlock lru_rwlock[STARPU_MAXNODES];
 
 /* Last Recently used memory chunkgs */
 static struct _starpu_mem_chunk_lru_list *starpu_lru_list[STARPU_MAXNODES];
 
+
+/* This per-node RW-locks protect mc_list and memchunk_cache entries */
+/* Note: handle header lock is always taken before this */
+static starpu_pthread_rwlock_t mc_rwlock[STARPU_MAXNODES];
+
 /* Potentially in use memory chunks */
 static struct _starpu_mem_chunk_list *mc_list[STARPU_MAXNODES];
 
 /* Explicitly caches memory chunks that can be reused */
 static struct _starpu_mem_chunk_list *memchunk_cache[STARPU_MAXNODES];
 
+
 /* When reclaiming memory to allocate, we reclaim MAX(what_is_to_reclaim_on_device, data_size_coefficient*data_size) */
 const unsigned starpu_memstrategy_data_size_coefficient=2;
 
@@ -71,22 +73,6 @@ void _starpu_deinit_mem_chunk_lists(void)
  *	Manipulate subtrees
  */
 
-static void lock_all_subtree(starpu_data_handle_t handle)
-{
-	unsigned child;
-
-	/* lock parent */
-	while (_starpu_spin_trylock(&handle->header_lock))
-		_starpu_datawizard_progress(_starpu_memory_node_get_local_key(), 0);
-
-	/* lock all sub-subtrees children */
-	for (child = 0; child < handle->nchildren; child++)
-	{
-		starpu_data_handle_t child_handle = starpu_data_get_child(handle, child);
-		lock_all_subtree(child_handle);
-	}
-}
-
 static void unlock_all_subtree(starpu_data_handle_t handle)
 {
 	/* lock all sub-subtrees children
@@ -103,6 +89,30 @@ static void unlock_all_subtree(starpu_data_handle_t handle)
 	_starpu_spin_unlock(&handle->header_lock);
 }
 
+static int lock_all_subtree(starpu_data_handle_t handle)
+{
+	int child;
+
+	/* lock parent */
+	if (_starpu_spin_trylock(&handle->header_lock))
+		/* the handle is busy, abort */
+		return 0;
+
+	/* lock all sub-subtrees children */
+	for (child = 0; child < (int) handle->nchildren; child++)
+	{
+		if (!lock_all_subtree(starpu_data_get_child(handle, child))) {
+			/* Some child is busy, abort */
+			while (--child >= 0)
+				/* Unlock what we have already uselessly locked */
+				unlock_all_subtree(starpu_data_get_child(handle, child));
+			return 0;
+		}
+	}
+
+	return 1;
+}
+
 static unsigned may_free_subtree(starpu_data_handle_t handle, unsigned node)
 {
 	/* we only free if no one refers to the leaf */
@@ -330,8 +340,9 @@ static size_t try_to_free_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node)
 	{
 		STARPU_ASSERT(mc->replicate);
 
-		while (_starpu_spin_trylock(&handle->header_lock))
-			_starpu_datawizard_progress(_starpu_memory_node_get_local_key(), 0);
+		if (_starpu_spin_trylock(&handle->header_lock))
+			/* Handle is busy, abort */
+			return 0;
 
 		if (mc->replicate->refcnt == 0)
 		{
@@ -344,11 +355,9 @@ static size_t try_to_free_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node)
 
 		_starpu_spin_unlock(&handle->header_lock);
 	}
-	else
+	/* try to lock all the subtree */
+	else if (lock_all_subtree(handle))
 	{
-		/* try to lock all the subtree */
-		lock_all_subtree(handle);
-
 		/* check if they are all "free" */
 		if (may_free_subtree(handle, node))
 		{
@@ -385,7 +394,7 @@ static size_t try_to_free_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node)
 			}
 		}
 
-		/* unlock the leafs */
+		/* unlock the tree */
 		unlock_all_subtree(handle);
 	}
 	return freed;
@@ -441,23 +450,24 @@ static unsigned try_to_reuse_mem_chunk(struct _starpu_mem_chunk *mc, unsigned no
 	STARPU_ASSERT(old_data);
 
 	/* try to lock all the subtree */
-	lock_all_subtree(old_data);
-
-	/* check if they are all "free" */
-	if (may_free_subtree(old_data, node))
+	/* and check if they are all "free" */
+	if (lock_all_subtree(old_data))
 	{
-		success = 1;
+		if (may_free_subtree(old_data, node))
+		{
+			success = 1;
 
-		/* in case there was nobody using that buffer, throw it
-		 * away after writing it back to main memory */
-		transfer_subtree_to_node(old_data, node, 0);
+			/* in case there was nobody using that buffer, throw it
+			 * away after writing it back to main memory */
+			transfer_subtree_to_node(old_data, node, 0);
 
-		/* now replace the previous data */
-		reuse_mem_chunk(node, replicate, mc, is_already_in_mc_list);
-	}
+			/* now replace the previous data */
+			reuse_mem_chunk(node, replicate, mc, is_already_in_mc_list);
+		}
 
-	/* unlock the leafs */
-	unlock_all_subtree(old_data);
+		/* unlock the tree */
+		unlock_all_subtree(old_data);
+	}
 
 	return success;
 }
@@ -547,19 +557,27 @@ static unsigned try_to_find_reusable_mem_chunk(unsigned node, starpu_data_handle
 static size_t flush_memchunk_cache(unsigned node, size_t reclaim)
 {
 	struct _starpu_mem_chunk *mc;
+	struct _starpu_mem_chunk_list *busy_memchunk_cache;
 
 	size_t freed = 0;
 
+	if (_starpu_mem_chunk_list_empty(memchunk_cache[node]))
+		return 0;
+
+	busy_memchunk_cache = _starpu_mem_chunk_list_new();
+
 	STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
 	while (!_starpu_mem_chunk_list_empty(memchunk_cache[node])) {
 		mc = _starpu_mem_chunk_list_pop_front(memchunk_cache[node]);
-		STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
-
 		starpu_data_handle_t handle = mc->data;
 
 		if (handle)
-			while (_starpu_spin_trylock(&handle->header_lock))
-				_starpu_datawizard_progress(_starpu_memory_node_get_local_key(), 0);
+			if (_starpu_spin_trylock(&handle->header_lock)) {
+				/* The handle is still busy, leave this chunk for later */
+				_starpu_mem_chunk_list_push_front(busy_memchunk_cache, mc);
+				continue;
+			}
+
 		freed += free_memory_on_node(mc, node);
 		if (handle)
 			_starpu_spin_unlock(&handle->header_lock);
@@ -567,10 +585,11 @@ static size_t flush_memchunk_cache(unsigned node, size_t reclaim)
 		free(mc->chunk_interface);
 		_starpu_mem_chunk_delete(mc);
 
-		STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
-		if (reclaim && freed>reclaim)
+		if (reclaim && freed >= reclaim)
 			break;
 	}
+	_starpu_mem_chunk_list_push_list_front(busy_memchunk_cache, memchunk_cache[node]);
+	_starpu_mem_chunk_list_delete(busy_memchunk_cache);
 	STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
 	return freed;
 }
@@ -585,7 +604,7 @@ static size_t free_potentially_in_use_mc(unsigned node, unsigned force, size_t r
 {
 	size_t freed = 0;
 
-	struct _starpu_mem_chunk *mc, *next_mc = (void*) -1;
+	struct _starpu_mem_chunk *mc, *next_mc;
 
 	/*
 	 * We have to unlock mc_rwlock before locking header_lock, so we have
@@ -595,50 +614,37 @@ static size_t free_potentially_in_use_mc(unsigned node, unsigned force, size_t r
 	 * finding anything to free.
 	 */
 
-	while (1)
-	{
-		STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
-
-		if (_starpu_mem_chunk_list_empty(mc_list[node]) || !next_mc)
-		{
-			STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
-			/* We reached the end of the list :/ */
-			break;
-		}
-
-		if (next_mc == (void*) -1) {
-			/* First iteration ever, start from beginning */
-			mc = _starpu_mem_chunk_list_begin(mc_list[node]);
-		} else {
-			/* Try to restart from where we were */
-			for (mc = _starpu_mem_chunk_list_begin(mc_list[node]);
-			     mc != _starpu_mem_chunk_list_end(mc_list[node]);
-			     mc = _starpu_mem_chunk_list_next(mc))
-				if (mc == next_mc)
-					/* Found it, restart from there.  */
-					break;
-
-			if (mc == _starpu_mem_chunk_list_end(mc_list[node]))
-				/* Couldn't find next_mc, restart from the beginning :/ */
-				mc = _starpu_mem_chunk_list_begin(mc_list[node]);
-		}
+restart:
+	STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
 
-		/* Remember where to try next */
+	for (mc = _starpu_mem_chunk_list_begin(mc_list[node]);
+	     mc != _starpu_mem_chunk_list_end(mc_list[node]);
+	     mc = next_mc)
+	{
+		/* mc hopefully gets out of the list, we thus need to prefetch
+		 * the next element */
 		next_mc = _starpu_mem_chunk_list_next(mc);
-		STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
 
 		if (!force)
 		{
 			freed += try_to_free_mem_chunk(mc, node);
 
-			if (reclaim && freed > reclaim)
+			if (reclaim && freed >= reclaim)
 				break;
 		}
 		else
 		{
 			starpu_data_handle_t handle = mc->data;
 
-			_starpu_spin_lock(&handle->header_lock);
+			if (_starpu_spin_trylock(&handle->header_lock))
+			{
+				/* Ergl. We are shutting down, but somebody is
+				 * still locking the handle. That's not
+				 * supposed to happen, but better be safe by
+				 * letting it go through. */
+				STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
+				goto restart;
+			}
 
 			/* We must free the memory now, because we are
 			 * terminating the drivers: note that data coherency is
@@ -648,6 +654,7 @@ static size_t free_potentially_in_use_mc(unsigned node, unsigned force, size_t r
 			_starpu_spin_unlock(&handle->header_lock);
 		}
 	}
+	STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
 
 	return freed;
 }
@@ -763,8 +770,12 @@ void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _star
 		_starpu_mem_chunk_delete(mc);
 	}
 	else
+	{
 		/* put it in the list of buffers to be removed */
+		STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
 		_starpu_mem_chunk_list_push_front(memchunk_cache[node], mc);
+		STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
+	}
 }
 
 /*
@@ -909,6 +920,7 @@ unsigned starpu_data_test_if_allocated_on_node(starpu_data_handle_t handle, unsi
 	return handle->per_node[memory_node].allocated;
 }
 
+/* Record that this memchunk has been recently used */
 void _starpu_memchunk_recently_used(struct _starpu_mem_chunk *mc, unsigned node)
 {
 	_starpu_spin_lock(&lru_rwlock[node]);
@@ -918,10 +930,11 @@ void _starpu_memchunk_recently_used(struct _starpu_mem_chunk *mc, unsigned node)
 	_starpu_spin_unlock(&lru_rwlock[node]);
 }
 
+/* Push the given memchunk, recently used, at the end of the chunks to be evicted */
 /* The mc_rwlock[node] rw-lock should be taken prior to calling this function.*/
 static void _starpu_memchunk_recently_used_move(struct _starpu_mem_chunk *mc, unsigned node)
 {
-	/* XXX Sometimes the memchunk is not in the list... */
+	/* Note: Sometimes the memchunk is not in the list... */
 	struct _starpu_mem_chunk *mc_iter;
 	for (mc_iter = _starpu_mem_chunk_list_begin(mc_list[node]);
 	     mc_iter != _starpu_mem_chunk_list_end(mc_list[node]);
@@ -937,6 +950,9 @@ static void _starpu_memchunk_recently_used_move(struct _starpu_mem_chunk *mc, un
 	}
 }
 
+/* Put the recently used memchunks at the end of the mc_list, in the same order
+ * as the LRU list, so that the most recently used memchunk eventually comes
+ * last in the mc_list */
 static void starpu_lru(unsigned node)
 {
 	STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);

+ 22 - 5
src/datawizard/user_interactions.c

@@ -75,7 +75,8 @@ static void _starpu_data_acquire_fetch_data_callback(void *arg)
 	 * We enqueue the "post" sync task in the list associated to the handle
 	 * so that it is submitted by the starpu_data_release
 	 * function. */
-	_starpu_add_post_sync_tasks(wrapper->post_sync_task, handle);
+	if (wrapper->post_sync_task)
+		_starpu_add_post_sync_tasks(wrapper->post_sync_task, handle);
 
 	wrapper->callback(wrapper->callback_arg);
 
@@ -114,8 +115,9 @@ static void starpu_data_acquire_cb_pre_sync_callback(void *arg)
 }
 
 /* The data must be released by calling starpu_data_release later on */
-int starpu_data_acquire_on_node_cb(starpu_data_handle_t handle, unsigned node,
-			   enum starpu_data_access_mode mode, void (*callback)(void *), void *arg)
+int starpu_data_acquire_on_node_cb_sequential_consistency(starpu_data_handle_t handle, unsigned node,
+							  enum starpu_data_access_mode mode, void (*callback)(void *), void *arg,
+							  int sequential_consistency)
 {
 	STARPU_ASSERT(handle);
 	STARPU_ASSERT_MSG(handle->nchildren == 0, "Acquiring a partitioned data (%p) is not possible", handle);
@@ -132,10 +134,12 @@ int starpu_data_acquire_on_node_cb(starpu_data_handle_t handle, unsigned node,
 	STARPU_PTHREAD_COND_INIT(&wrapper->cond, NULL);
 	STARPU_PTHREAD_MUTEX_INIT(&wrapper->lock, NULL);
 	wrapper->finished = 0;
+	wrapper->pre_sync_task = NULL;
+	wrapper->post_sync_task = NULL;
 
 	STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
-	int sequential_consistency = handle->sequential_consistency;
-	if (sequential_consistency)
+	int handle_sequential_consistency = handle->sequential_consistency;
+	if (handle_sequential_consistency && sequential_consistency)
 	{
 		struct starpu_task *new_task;
 		wrapper->pre_sync_task = starpu_task_create();
@@ -177,12 +181,25 @@ int starpu_data_acquire_on_node_cb(starpu_data_handle_t handle, unsigned node,
 	return 0;
 }
 
+
+int starpu_data_acquire_on_node_cb(starpu_data_handle_t handle, unsigned node,
+				   enum starpu_data_access_mode mode, void (*callback)(void *), void *arg)
+{
+	return starpu_data_acquire_on_node_cb_sequential_consistency(handle, node, mode, callback, arg, 1);
+}
+
 int starpu_data_acquire_cb(starpu_data_handle_t handle,
 			   enum starpu_data_access_mode mode, void (*callback)(void *), void *arg)
 {
 	return starpu_data_acquire_on_node_cb(handle, 0, mode, callback, arg);
 }
 
+int starpu_data_acquire_cb_sequential_consistency(starpu_data_handle_t handle,
+						  enum starpu_data_access_mode mode, void (*callback)(void *), void *arg, int sequential_consistency)
+{
+	return starpu_data_acquire_on_node_cb_sequential_consistency(handle, 0, mode, callback, arg, sequential_consistency);
+}
+
 /*
  *	Block data request from application
  */

+ 4 - 4
src/debug/traces/starpu_paje.c

@@ -160,7 +160,7 @@ void _starpu_fxt_write_paje_header(FILE *file)
 	poti_DefineEntityValue("Fi", "S", "FetchingInput", "1.0 .1 1.0");
 	poti_DefineEntityValue("Po", "S", "PushingOutput", "0.1 1.0 1.0");
 	poti_DefineEntityValue("C", "S", "Callback", ".0 .3 .8");
-	poti_DefineEntityValue("B", "S", "Blocked", ".9 .1 .0");
+	poti_DefineEntityValue("B", "S", "Overhead", ".5 .18 .0");
 	poti_DefineEntityValue("Sl", "S", "Sleeping", ".9 .1 .0");
 	poti_DefineEntityValue("P", "S", "Progressing", ".4 .1 .6");
 
@@ -187,7 +187,7 @@ void _starpu_fxt_write_paje_header(FILE *file)
 		poti_DefineEntityValue("Fi", ctx, "FetchingInput", "1.0 .1 1.0");
 		poti_DefineEntityValue("Po", ctx, "PushingOutput", "0.1 1.0 1.0");
 		poti_DefineEntityValue("C", ctx, "Callback", ".0 .3 .8");
-		poti_DefineEntityValue("B", ctx, "Blocked", ".9 .1 .0");
+		poti_DefineEntityValue("B", ctx, "Overhead", ".5 .18 .0");
 		poti_DefineEntityValue("Sl", ctx, "Sleeping", ".9 .1 .0");
 		poti_DefineEntityValue("P", ctx, "Progressing", ".4 .1 .6");
 	}
@@ -226,7 +226,7 @@ void _starpu_fxt_write_paje_header(FILE *file)
 6       Fi       S      FetchingInput       \"1.0 .1 1.0\"            \n\
 6       Po       S      PushingOutput       \"0.1 1.0 1.0\"            \n\
 6       C       S       Callback       \".0 .3 .8\"            \n\
-6       B       S       Blocked         \".9 .1 .0\"		\n\
+6       B       S       Overhead         \".5 .18 .0\"		\n\
 6       Sl       S      Sleeping         \".9 .1 .0\"		\n\
 6       P       S       Progressing         \".4 .1 .6\"		\n");
 	fprintf(file, "\
@@ -245,7 +245,7 @@ void _starpu_fxt_write_paje_header(FILE *file)
 6       Fi       Ctx%u      FetchingInput       \"1.0 .1 1.0\"            \n\
 6       Po       Ctx%u      PushingOutput       \"0.1 1.0 1.0\"            \n\
 6       C       Ctx%u       Callback       \".0 .3 .8\"            \n\
-6       B       Ctx%u       Blocked         \".9 .1 .0\"		\n\
+6       B       Ctx%u       Overhead         \".5 .18 .0\"		\n\
 6       Sl       Ctx%u      Sleeping         \".9 .1 .0\"		\n\
 6       P       Ctx%u       Progressing         \".4 .1 .6\"		\n",
 		i, i, i, i, i, i, i, i);

+ 2 - 8
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -547,13 +547,7 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 		unsigned memory_node = starpu_worker_get_memory_node(worker);
 
 		/* Sometimes workers didn't take the tasks as early as we expected */
-		starpu_pthread_mutex_t *sched_mutex;
-		starpu_pthread_cond_t *sched_cond;
-		starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
-
-		STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
-		fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
-		STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
+		double exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
 
 		for(nimpl  = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 	 	{
@@ -639,7 +633,7 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 			if (unknown)
 				continue;
 
-			exp_end[worker_ctx][nimpl] = fifo->exp_start + fifo->exp_len + local_task_length[worker_ctx][nimpl];
+			exp_end[worker_ctx][nimpl] = exp_start + fifo->exp_len + local_task_length[worker_ctx][nimpl];
 
 			if (exp_end[worker_ctx][nimpl] < best_exp_end)
 			{

+ 6 - 5
tests/Makefile.am

@@ -141,6 +141,7 @@ noinst_PROGRAMS =				\
 	main/starpu_init			\
 	main/starpu_worker_exists		\
 	main/submit				\
+	main/codelet_null_callback		\
 	datawizard/allocate			\
 	datawizard/acquire_cb			\
 	datawizard/acquire_cb_insert		\
@@ -232,10 +233,6 @@ noinst_PROGRAMS +=				\
 	datawizard/reclaim
 endif
 
-noinst_nulldir=/tmp
-noinst_null_PROGRAMS =				\
-	$(LOADER)
-
 examplebin_PROGRAMS = \
 	microbenchs/tasks_size_overhead		\
 	microbenchs/local_pingpong
@@ -243,7 +240,11 @@ examplebin_SCRIPTS = \
 	microbenchs/tasks_size_overhead.gp \
 	microbenchs/tasks_size_overhead.sh
 
-check_PROGRAMS = $(noinst_PROGRAMS) $(noinst_2_PROGRAMS)
+if STARPU_HAVE_WINDOWS
+check_PROGRAMS	=	$(noinst_PROGRAMS)
+else
+check_PROGRAMS	=	$(LOADER) $(noinst_PROGRAMS)
+endif
 
 #######################
 # Source files        #

+ 46 - 0
tests/main/codelet_null_callback.c

@@ -0,0 +1,46 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "../helper.h"
+
+void callback(void *ptr)
+{
+     int *x = (int *)ptr;
+     FPRINTF(stderr, "x=%d\n", *x);
+     STARPU_ASSERT(*x == 42);
+}
+
+int main(int argc, char **argv)
+{
+	int ret;
+	int x=42;
+
+	ret = starpu_initialize(NULL, &argc, &argv);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	ret = starpu_insert_task(NULL,
+				 STARPU_CALLBACK_WITH_ARG, callback, &x,
+				 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+
+	starpu_task_wait_for_all();
+	starpu_shutdown();
+
+	return EXIT_SUCCESS;
+}
+