лет назад: 13 · f4fb7d745d
--- a/ChangeLog
+++ b/ChangeLog
@@ -93,6 +93,18 @@ New features:
 
				   * Introduce new variables STARPU_LIMIT_CUDA_devid_MEM and
			
 
				     STARPU_LIMIT_OPENCL_devid_MEM to limit memory per specific device
			
 
				 
			
 
				+Small features:
			
 
				+  * Add starpu_worker_get_by_type and starpu_worker_get_by_devid
			
 
				+  * Add starpu_fxt_stop_profiling/starpu_fxt_start_profiling which permits to
			
 
				+    pause trace recording.
			
 
				+  * Add trace_buffer_size configuration field to permit to specify the tracing
			
 
				+    buffer size.
			
 
				+  * Add starpu_codelet_profile and starpu_codelet_histo_profile, tools which draw
			
 
				+    the profile of a codelet.
			
 
				+  * File STARPU-REVISION --- containing the SVN revision number from which
			
 
				+    StarPU was compiled --- is installed in the share/doc/starpu directory
			
 
				+  * starpu_perfmodel_plot can now directly draw GFlops curves.
			
 
				+
			
 
				 Changes:
			
 
				   * Fix the block filter functions.
			
 
				   * Fix StarPU-MPI on Darwin.
			
@@ -127,15 +139,6 @@ Changes:
 
				   * StarPU can now use poti to generate paje traces.
			
 
				   * Rename scheduling policy "parallel greedy" to "parallel eager"
			
 
				 
			
 
				-Small features:
			
 
				-  * Add starpu_worker_get_by_type and starpu_worker_get_by_devid
			
 
				-  * Add starpu_fxt_stop_profiling/starpu_fxt_start_profiling which permits to
			
 
				-  pause trace recording.
			
 
				-  * Add trace_buffer_size configuration field to permit to specify the tracing
			
 
				-  buffer size.
			
 
				-  * Add starpu_codelet_profile and starpu_codelet_histo_profile, tools which draw
			
 
				-  the profile of a codelet.
			
 
				-
			
 
				 Small changes:
			
 
				   * STARPU_NCPU should now be used instead of STARPU_NCPUS. STARPU_NCPUS is
			
 
				 	still available for compatibility reasons.
			
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,7 +1,7 @@
 
				 # StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				 #
			
 
				 # Copyright (C) 2009-2012  Université de Bordeaux 1
			
 
				-# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+# Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				 #
			
 
				 # StarPU is free software; you can redistribute it and/or modify
			
 
				 # it under the terms of the GNU Lesser General Public License as published by
			
@@ -91,7 +91,7 @@ all-local:
 
				 	cd starpu-top ; $(QMAKE) ; $(MAKE)
			
 
				 clean-local:
			
 
				 	cd starpu-top ; $(QMAKE) ; $(MAKE) clean ; $(RM) Makefile
			
 
				-	$(RM) starpu_top.1 starpu-top/starpu_top
			
 
				+	$(RM) starpu-top/starpu_top.1 starpu-top/starpu_top
			
 
				 # TODO: resources
			
 
				 install-exec-local:
			
 
				 	$(MKDIR_P) $(DESTDIR)$(bindir)
			
@@ -102,10 +102,10 @@ uninstall-local:
 
				 	$(RM) starpu-top/Makefile
			
 
				 
			
 
				 if STARPU_HAVE_HELP2MAN
			
 
				-starpu_top.1: starpu-top/starpu_top$(EXEEXT)
			
 
				+starpu-top/starpu_top.1: starpu-top/starpu_top$(EXEEXT)
			
 
				 	help2man --no-discard-stderr -N --output=$@ starpu-top/starpu_top$(EXEEXT)
			
 
				 dist_man1_MANS =\
			
 
				-	starpu_top.1
			
 
				+	starpu-top/starpu_top.1
			
 
				 endif
			
 
				 endif
			
 
				 
			
@@ -114,8 +114,8 @@ txtdir = ${prefix}
 
				 else
			
 
				 txtdir = ${docdir}
			
 
				 endif
			
 
				-txt_DATA = AUTHORS COPYING.LGPL README
			
 
				-EXTRA_DIST = AUTHORS COPYING.LGPL README STARPU-VERSION build-aux/svn2cl.xsl
			
 
				+txt_DATA = AUTHORS COPYING.LGPL README STARPU-REVISION
			
 
				+EXTRA_DIST = AUTHORS COPYING.LGPL README STARPU-VERSION STARPU-REVISION build-aux/svn2cl.xsl
			
 
				 
			
 
				 include starpu-top/extradist
			
 
				 
			
--- a/configure.ac
+++ b/configure.ac
@@ -81,6 +81,21 @@ AC_C_RESTRICT
 
				 # Check if bash is available
			
 
				 AC_CHECK_PROGS([BASH], [bash])
			
 
				 
			
 
				+# Check whether subversion is installed
			
 
				+AC_PATH_PROG(svnversioncommand, svnversion)
			
 
				+
			
 
				+# Record the repository revision in ./STARPU-REVISION. svnversion is used only
			
 
				+# if it was found and $srcdir is a working copy; else reuse any shipped file.
			
 
				+if test "$svnversioncommand" = "" || test "`LC_ALL=C $svnversioncommand -n $srcdir`" = "exported" ; then
			
 
				+   if test -f "$srcdir/STARPU-REVISION" ; then
			
 
				+      cp "$srcdir/STARPU-REVISION" .
			
 
				+   else
			
 
				+      echo "unknown" > ./STARPU-REVISION
			
 
				+   fi
			
 
				+else
			
 
				+   LC_ALL=C $svnversioncommand "$srcdir" > ./STARPU-REVISION
			
 
				+fi
			
 
				+
			
 
				 AC_PATH_PROGS([STARPU_MS_LIB], [lib])
			
 
				 AC_ARG_VAR([STARPU_MS_LIB], [Path to Microsoft's Visual Studio `lib' tool])
			
 
				 AM_CONDITIONAL([STARPU_HAVE_MS_LIB], [test "x$STARPU_MS_LIB" != "x"])
			
@@ -1327,12 +1342,17 @@ AC_DEFUN([IS_SUPPORTED_CFLAG],
 
				 IS_SUPPORTED_CFLAG(-W)
			
 
				 IS_SUPPORTED_CFLAG(-Wall)
			
 
				 IS_SUPPORTED_CFLAG(-Wextra)
			
 
				-AC_SUBST(GLOBAL_AM_CFLAGS)
			
 
				+IS_SUPPORTED_CFLAG(-Werror=implicit)
			
 
				 
			
 
				 if test "x$STARPU_DEVEL" != x; then
			
 
				 	AC_DEFINE(STARPU_DEVEL, [1], [enable developer warnings])
			
 
				+	IS_SUPPORTED_CFLAG(-Wunused)
			
 
				+	IS_SUPPORTED_CFLAG(-Wundef)
			
 
				+	IS_SUPPORTED_CFLAG(-Wshadow)
			
 
				 fi
			
 
				 
			
 
				+AC_SUBST(GLOBAL_AM_CFLAGS)
			
 
				+
			
 
				 # Same value as Automake's, for use in other places.
			
 
				 pkglibdir="\${libdir}/$PACKAGE"
			
 
				 AC_SUBST([pkglibdir])
			
--- a/doc/Makefile.am
+++ b/doc/Makefile.am
@@ -1,7 +1,7 @@
 
				 # StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				 #
			
 
				 # Copyright (C) 2009, 2011  Université de Bordeaux 1
			
 
				-# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+# Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				 #
			
 
				 # Permission is granted to copy, distribute and/or modify this document
			
 
				 # under the terms of the GNU Free Documentation License, Version 1.3
			
@@ -12,7 +12,7 @@
 
				 
			
 
				 info_TEXINFOS = starpu.texi
			
 
				 
			
 
				-starpu_TEXINFOS = chapters/advanced-api.texi \
			
 
				+chapters =	chapters/advanced-api.texi \
			
 
				 	chapters/benchmarks.texi \
			
 
				 	chapters/configuration.texi \
			
 
				 	chapters/perf-feedback.texi \
			
@@ -35,9 +35,12 @@ starpu_TEXINFOS = chapters/advanced-api.texi \
 
				 	chapters/using.texi \
			
 
				 	chapters/vector_scal_opencl.texi \
			
 
				 	chapters/socl.texi \
			
 
				-	chapters/version.texi \
			
 
				 	chapters/sched_ctx_hypervisor.texi
			
 
				 
			
 
				+starpu_TEXINFOS = 		\
			
 
				+	chapters/version.texi 	\
			
 
				+	$(chapters)
			
 
				+
			
 
				 MAINTAINERCLEANFILES = starpu.pdf starpu.html
			
 
				 
			
 
				 EXTRA_DIST = starpu.css
			
@@ -50,7 +53,7 @@ AM_MAKEINFOHTMLFLAGS = --css-include=$(top_srcdir)/doc/starpu.css --no-headers -
 
				 uninstall-local:
			
 
				 	$(RM) $(DESTDIR)$(infodir)/dir
			
 
				 
			
 
				-chapters/version.texi:
			
 
				+chapters/version.texi: $(chapters)
			
 
				 	@-for f in $(starpu_TEXINFOS) ; do \
			
 
				                 if test -f $(top_srcdir)/doc/$$f ; then stat --format=%Y $(top_srcdir)/doc/$$f 2>/dev/null ; fi \
			
 
				         done | sort -r | head -1 > timestamp
			
--- a/doc/chapters/advanced-api.texi
+++ b/doc/chapters/advanced-api.texi
@@ -38,7 +38,7 @@ The arguments following the codelets can be of the following types:
 
				 @item
			
 
				 the specific values @code{STARPU_VALUE}, @code{STARPU_CALLBACK},
			
 
				 @code{STARPU_CALLBACK_ARG}, @code{STARPU_CALLBACK_WITH_ARG},
			
 
				-@code{STARPU_PRIORITY}, @code{STARPU_TAG}, followed by the appropriated objects
			
 
				+@code{STARPU_PRIORITY}, @code{STARPU_TAG}, @code{STARPU_FLOPS}, followed by the appropriated objects
			
 
				 as defined below.
			
 
				 @end itemize
			
 
				 
			
@@ -85,6 +85,12 @@ this macro is used when calling @code{starpu_insert_task}, and must be
 
				 followed by a tag.
			
 
				 @end defmac
			
 
				 
			
 
				+@defmac STARPU_FLOPS
			
 
				+this macro is used when calling @code{starpu_insert_task}, and must be followed
			
 
				+by an amount of floating point operations, as a double. The user may have to
			
 
				+explicitly cast into double, otherwise parameter passing will not work.
			
 
				+@end defmac
			
 
				+
			
 
				 @deftypefun void starpu_codelet_pack_args ({char **}@var{arg_buffer}, {size_t *}@var{arg_buffer_size}, ...)
			
 
				 Pack arguments of type @code{STARPU_VALUE} into a buffer which can be
			
 
				 given to a codelet and later unpacked with the function
			
@@ -165,24 +171,6 @@ to the world size. Communications statistics must be enabled
 
				 @node Communication
			
 
				 @subsection Communication
			
 
				 
			
 
				-The standard point to point communications of MPI have been
			
 
				-implemented. The semantic is similar to the MPI one, but adapted to
			
 
				-the DSM provided by StarPU. A MPI request will only be submitted when
			
 
				-the data is available in the main memory of the node submitting the
			
 
				-request.
			
 
				-
			
 
				-There is two types of asynchronous communications: the classic
			
 
				-asynchronous communications and the detached communications. The
			
 
				-classic asynchronous communications (@code{starpu_mpi_isend} and
			
 
				-@code{starpu_mpi_irecv}) need to be followed by a call to
			
 
				-@code{starpu_mpi_wait} or to @code{starpu_mpi_test} to wait for or to
			
 
				-test the completion of the communication. Waiting for or testing the
			
 
				-completion of detached communications is not possible, this is done
			
 
				-internally by StarPU-MPI, on completion, the resources are
			
 
				-automatically released. This mechanism is similar to the pthread
			
 
				-detach state attribute which determines whether a thread will be
			
 
				-created in a joinable or a detached state.
			
 
				-
			
 
				 @deftypefun int starpu_mpi_send (starpu_data_handle_t @var{data_handle}, int @var{dest}, int @var{mpi_tag}, MPI_Comm @var{comm})
			
 
				 Performs a standard-mode, blocking send of @var{data_handle} to the
			
 
				 node @var{dest} using the message tag @code{mpi_tag} within the
			
@@ -354,51 +342,56 @@ Unpack the data handle from the contiguous buffer at the address @code{ptr} of s
 
				 @end deftp
			
 
				 
			
 
				 @deftp {Data Type} {struct starpu_data_copy_methods}
			
 
				-Defines the per-interface methods.
			
 
				+Defines the per-interface methods. If the @code{any_to_any} method is provided,
			
 
				+it will be used by default if no more specific method is provided. It can still
			
 
				+be useful to provide more specific method in case of e.g. available particular
			
 
				+CUDA or OpenCL support.
			
 
				+
			
 
				 @table @asis
			
 
				-@item @code{int @{ram,cuda,opencl@}_to_@{ram,cuda,opencl@}(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)}
			
 
				+@item @code{int (*@{ram,cuda,opencl@}_to_@{ram,cuda,opencl@})(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)}
			
 
				 These 12 functions define how to copy data from the @var{src_interface}
			
 
				 interface on the @var{src_node} node to the @var{dst_interface} interface
			
 
				 on the @var{dst_node} node. They return 0 on success.
			
 
				 
			
 
				-@item @code{int (*ram_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)}
			
 
				-Define how to copy data from the @var{src_interface} interface on the
			
 
				-@var{src_node} node (in RAM) to the @var{dst_interface} interface on the
			
 
				-@var{dst_node} node (on a CUDA device), using the given @var{stream}. Return 0
			
 
				-on success.
			
 
				-
			
 
				-@item @code{int (*cuda_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)}
			
 
				-Define how to copy data from the @var{src_interface} interface on the
			
 
				-@var{src_node} node (on a CUDA device) to the @var{dst_interface} interface on the
			
 
				-@var{dst_node} node (in RAM), using the given @var{stream}. Return 0
			
 
				-on success.
			
 
				-
			
 
				-@item @code{int (*cuda_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)}
			
 
				+@item @code{int (*@{ram,cuda@}_to_@{ram,cuda@}_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)}
			
 
				+These 3 functions (@code{ram_to_ram} is not among these) define how to copy
			
 
				+data from the @var{src_interface} interface on the @var{src_node} node to the
			
 
				+@var{dst_interface} interface on the @var{dst_node} node, using the given
			
 
				+@var{stream}. Must return 0 if the transfer was actually completed completely
			
 
				+synchronously, or -EAGAIN if at least some transfers are still ongoing and
			
 
				+should be awaited for by the core.
			
 
				+
			
 
				+@item @code{int (*@{ram,opencl@}_to_@{ram,opencl@}_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, /* cl_event * */ void *event)}
			
 
				+These 3 functions (@code{ram_to_ram} is not among them) define how to copy
			
 
				+data from the @var{src_interface} interface on the @var{src_node} node to the
			
 
				+@var{dst_interface} interface on the @var{dst_node} node, by recording in
			
 
				+@var{event}, a pointer to a cl_event, the event of the last submitted transfer.
			
 
				+Must return 0 if the transfer was actually completed completely synchronously,
			
 
				+or -EAGAIN if at least some transfers are still ongoing and should be awaited
			
 
				+for by the core.
			
 
				+
			
 
				+@item @code{int (*any_to_any)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data)}
			
 
				 Define how to copy data from the @var{src_interface} interface on the
			
 
				-@var{src_node} node (on a CUDA device) to the @var{dst_interface} interface on
			
 
				-the @var{dst_node} node (on another CUDA device), using the given @var{stream}.
			
 
				-Return 0 on success.
			
 
				+@var{src_node} node to the @var{dst_interface} interface on the @var{dst_node}
			
 
				+node. This is meant to be implemented through the @var{starpu_interface_copy}
			
 
				+helper, to which @var{async_data} should be passed as such, and will be used to
			
 
				+manage asynchronicity. This must return -EAGAIN if any of the
			
 
				+@var{starpu_interface_copy} calls has returned -EAGAIN (i.e. at least some
			
 
				+transfer is still ongoing), and return 0 otherwise.
			
 
				 
			
 
				-@item @code{int (*ram_to_opencl_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, /* cl_event * */ void *event)}
			
 
				-Define how to copy data from the @var{src_interface} interface on the
			
 
				-@var{src_node} node (in RAM) to the @var{dst_interface} interface on the
			
 
				-@var{dst_node} node (on an OpenCL device), using @var{event}, a pointer to a
			
 
				-cl_event. Return 0 on success.
			
 
				-
			
 
				-@item @code{int (*opencl_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, /* cl_event * */ void *event)}
			
 
				-Define how to copy data from the @var{src_interface} interface on the
			
 
				-@var{src_node} node (on an OpenCL device) to the @var{dst_interface} interface
			
 
				-on the @var{dst_node} node (in RAM), using the given @var{event}, a pointer to
			
 
				-a cl_event. Return 0 on success.
			
 
				-
			
 
				-@item @code{int (*opencl_to_opencl_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, /* cl_event * */ void *event)}
			
 
				-Define how to copy data from the @var{src_interface} interface on the
			
 
				-@var{src_node} node (on an OpenCL device) to the @var{dst_interface} interface
			
 
				-on the @var{dst_node} node (on another OpenCL device), using the given
			
 
				-@var{event}, a pointer to a cl_event. Return 0 on success.
			
 
				 @end table
			
 
				 @end deftp
			
 
				 
			
 
				+@deftypefun int starpu_interface_copy (uintptr_t @var{src}, size_t @var{src_offset}, unsigned @var{src_node}, uintptr_t @var{dst}, size_t @var{dst_offset}, unsigned @var{dst_node}, size_t @var{size}, {void *}@var{async_data})
			
 
				+Copy @var{size} bytes from byte offset @var{src_offset} of @var{src} on
			
 
				+@var{src_node} to byte offset @var{dst_offset} of @var{dst} on @var{dst_node}.
			
 
				+This is to be used in the @var{any_to_any} copy method, which is provided with
			
 
				+the @var{async_data} to be passed to @var{starpu_interface_copy}. This returns
			
 
				+-EAGAIN if the transfer is still ongoing, or 0 if the transfer is already
			
 
				+completed.
			
 
				+@end deftypefun
			
 
				+
			
 
				+
			
 
				 @deftypefun uint32_t starpu_crc32_be_n ({void *}@var{input}, size_t @var{n}, uint32_t @var{inputcrc})
			
 
				 Compute the CRC of a byte buffer seeded by the inputcrc "current
			
 
				 state". The return value should be considered as the new "current
			
@@ -457,7 +450,7 @@ void starpu_complex_data_register(starpu_data_handle_t *handle,
 
				                 .nx = nx
			
 
				         @};
			
 
				 
			
 
				-        if (interface_complex_ops.interfaceid == -1)
			
 
				+        if (interface_complex_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)
			
 
				         @{
			
 
				                 interface_complex_ops.interfaceid = starpu_data_interface_get_next_id();
			
 
				         @}
			
@@ -483,7 +476,7 @@ static struct starpu_data_interface_ops interface_complex_ops =
 
				         .copy_methods = &complex_copy_methods,
			
 
				         .get_size = complex_get_size,
			
 
				         .footprint = complex_footprint,
			
 
				-        .interfaceid = -1,
			
 
				+        .interfaceid = STARPU_UNKNOWN_INTERFACE_ID,
			
 
				         .interface_size = sizeof(struct starpu_complex_interface),
			
 
				 @};
			
 
				 @end smallexample
			
@@ -837,7 +830,7 @@ The number of workerids
 
				 @item @code{pthread_key_t cursor_key} (optional)
			
 
				 The cursor needed to iterate the collection (depending on the data structure)
			
 
				 @item @code{int type}
			
 
				-The type of structure (currently STARPU_WORKER_LIST is the only one available)
			
 
				+The type of structure (currently STARPU_SCHED_CTX_WORKER_LIST is the only one available)
			
 
				 @item @code{unsigned (*has_next)(struct starpu_sched_ctx_worker_collection *workers)}
			
 
				 Checks if there is a next worker
			
 
				 @item @code{int (*get_next)(struct starpu_sched_ctx_worker_collection *workers)}
			
@@ -870,15 +863,15 @@ Delete the worker collection of the specified scheduling context
 
				 Return the worker collection managed by the indicated context
			
 
				 @end deftypefun
			
 
				 
			
 
				-@deftypefun pthread_mutex_t* starpu_get_changing_ctx_mutex (unsigned @var{sched_ctx_id})
			
 
				+@deftypefun pthread_mutex_t* starpu_sched_ctx_get_changing_ctx_mutex (unsigned @var{sched_ctx_id})
			
 
				 TODO
			
 
				 @end deftypefun
			
 
				 
			
 
				-@deftypefun void starpu_task_set_context (unsigned *@var{sched_ctx_id})
			
 
				+@deftypefun void starpu_sched_ctx_set_context (unsigned *@var{sched_ctx_id})
			
 
				 Set the scheduling context the subsequent tasks will be submitted to
			
 
				 @end deftypefun
			
 
				 
			
 
				-@deftypefun unsigned starpu_task_get_context (void)
			
 
				+@deftypefun unsigned starpu_sched_ctx_get_context (void)
			
 
				 Return the scheduling context the tasks are currently submitted to
			
 
				 @end deftypefun
			
 
				 
			
--- a/doc/chapters/basic-api.texi
+++ b/doc/chapters/basic-api.texi
@@ -1849,6 +1849,11 @@ A pointer to the next task. This should only be used by StarPU.
 
				 This is only used for tasks that use multiformat handle. This should only be
			
 
				 used by StarPU.
			
 
				 
			
 
				+@item @code{double flops}
			
 
				+This can be set to the number of floating-point operations that the task
			
 
				+will have to achieve. This is useful for easily getting GFlops curves from
			
 
				+@code{starpu_perfmodel_plot}, and for the hypervisor load balancing.
			
 
				+
			
 
				 @item @code{void *starpu_private}
			
 
				 This is private to StarPU, do not modify. If the task is allocated by hand
			
 
				 (without starpu_task_create), this field should be set to NULL.
			
@@ -1857,6 +1862,7 @@ This is private to StarPU, do not modify. If the task is allocated by hand
 
				 This field is set when initializing a task. It prevents a task from being
			
 
				 submitted if it has not been properly initialized.
			
 
				 @end table
			
 
				+
			
 
				 @end deftp
			
 
				 
			
 
				 @deftypefun void starpu_task_init ({struct starpu_task} *@var{task})
			
@@ -1939,6 +1945,18 @@ This function blocks until all the tasks that were submitted are terminated. It
 
				 does not destroy these tasks.
			
 
				 @end deftypefun
			
 
				 
			
 
				+@deftypefun int starpu_task_nready (void)
			
 
				+@end deftypefun
			
 
				+
			
 
				+@deftypefun int starpu_task_nsubmitted (void)
			
 
				+Return the number of submitted tasks which have not completed yet.
			
 
				+@end deftypefun
			
 
				+
			
 
				+@deftypefun int starpu_task_nready (void)
			
 
				+Return the number of submitted tasks which are ready for execution or are already
			
 
				+executing. It thus does not include tasks waiting for dependencies.
			
 
				+@end deftypefun
			
 
				+
			
 
				 @deftypefun {struct starpu_task *} starpu_task_get_current (void)
			
 
				 This function returns the task currently executed by the worker, or
			
 
				 NULL if it is called either from a thread that is not a task or simply
			
@@ -2489,10 +2507,6 @@ This function returns a pointer to device properties for worker @var{workerid}
 
				 (assumed to be a CUDA worker).
			
 
				 @end deftypefun
			
 
				 
			
 
				-@deftypefun size_t starpu_cuda_get_global_mem_size (unsigned @var{devid})
			
 
				-Return the size of the global memory of CUDA device @var{devid}.
			
 
				-@end deftypefun
			
 
				-
			
 
				 @deftypefun void starpu_cuda_report_error ({const char *}@var{func}, {const char *}@var{file}, int @var{line}, cudaError_t @var{status})
			
 
				 Report a CUDA error.
			
 
				 @end deftypefun
			
@@ -2560,10 +2574,6 @@ OpenCL as shown in @ref{Full source code for the 'Scaling a Vector' example}.
 
				 @node Writing OpenCL kernels
			
 
				 @subsection Writing OpenCL kernels
			
 
				 
			
 
				-@deftypefun size_t starpu_opencl_get_global_mem_size (int @var{devid})
			
 
				-Return the size of global device memory in bytes.
			
 
				-@end deftypefun
			
 
				-
			
 
				 @deftypefun void starpu_opencl_get_context (int @var{devid}, {cl_context *}@var{context})
			
 
				 Places the OpenCL context of the device designated by @var{devid} into @var{context}.
			
 
				 @end deftypefun
			
@@ -2780,7 +2790,7 @@ otherwise. The integer pointed to by @var{ret} is set to -EAGAIN if the asynchro
 
				 was successful, or to 0 if event was NULL.
			
 
				 @end deftypefun
			
 
				 
			
 
				-@deftypefun cl_int starpu_opencl_copy_async_sync (cl_mem @var{src}, unsigned @var{src_node}, size_t @var{src_offset}, cl_mem @var{dst}, unsigned @var{dst_node}, size_t @var{dst_offset}, size_t @var{size}, {cl_event *}@var{event})
			
 
				+@deftypefun cl_int starpu_opencl_copy_async_sync (uintptr_t @var{src}, size_t @var{src_offset}, unsigned @var{src_node}, uintptr_t @var{dst}, size_t @var{dst_offset}, unsigned @var{dst_node}, size_t @var{size}, {cl_event *}@var{event})
			
 
				 Copy @var{size} bytes from byte offset @var{src_offset} of
			
 
				 @var{src} on @var{src_node} to byte offset @var{dst_offset} of @var{dst} on
			
 
				 @var{dst_node}. if @var{event} is NULL, the copy is synchronous, i.e the queue is
			
--- a/doc/chapters/configuration.texi
+++ b/doc/chapters/configuration.texi
@@ -417,8 +417,20 @@ the @code{starpu_conf} structure passed to @code{starpu_init} is set.
 
				 
			
 
				 @defvr {Environment variable} @code{STARPU_SINGLE_COMBINED_WORKER}
			
 
				 If set, StarPU will create several workers which won't be able to work
			
 
				-concurrently. It will create combined workers which size goes from 1 to the
			
 
				-total number of CPU workers in the system.
			
 
				+concurrently. It will by default create combined workers which size goes from 1
			
 
				+to the total number of CPU workers in the system. @code{STARPU_MIN_WORKERSIZE}
			
 
				+and @code{STARPU_MAX_WORKERSIZE} can be used to change this default.
			
 
				+@end defvr
			
 
				+
			
 
				+@defvr {Environment variable} @code{STARPU_MIN_WORKERSIZE}
			
 
				+When @code{STARPU_SINGLE_COMBINED_WORKER} is set, @code{STARPU_MIN_WORKERSIZE}
			
 
				+permits to specify the minimum size of the combined workers (instead of the default 1)
			
 
				+@end defvr
			
 
				+
			
 
				+@defvr {Environment variable} @code{STARPU_MAX_WORKERSIZE}
			
 
				+When @code{STARPU_SINGLE_COMBINED_WORKER} is set, @code{STARPU_MAX_WORKERSIZE}
			
 
				+permits to specify the maximum size of the combined workers (instead of the
			
 
				+number of CPU workers in the system)
			
 
				 @end defvr
			
 
				 
			
 
				 @defvr {Environment variable} STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER
			
--- a/doc/chapters/mpi-support.texi
+++ b/doc/chapters/mpi-support.texi
@@ -21,6 +21,7 @@ according to the task graph and an application-provided distribution.
 
				 
			
 
				 @menu
			
 
				 * Simple Example::
			
 
				+* Point to point communication::
			
 
				 * Exchanging User Defined Data Interface::
			
 
				 * MPI Insert Task Utility::
			
 
				 * MPI Collective Operations::
			
@@ -120,7 +121,49 @@ int main(int argc, char **argv)
 
				 @end smallexample
			
 
				 @end cartouche
			
 
				 
			
 
				-@page
			
 
				+@node Point to point communication
			
 
				+@section Point to point communication
			
 
				+
			
 
				+The standard point to point communications of MPI have been
			
 
				+implemented. The semantic is similar to the MPI one, but adapted to
			
 
				+the DSM provided by StarPU. A MPI request will only be submitted when
			
 
				+the data is available in the main memory of the node submitting the
			
 
				+request.
			
 
				+
			
 
				+There are two types of asynchronous communications: the classic
			
 
				+asynchronous communications and the detached communications. The
			
 
				+classic asynchronous communications (@code{starpu_mpi_isend} and
			
 
				+@code{starpu_mpi_irecv}) need to be followed by a call to
			
 
				+@code{starpu_mpi_wait} or to @code{starpu_mpi_test} to wait for or to
			
 
				+test the completion of the communication. Waiting for or testing the
			
 
				+completion of detached communications is not possible, this is done
			
 
				+internally by StarPU-MPI, on completion, the resources are
			
 
				+automatically released. This mechanism is similar to the pthread
			
 
				+detach state attribute which determines whether a thread will be
			
 
				+created in a joinable or a detached state.
			
 
				+
			
 
				+For any communication, the call of the function will result in the
			
 
				+creation of a StarPU-MPI request, the function
			
 
				+@code{starpu_data_acquire_cb} is then called to asynchronously request
			
 
				+StarPU to fetch the data in main memory; when the data is available in
			
 
				+main memory, a StarPU-MPI function is called to put the new request in
			
 
				+the list of the ready requests.
			
 
				+
			
 
				+The StarPU-MPI progression thread regularly polls this list of ready
			
 
				+requests. For each new ready request, the appropriate function is
			
 
				+called to post the corresponding MPI call. For example, calling
			
 
				+@code{starpu_mpi_isend} will result in posting @code{MPI_Isend}. If
			
 
				+the request is marked as detached, the request will be put in the list
			
 
				+of detached requests.
			
 
				+
			
 
				+The StarPU-MPI progression thread also polls the list of detached
			
 
				+requests. For each detached request, it regularly tests the completion
			
 
				+of the MPI request by calling @code{MPI_Test}. On completion, the data
			
 
				+handle is released, and if a callback was defined, it is called.
			
 
				+
			
 
				+@ref{Communication} gives the list of all the point to point
			
 
				+communications defined in StarPU-MPI.
			
 
				+
			
 
				 @node Exchanging User Defined Data Interface
			
 
				 @section Exchanging User Defined Data Interface
			
 
				 
			
--- a/doc/chapters/perf-feedback.texi
+++ b/doc/chapters/perf-feedback.texi
@@ -411,7 +411,7 @@ display the regression formula, and in the case of non-linear regression, the
 
				 same performance log as for history-based performance models:
			
 
				 
			
 
				 @example
			
 
				-$ starpu_perfmodel_display -s non_linear_memset_regression_based.type
			
 
				+$ starpu_perfmodel_display -s non_linear_memset_regression_based
			
 
				 performance model for cpu_impl_0
			
 
				 	Regression : #sample = 1400
			
 
				 	Linear: y = alpha size ^ beta
			
@@ -429,15 +429,25 @@ a3d3725e	4096           	4.763200e+00   	7.650928e-01   	100
 
				 ...
			
 
				 @end example
			
 
				 
			
 
				-The @code{starpu_perfmodel_plot} tool can be used to draw performance models.
			
 
				-It writes a @code{.gp} file in the current directory, to be run in the
			
 
				-@code{gnuplot} tool, which shows the corresponding curve.
			
 
				-
			
 
				 The same can also be achieved by using StarPU's library API, see
			
 
				 @ref{Performance Model API} and notably the @code{starpu_perfmodel_load_symbol}
			
 
				 function. The source code of the @code{starpu_perfmodel_display} tool can be a
			
 
				 useful example.
			
 
				 
			
 
				+The @code{starpu_perfmodel_plot} tool can be used to draw performance models.
			
 
				+It writes a @code{.gp} file in the current directory, to be run in the
			
 
				+@code{gnuplot} tool, which shows the corresponding curve.
			
 
				+
			
 
				+When the @code{flops} field of tasks is set, @code{starpu_perfmodel_plot} can
			
 
				+directly draw a GFlops curve, by simply adding the @code{-f} option:
			
 
				+
			
 
				+@example
			
 
				+$ starpu_perfmodel_plot -f -s chol_model_11
			
 
				+@end example
			
 
				+
			
 
				+This will however disable displaying the regression model, for which we can not
			
 
				+compute GFlops.
			
 
				+
			
 
				 When the FxT trace file @code{filename} has been generated, it is possible to
			
 
				 get a profiling of each codelet by calling:
			
 
				 @example
			
@@ -453,10 +463,10 @@ This is also available in the @code{starpu_perfmodel_plot} tool, by passing it
 
				 the fxt trace:
			
 
				 
			
 
				 @example
			
 
				-$ starpu_perfmodel_display -s non_linear_memset_regression_based.type -i /tmp/prof_file_foo_0
			
 
				+$ starpu_perfmodel_plot -s non_linear_memset_regression_based -i /tmp/prof_file_foo_0
			
 
				 @end example
			
 
				 
			
 
				-It willd produce a @code{.gp} file which contains both the performance model
			
 
				+It will produce a @code{.gp} file which contains both the performance model
			
 
				 curves, and the profiling measurements.
			
 
				 
			
 
				 If you have the R statistical tool installed, you can additionally use
			
--- a/doc/chapters/perf-optimization.texi
+++ b/doc/chapters/perf-optimization.texi
@@ -228,7 +228,7 @@ int workerids[3] = @{1, 3, 10@};
 
				 int id_ctx = starpu_sched_ctx_create("heft", workerids, 3, "my_ctx");
			
 
				 
			
 
				 /* @b{let StarPU know that the folowing tasks will be submitted to this context} */
			
 
				-starpu_task_set_context(id);
			
 
				+starpu_sched_ctx_set_task_context(id);
			
 
				 
			
 
				 /* @b{submit the task to StarPU} */
			
 
				 starpu_task_submit(task);
			
@@ -548,6 +548,11 @@ The number of devices can be chosen as usual with @code{STARPU_NCPU},
 
				 cpus can be arbitrarily chosen. The number of CUDA and OpenCL devices have to be
			
 
				 lower than the real number on the current machine.
			
 
				 
			
 
				+The amount of simulated GPU memory is for now unbounded by default, but
			
 
				+it can be chosen by hand through the @code{STARPU_LIMIT_CUDA_MEM},
			
 
				+@code{STARPU_LIMIT_CUDA_devid_MEM}, @code{STARPU_LIMIT_OPENCL_MEM}, and
			
 
				+@code{STARPU_LIMIT_OPENCL_devid_MEM} environment variables.
			
 
				+
			
 
				 The Simgrid default stack size is small; to increase it use the
			
 
				 parameter @code{--cfg=contexts/stack_size}, for example:
			
 
				 
			
--- a/doc/chapters/sched_ctx_hypervisor.texi
+++ b/doc/chapters/sched_ctx_hypervisor.texi
@@ -27,7 +27,7 @@ Basic strategies of resizing scheduling contexts already exist but a platform fo
 
				 @section Managing the hypervisor
			
 
				 There is a single hypervisor that is in charge of resizing contexts and the resizing strategy is chosen at the initialization of the hypervisor. A single resize can be done at a time.
			
 
				 
			
 
				-@deftypefun {struct starpu_performance_counters *} sched_ctx_hypervisor_init ({struct sched_ctx_hypervisor_policy *} @var{policy})
			
 
				+@deftypefun {struct starpu_sched_ctx_performance_counters *} sched_ctx_hypervisor_init ({struct sched_ctx_hypervisor_policy *} @var{policy})
			
 
				 Initializes the hypervisor to use the strategy provided as parameter and creates the performance counters (see @pxref{Performance Counters}).
			
 
				 These performance counters represent actually some callbacks that will be used by the contexts to notify the information needed by the hypervisor.
			
 
				 @end deftypefun
			
@@ -200,7 +200,7 @@ or
 
				 @smallexample
			
 
				 starpu_insert_task(&codelet,
			
 
				                     ...,
			
 
				-                    STARPU_FLOPS, 100,
			
 
				+                    STARPU_FLOPS, (double) 100,
			
 
				                     0);
			
 
				 @end smallexample
			
 
				 @end cartouche
			
@@ -210,8 +210,8 @@ starpu_insert_task(&codelet,
 
				 
			
 
				 The Scheduling Context Hypervisor Plugin provides a series of performance counters to StarPU. By incrementing them, StarPU can help the hypervisor in the resizing decision making process.
			
 
				 
			
 
				-@deftp {Data Type} {struct starpu_performance_counters}
			
 
				-@anchor{struct starpu_performance_counters}
			
 
				+@deftp {Data Type} {struct starpu_sched_ctx_performance_counters}
			
 
				+@anchor{struct starpu_sched_ctx_performance_counters}
			
 
				 
			
 
				 @table @asis
			
 
				 @item @code{void (*notify_idle_cycle)(unsigned sched_ctx_id, int worker, double idle_time)}
			
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -16,7 +16,7 @@
 
				 #
			
 
				 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				 
			
 
				-AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) -Werror=implicit
			
 
				+AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
			
 
				 AM_CXXFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
			
 
				 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(MAGMA_LIBS) $(HWLOC_LIBS) @LIBS@
			
 
				 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
			
--- a/examples/basic_examples/block_cpu.c
+++ b/examples/basic_examples/block_cpu.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -26,7 +26,7 @@ void cpu_codelet(void *descr[], void *_args)
 
				         unsigned ldy = STARPU_BLOCK_GET_LDY(descr[0]);
			
 
				         unsigned ldz = STARPU_BLOCK_GET_LDZ(descr[0]);
			
 
				         float *multiplier = (float *)_args;
			
 
				-        unsigned i, j, k;
			
 
				+        int i, j, k;
			
 
				 
			
 
				         for(k=0; k<nz ; k++)
			
 
				 	{
			
--- a/examples/cholesky/cholesky.h
+++ b/examples/cholesky/cholesky.h
@@ -55,6 +55,64 @@
 
				 #define BLAS3_FLOP(n1,n2,n3)    \
			
 
				         (2*((uint64_t)n1)*((uint64_t)n2)*((uint64_t)n3))
			
 
				 
			
 
				+/* This is from magma
			
 
				+
			
 
				+  -- Innovative Computing Laboratory
			
 
				+  -- Electrical Engineering and Computer Science Department
			
 
				+  -- University of Tennessee
			
 
				+  -- (C) Copyright 2009
			
 
				+
			
 
				+  Redistribution  and  use  in  source and binary forms, with or without
			
 
				+  modification,  are  permitted  provided  that the following conditions
			
 
				+  are met:
			
 
				+
			
 
				+  * Redistributions  of  source  code  must  retain  the above copyright
			
 
				+    notice,  this  list  of  conditions  and  the  following  disclaimer.
			
 
				+  * Redistributions  in  binary  form must reproduce the above copyright
			
 
				+    notice,  this list of conditions and the following disclaimer in the
			
 
				+    documentation  and/or other materials provided with the distribution.
			
 
				+  * Neither  the  name of the University of Tennessee, Knoxville nor the
			
 
				+    names of its contributors may be used to endorse or promote products
			
 
				+    derived from this software without specific prior written permission.
			
 
				+
			
 
				+  THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
			
 
				+  ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
			
 
				+  LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
			
 
				+  A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
			
 
				+  HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
			
 
				+  SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
			
 
				+  LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
			
 
				+  DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
			
 
				+  THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
			
 
				+  (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
			
 
				+  OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
			
 
				+
			
 
				+  */
			
 
				+
			
 
				+#define FMULS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n) + 0.5) * (double)(__n) + (1. / 3.)))
			
 
				+#define FADDS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n)      ) * (double)(__n) - (1. / 6.)))
			
 
				+
			
 
				+#define FLOPS_SPOTRF(__n) (     FMULS_POTRF((__n)) +       FADDS_POTRF((__n)) )
			
 
				+
			
 
				+#define FMULS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)+1.))
			
 
				+#define FADDS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)-1.))
			
 
				+
			
 
				+#define FMULS_TRMM(__m, __n) ( /*( (__side) == PlasmaLeft ) ? FMULS_TRMM_2((__m), (__n)) :*/ FMULS_TRMM_2((__n), (__m)) )
			
 
				+#define FADDS_TRMM(__m, __n) ( /*( (__side) == PlasmaLeft ) ? FADDS_TRMM_2((__m), (__n)) :*/ FADDS_TRMM_2((__n), (__m)) )
			
 
				+
			
 
				+#define FMULS_TRSM FMULS_TRMM
			
 
				+#define FADDS_TRSM FADDS_TRMM /* additions count; pairs with FMULS_TRSM above */
			
 
				+
			
 
				+#define FLOPS_STRSM(__m, __n) (     FMULS_TRSM((__m), (__n)) +       FADDS_TRSM((__m), (__n)) )
			
 
				+
			
 
				+
			
 
				+#define FMULS_GEMM(__m, __n, __k) ((double)(__m) * (double)(__n) * (double)(__k))
			
 
				+#define FADDS_GEMM(__m, __n, __k) ((double)(__m) * (double)(__n) * (double)(__k))
			
 
				+
			
 
				+#define FLOPS_SGEMM(__m, __n, __k) (     FMULS_GEMM((__m), (__n), (__k)) +       FADDS_GEMM((__m), (__n), (__k)) )
			
 
				+
			
 
				+/* End of magma code */
			
 
				+
			
 
				 static unsigned size = 4*1024;
			
 
				 static unsigned nblocks = 16;
			
 
				 static unsigned nbigblocks = 8;
			
--- a/examples/cholesky/cholesky_grain_tag.c
+++ b/examples/cholesky/cholesky_grain_tag.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009-2012  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009-2013  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				  *
			
@@ -68,6 +68,9 @@ static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned
 
				 		starpu_tag_declare_deps(TAG11_AUX(k, reclevel), 1, TAG22_AUX(k-1, k, k, reclevel));
			
 
				 	}
			
 
				 
			
 
				+	int n = starpu_matrix_get_nx(task->handles[0]);
			
 
				+	task->flops = FLOPS_SPOTRF(n);
			
 
				+
			
 
				 	return task;
			
 
				 }
			
 
				 
			
@@ -110,6 +113,9 @@ static int create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j, un
 
				 		starpu_tag_declare_deps(TAG21_AUX(k, j, reclevel), 1, TAG11_AUX(k, reclevel));
			
 
				 	}
			
 
				 
			
 
				+	int n = starpu_matrix_get_nx(task->handles[0]);
			
 
				+	task->flops = FLOPS_STRSM(n, n);
			
 
				+
			
 
				 	ret = starpu_task_submit(task);
			
 
				 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				 	return ret;
			
@@ -157,6 +163,9 @@ static int create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, un
 
				 		starpu_tag_declare_deps(TAG22_AUX(k, i, j, reclevel), 2, TAG21_AUX(k, i, reclevel), TAG21_AUX(k, j, reclevel));
			
 
				 	}
			
 
				 
			
 
				+	int n = starpu_matrix_get_nx(task->handles[0]);
			
 
				+	task->flops = FLOPS_SGEMM(n, n, n);
			
 
				+
			
 
				 	ret = starpu_task_submit(task);
			
 
				 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				 	return ret;
			
--- a/examples/cholesky/cholesky_implicit.c
+++ b/examples/cholesky/cholesky_implicit.c
@@ -85,6 +85,8 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
				 	double end;
			
 
				 
			
 
				 	unsigned i,j,k;
			
 
				+	unsigned long n = starpu_matrix_get_nx(dataA);
			
 
				+	unsigned long nn = n/nblocks;
			
 
				 
			
 
				 	int prio_level = noprio?STARPU_DEFAULT_PRIO:STARPU_MAX_PRIO;
			
 
				 
			
@@ -101,6 +103,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
				 					 STARPU_PRIORITY, prio_level,
			
 
				 					 STARPU_RW, sdatakk,
			
 
				 					 STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
			
 
				+					 STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
			
 
				 					 0);
			
 
				 		if (ret == -ENODEV) return 77;
			
 
				 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
			
@@ -113,6 +116,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
				 						 STARPU_PRIORITY, (j == k+1)?prio_level:STARPU_DEFAULT_PRIO,
			
 
				 						 STARPU_R, sdatakk,
			
 
				 						 STARPU_RW, sdatakj,
			
 
				+						 STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
			
 
				 						 0);
			
 
				 			if (ret == -ENODEV) return 77;
			
 
				 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
			
@@ -129,6 +133,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
				 								 STARPU_R, sdataki,
			
 
				 								 STARPU_R, sdatakj,
			
 
				 								 STARPU_RW, sdataij,
			
 
				+								 STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
			
 
				 								 0);
			
 
				 					if (ret == -ENODEV) return 77;
			
 
				 					STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
			
@@ -144,9 +149,8 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
				 	end = starpu_timing_now();
			
 
				 
			
 
				 	double timing = end - start;
			
 
				-	unsigned long n = starpu_matrix_get_nx(dataA);
			
 
				 
			
 
				-	double flop = (1.0f*n*n*n)/3.0f;
			
 
				+	double flop = FLOPS_SPOTRF(n);
			
 
				 
			
 
				 	if(with_ctxs || with_noctxs || chole1 || chole2)
			
 
				 		update_sched_ctx_timing_results((flop/timing/1000.0f), (timing/1000000.0f));
			
--- a/examples/cholesky/cholesky_tag.c
+++ b/examples/cholesky/cholesky_tag.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009-2012  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009-2013  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				  *
			
@@ -69,6 +69,9 @@ static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned
 
				 		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
			
 
				 	}
			
 
				 
			
 
				+	int n = starpu_matrix_get_nx(task->handles[0]);
			
 
				+	task->flops = FLOPS_SPOTRF(n);
			
 
				+
			
 
				 	return task;
			
 
				 }
			
 
				 
			
@@ -109,6 +112,9 @@ static void create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j)
 
				 		starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k));
			
 
				 	}
			
 
				 
			
 
				+	int n = starpu_matrix_get_nx(task->handles[0]);
			
 
				+	task->flops = FLOPS_STRSM(n, n);
			
 
				+
			
 
				 	int ret = starpu_task_submit(task);
			
 
				         if (STARPU_UNLIKELY(ret == -ENODEV))
			
 
				 	{
			
@@ -158,6 +164,9 @@ static void create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, u
 
				 		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
			
 
				 	}
			
 
				 
			
 
				+	int n = starpu_matrix_get_nx(task->handles[0]);
			
 
				+	task->flops = FLOPS_SGEMM(n, n, n);
			
 
				+
			
 
				 	int ret = starpu_task_submit(task);
			
 
				         if (STARPU_UNLIKELY(ret == -ENODEV))
			
 
				 	{
			
--- a/examples/cholesky/cholesky_tile_tag.c
+++ b/examples/cholesky/cholesky_tile_tag.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009-2012  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009-2013  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -71,6 +71,9 @@ static struct starpu_task * create_task_11(unsigned k, unsigned nblocks)
 
				 		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
			
 
				 	}
			
 
				 
			
 
				+	int n = starpu_matrix_get_nx(task->handles[0]);
			
 
				+	task->flops = FLOPS_SPOTRF(n);
			
 
				+
			
 
				 	return task;
			
 
				 }
			
 
				 
			
@@ -113,6 +116,9 @@ static int create_task_21(unsigned k, unsigned j)
 
				 		starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k));
			
 
				 	}
			
 
				 
			
 
				+	int n = starpu_matrix_get_nx(task->handles[0]);
			
 
				+	task->flops = FLOPS_STRSM(n, n);
			
 
				+
			
 
				 	ret = starpu_task_submit(task);
			
 
				 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				 	return ret;
			
@@ -160,6 +166,9 @@ static int create_task_22(unsigned k, unsigned i, unsigned j)
 
				 		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
			
 
				 	}
			
 
				 
			
 
				+	int n = starpu_matrix_get_nx(task->handles[0]);
			
 
				+	task->flops = FLOPS_SGEMM(n, n, n);
			
 
				+
			
 
				 	ret = starpu_task_submit(task);
			
 
				 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				 	return ret;
			
--- a/examples/filters/custom_mf/custom_conversion_codelets.c
+++ b/examples/filters/custom_mf/custom_conversion_codelets.c
@@ -21,7 +21,7 @@
 
				 #ifdef STARPU_USE_CUDA
			
 
				 void cuda_to_cpu(void *buffers[], void *arg)
			
 
				 {
			
 
				-	unsigned int n = CUSTOM_GET_NX(buffers[0]);
			
 
				+	int n = CUSTOM_GET_NX(buffers[0]);
			
 
				 	float *x = (float*) CUSTOM_GET_X_PTR(buffers[0]);
			
 
				 	float *y = (float*) CUSTOM_GET_Y_PTR(buffers[0]);
			
 
				 	struct point *aop;
			
@@ -60,7 +60,7 @@ struct starpu_codelet cuda_to_cpu_cl =
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 void opencl_to_cpu_cpu_func(void *buffers[], void *arg)
			
 
				 {
			
 
				-	unsigned int n = CUSTOM_GET_NX(buffers[0]);
			
 
				+	int n = CUSTOM_GET_NX(buffers[0]);
			
 
				 	float *x = (float *) CUSTOM_GET_OPENCL_X_PTR(buffers[0]);
			
 
				 	struct point *aop;
			
 
				 	aop = (struct point *) CUSTOM_GET_CPU_PTR(buffers[0]);
			
--- a/examples/filters/custom_mf/custom_interface.c
+++ b/examples/filters/custom_mf/custom_interface.c
@@ -46,7 +46,7 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node,
 
				 				    cl_event *event);
			
 
				 #endif /* !STARPU_USE_OPENCL */
			
 
				 
			
 
				-static struct starpu_data_copy_methods custom_copy_data_methods_s =
			
 
				+static const struct starpu_data_copy_methods custom_copy_data_methods_s =
			
 
				 {
			
 
				 	.ram_to_ram = NULL,
			
 
				 #ifdef STARPU_USE_CUDA
			
@@ -98,7 +98,7 @@ static struct starpu_data_interface_ops interface_custom_ops =
 
				 	.get_size              = custom_interface_get_size,
			
 
				 	.footprint             = footprint_custom_interface_crc32,
			
 
				 	.compare               = NULL,
			
 
				-	.interfaceid           = -1,
			
 
				+	.interfaceid           = STARPU_UNKNOWN_INTERFACE_ID,
			
 
				 	.interface_size        = sizeof(struct custom_data_interface),
			
 
				 	.display               = display_custom_interface,
			
 
				 	.is_multiformat        = 1,
			
@@ -276,7 +276,8 @@ void custom_data_register(starpu_data_handle_t *handle,
 
				 		.ops = format_ops
			
 
				 	};
			
 
				 
			
 
				-	if (interface_custom_ops.interfaceid == -1) {
			
 
				+	if (interface_custom_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)
			
 
				+	{
			
 
				 		interface_custom_ops.interfaceid = starpu_data_interface_get_next_id();
			
 
				 	}
			
 
				 	starpu_data_register(handle, home_node, &custom, &interface_custom_ops);
			
--- a/examples/filters/fblock_cpu.c
+++ b/examples/filters/fblock_cpu.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -18,7 +18,7 @@
 
				 
			
 
				 void cpu_func(void *buffers[], void *cl_arg)
			
 
				 {
			
 
				-        unsigned i, j, k;
			
 
				+        int i, j, k;
			
 
				         int *factor = (int *) cl_arg;
			
 
				 	int *block = (int *)STARPU_BLOCK_GET_PTR(buffers[0]);
			
 
				 	int nx = (int)STARPU_BLOCK_GET_NX(buffers[0]);
			
--- a/examples/filters/fblock_opencl.c
+++ b/examples/filters/fblock_opencl.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  * Copyright (C) 2011  Université de Bordeaux 1
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -17,13 +17,13 @@
 
				 
			
 
				 #include <starpu.h>
			
 
				 
			
 
				-#define CHECK_CL_SET_KERNEL_ARG(kernel, n, size, ptr)       \
			
 
				-do                                                          \
			
 
				-{							    \
			
 
				-	int err;                                            \
			
 
				-	err = clSetKernelArg(kernel, n, size, ptr);         \
			
 
				-	if (err != CL_SUCCESS)                              \
			
 
				-       		STARPU_OPENCL_REPORT_ERROR(err);            \
			
 
				+#define CHECK_CL_SET_KERNEL_ARG(kernel, n, size, ptr)       	\
			
 
				+do                                                          	\
			
 
				+{								\
			
 
				+	int check_err;                           	        \
			
 
				+	check_err = clSetKernelArg(kernel, n, size, ptr);       \
			
 
				+	if (check_err != CL_SUCCESS)                            \
			
 
				+       		STARPU_OPENCL_REPORT_ERROR(check_err);          \
			
 
				 } while (0)
			
 
				 
			
 
				 extern struct starpu_opencl_program opencl_program;
			
--- a/examples/filters/fmatrix.c
+++ b/examples/filters/fmatrix.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -43,9 +43,9 @@ void cpu_func(void *buffers[], void *cl_arg)
 
				 
			
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				-	unsigned i, j, n=1;
			
 
				+	unsigned j, n=1;
			
 
				         int matrix[NX*NY];
			
 
				-	int ret;
			
 
				+	int ret, i;
			
 
				 
			
 
				         FPRINTF(stderr,"IN  Matrix: \n");
			
 
				         for(j=0 ; j<NY ; j++)
			
--- a/examples/filters/fvector.c
+++ b/examples/filters/fvector.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -37,7 +37,7 @@ void cpu_func(void *buffers[], void *cl_arg)
 
				 
			
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				-	unsigned i;
			
 
				+	int i;
			
 
				         int vector[NX];
			
 
				         starpu_data_handle_t handle;
			
 
				         int factor=1;
			
--- a/examples/filters/shadow.c
+++ b/examples/filters/shadow.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2012  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -91,11 +91,11 @@ void cuda_func(void *buffers[], void *cl_arg)
 
				 
			
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				-	unsigned i, j;
			
 
				+	unsigned j;
			
 
				         int vector[NX + 2*SHADOW];
			
 
				         int vector2[NX + PARTS*2*SHADOW];
			
 
				 	starpu_data_handle_t handle, handle2;
			
 
				-	int ret;
			
 
				+	int ret, i;
			
 
				 
			
 
				         struct starpu_codelet cl =
			
 
				 	{
			
--- a/examples/interface/complex_interface.c
+++ b/examples/interface/complex_interface.c
@@ -146,139 +146,30 @@ static int complex_unpack_data(starpu_data_handle_t handle, unsigned node, void
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-static int copy_cuda_async_sync(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, enum cudaMemcpyKind kind, cudaStream_t stream)
			
 
				+static int copy_any_to_any(void *src_interface, unsigned src_node,
			
 
				+			   void *dst_interface, unsigned dst_node,
			
 
				+			   void *async_data)
			
 
				 {
			
 
				 	struct starpu_complex_interface *src_complex = src_interface;
			
 
				 	struct starpu_complex_interface *dst_complex = dst_interface;
			
 
				-
			
 
				-	cudaStream_t sstream = stream;
			
 
				-	int ret;
			
 
				-
			
 
				-	ret = starpu_cuda_copy_async_sync((void *)src_complex->real, src_node, (void *)dst_complex->real, dst_node,
			
 
				-					  src_complex->nx*sizeof(src_complex->real[0]), sstream, kind);
			
 
				-	if (ret == 0) sstream = NULL;
			
 
				-
			
 
				-	ret = starpu_cuda_copy_async_sync((char *)src_complex->imaginary, src_node, (char *)dst_complex->imaginary, dst_node,
			
 
				-					  src_complex->nx*sizeof(src_complex->imaginary[0]), sstream, kind);
			
 
				+	int ret = 0;
			
 
				+
			
 
				+	if (starpu_interface_copy((uintptr_t) src_complex->real, 0, src_node,
			
 
				+				    (uintptr_t) dst_complex->real, 0, dst_node,
			
 
				+				     src_complex->nx*sizeof(src_complex->real[0]),
			
 
				+				     async_data))
			
 
				+		ret = -EAGAIN;
			
 
				+	if (starpu_interface_copy((uintptr_t) src_complex->imaginary, 0, src_node,
			
 
				+				    (uintptr_t) dst_complex->imaginary, 0, dst_node,
			
 
				+				     src_complex->nx*sizeof(src_complex->imaginary[0]),
			
 
				+				     async_data))
			
 
				+		ret = -EAGAIN;
			
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-     return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, NULL);
			
 
				-}
			
 
				-
			
 
				-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, stream);
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, NULL);
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, stream);
			
 
				-}
			
 
				-#endif
			
 
				-
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
			
 
				-{
			
 
				-	struct starpu_complex_interface *src_complex = src_interface;
			
 
				-	struct starpu_complex_interface *dst_complex = dst_interface;
			
 
				-	cl_int err;
			
 
				-	int ret;
			
 
				-
			
 
				-	err = starpu_opencl_copy_ram_to_opencl(src_complex->real,
			
 
				-					       src_node,
			
 
				-					       (cl_mem) dst_complex->real,
			
 
				-					       dst_node,
			
 
				-					       src_complex->nx * sizeof(src_complex->real[0]),
			
 
				-					       0,
			
 
				-					       event,
			
 
				-					       &ret);
			
 
				-	if (STARPU_UNLIKELY(err != CL_SUCCESS))
			
 
				-		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-	if (ret == 0)
			
 
				-		event = NULL;
			
 
				-
			
 
				-	err = starpu_opencl_copy_ram_to_opencl(src_complex->imaginary,
			
 
				-					       src_node,
			
 
				-					       (cl_mem) dst_complex->imaginary,
			
 
				-					       dst_node,
			
 
				-					       src_complex->nx * sizeof(src_complex->imaginary[0]),
			
 
				-					       0,
			
 
				-					       event,
			
 
				-					       &ret);
			
 
				-	if (STARPU_UNLIKELY(err != CL_SUCCESS))
			
 
				-		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-
			
 
				-	return ret;
			
 
				-}
			
 
				-
			
 
				-static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-        return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				-}
			
 
				-
			
 
				-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
			
 
				-{
			
 
				-	struct starpu_complex_interface *src_complex = src_interface;
			
 
				-	struct starpu_complex_interface *dst_complex = dst_interface;
			
 
				-	cl_int err;
			
 
				-	int ret;
			
 
				-
			
 
				-	err = starpu_opencl_copy_opencl_to_ram((cl_mem) src_complex->real,
			
 
				-					       src_node,
			
 
				-					       dst_complex->real,
			
 
				-					       dst_node,
			
 
				-					       src_complex->nx * sizeof(src_complex->real[0]),
			
 
				-					       0,
			
 
				-					       event,
			
 
				-					       &ret);
			
 
				-	if (STARPU_UNLIKELY(err != CL_SUCCESS))
			
 
				-		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-	if (ret == 0)
			
 
				-		event = NULL;
			
 
				-
			
 
				-	err = starpu_opencl_copy_opencl_to_ram((cl_mem) src_complex->imaginary,
			
 
				-					       src_node,
			
 
				-					       dst_complex->imaginary,
			
 
				-					       dst_node,
			
 
				-					       src_complex->nx * sizeof(src_complex->imaginary[0]),
			
 
				-					       0,
			
 
				-					       event,
			
 
				-					       &ret);
			
 
				-	if (STARPU_UNLIKELY(err != CL_SUCCESS))
			
 
				-		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-
			
 
				-	return ret;
			
 
				-}
			
 
				-
			
 
				-static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-        return copy_opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				-}
			
 
				-#endif
			
 
				-
			
 
				-static struct starpu_data_copy_methods complex_copy_methods =
			
 
				+static const struct starpu_data_copy_methods complex_copy_methods =
			
 
				 {
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-	.ram_to_cuda = copy_ram_to_cuda,
			
 
				-	.cuda_to_ram = copy_cuda_to_ram,
			
 
				-	.ram_to_cuda_async = copy_ram_to_cuda_async,
			
 
				-	.cuda_to_ram_async = copy_cuda_to_ram_async,
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-	.ram_to_opencl = copy_ram_to_opencl,
			
 
				-	.opencl_to_ram = copy_opencl_to_ram,
			
 
				-	.ram_to_opencl_async = copy_ram_to_opencl_async,
			
 
				-	.opencl_to_ram_async = copy_opencl_to_ram_async,
			
 
				-#endif
			
 
				+	.any_to_any = copy_any_to_any
			
 
				 };
			
 
				 
			
 
				 static struct starpu_data_interface_ops interface_complex_ops =
			
@@ -289,7 +180,7 @@ static struct starpu_data_interface_ops interface_complex_ops =
 
				 	.copy_methods = &complex_copy_methods,
			
 
				 	.get_size = complex_get_size,
			
 
				 	.footprint = complex_footprint,
			
 
				-	.interfaceid = -1,
			
 
				+	.interfaceid = STARPU_UNKNOWN_INTERFACE_ID,
			
 
				 	.interface_size = sizeof(struct starpu_complex_interface),
			
 
				 	.handle_to_pointer = complex_handle_to_pointer,
			
 
				 	.pack_data = complex_pack_data,
			
@@ -305,7 +196,7 @@ void starpu_complex_data_register(starpu_data_handle_t *handleptr, unsigned home
 
				 		.nx = nx
			
 
				 	};
			
 
				 
			
 
				-	if (interface_complex_ops.interfaceid == -1)
			
 
				+	if (interface_complex_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)
			
 
				 	{
			
 
				 		interface_complex_ops.interfaceid = starpu_data_interface_get_next_id();
			
 
				 	}
			
--- a/examples/ppm_downscaler/ppm_downscaler.c
+++ b/examples/ppm_downscaler/ppm_downscaler.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -75,7 +75,7 @@ struct ppm_image *file_to_ppm(char *filename)
 
				 	ret = fread(ppm->data, sizeof(struct ppm_color), ppm->ncols*ppm->nlines, file);
			
 
				 	STARPU_ASSERT(ret == ppm->ncols*ppm->nlines);
			
 
				 
			
 
				-	unsigned i;
			
 
				+	int i;
			
 
				 	for (i = 0; i < ppm->ncols*ppm->nlines; i++)
			
 
				 	{
			
 
				 /*		fprintf(stderr, "READ (index %d) -> r %d g %d b %d\n", i, ppm->data[i].r, ppm->data[i].g, ppm->data[i].b); */
			
@@ -121,7 +121,7 @@ void dummy_downscale(struct ppm_image *input_ppm, struct ppm_image *output_ppm)
 
				 	struct ppm_color *in = input_ppm->data;
			
 
				 	struct ppm_color *out = output_ppm->data;
			
 
				 
			
 
				-	unsigned line, col;
			
 
				+	int line, col;
			
 
				 	for (line = 0; line < output_ppm->nlines; line++)
			
 
				 	{
			
 
				 		for (col = 0; col < output_ppm->ncols; col++)
			
--- a/examples/profiling/profiling.c
+++ b/examples/profiling/profiling.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010-2012  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -107,7 +107,7 @@ int main(int argc, char **argv)
 
				 	FPRINTF(stderr, "Avg. length : %2.2lf us\n", (length_sum)/niter);
			
 
				 
			
 
				 	/* Display the occupancy of all workers during the test */
			
 
				-	int worker;
			
 
				+	unsigned worker;
			
 
				 	for (worker = 0; worker < starpu_worker_get_count(); worker++)
			
 
				 	{
			
 
				 		struct starpu_worker_profiling_info worker_info;
			
--- a/examples/sched_ctx/sched_ctx.c
+++ b/examples/sched_ctx/sched_ctx.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010-2012  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010-2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010-2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -79,7 +79,7 @@ int main(int argc, char **argv)
 
				 	/*indicate what to do with the resources when context 2 finishes (it depends on your application)*/
			
 
				 	starpu_sched_ctx_set_inheritor(sched_ctx2, sched_ctx1);
			
 
				 
			
 
				-	unsigned i;
			
 
				+	int i;
			
 
				 	for (i = 0; i < ntasks/2; i++)
			
 
				 	{
			
 
				 		struct starpu_task *task = starpu_task_create();
			
--- a/examples/sched_ctx_utils/sched_ctx_utils.c
+++ b/examples/sched_ctx_utils/sched_ctx_utils.c
@@ -93,7 +93,7 @@ void* start_bench(void *val)
 
				 	pthread_setspecific(key, &p->id);
			
 
				 
			
 
				 	if(p->ctx != 0)
			
 
				-		starpu_task_set_context(&p->ctx);
			
 
				+		starpu_sched_ctx_set_context(&p->ctx);
			
 
				 
			
 
				 	for(i = 0; i < NSAMPLES; i++)
			
 
				 		p->bench(p->size, p->nblocks);
			
--- a/examples/scheduler/dummy_sched.c
+++ b/examples/scheduler/dummy_sched.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010-2012  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010-2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010-2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -28,7 +28,7 @@ typedef struct dummy_sched_data {
 
				 
			
 
				 static void init_dummy_sched(unsigned sched_ctx_id)
			
 
				 {
			
 
				-	starpu_sched_ctx_create_worker_collection(sched_ctx_id, STARPU_WORKER_LIST);
			
 
				+	starpu_sched_ctx_create_worker_collection(sched_ctx_id, STARPU_SCHED_CTX_WORKER_LIST);
			
 
				 
			
 
				 	struct dummy_sched_data *data = (struct dummy_sched_data*)malloc(sizeof(struct dummy_sched_data));
			
 
				 	
			
@@ -70,7 +70,7 @@ static int push_task_dummy(struct starpu_task *task)
 
				 	   of them would pop for tasks */
			
 
				 	unsigned worker = 0;
			
 
				 	struct starpu_sched_ctx_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
			
 
				-	struct starpu_iterator it;
			
 
				+	struct starpu_sched_ctx_iterator it;
			
 
				 	if(workers->init_iterator)
			
 
				 		workers->init_iterator(workers, &it);
			
 
				 
			
@@ -159,7 +159,7 @@ int main(int argc, char **argv)
 
				 	ntasks /= 100;
			
 
				 #endif
			
 
				 
			
 
				-	unsigned i;
			
 
				+	int i;
			
 
				 	for (i = 0; i < ntasks; i++)
			
 
				 	{
			
 
				 		struct starpu_task *task = starpu_task_create();
			
--- a/examples/stencil/Makefile.am
+++ b/examples/stencil/Makefile.am
@@ -13,10 +13,10 @@
 
				 #
			
 
				 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				 
			
 
				-AM_CFLAGS = $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
			
 
				+AM_CFLAGS = $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
			
 
				 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(HWLOC_LIBS) @LIBS@
			
 
				 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
			
 
				-AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) 
			
 
				+AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS)
			
 
				 
			
 
				 if USE_MPI
			
 
				 LIBS += $(top_builddir)/mpi/src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
			
--- a/examples/stencil/life.c
+++ b/examples/stencil/life.c
@@ -20,7 +20,7 @@
 
				 
			
 
				 void life_update(int bz, const TYPE *old, TYPE *newp, int nx, int ny, int nz, int ldy, int ldz, int iter)
			
 
				 {
			
 
				-	unsigned x, y, z, num, alive;
			
 
				+	int x, y, z, num, alive;
			
 
				 
			
 
				 	for (z = iter; z < nz - iter; z++)
			
 
				 	{
			
--- a/examples/stencil/stencil-blocks.c
+++ b/examples/stencil/stencil-blocks.c
@@ -121,7 +121,7 @@ struct block_description *get_block_description(int z)
 
				 	return &blocks[z];
			
 
				 }
			
 
				 
			
 
				-unsigned get_block_mpi_node(int z)
			
 
				+int get_block_mpi_node(int z)
			
 
				 {
			
 
				 	z = (z + nbz)%nbz;
			
 
				 	return blocks[z].mpi_node;
			
@@ -277,7 +277,7 @@ void allocate_memory_on_node(int rank)
 
				 	{
			
 
				 		struct block_description *block = get_block_description(bz);
			
 
				 
			
 
				-		unsigned node = block->mpi_node;
			
 
				+		int node = block->mpi_node;
			
 
				 
			
 
				 		unsigned size_bz = block_sizes_z[bz];
			
 
				 	
			
@@ -301,7 +301,7 @@ void allocate_memory_on_node(int rank)
 
				 		}
			
 
				 
			
 
				 		/* Boundary blocks : Top */
			
 
				-		unsigned top_node = block->boundary_blocks[T]->mpi_node;
			
 
				+		int top_node = block->boundary_blocks[T]->mpi_node;
			
 
				 		if ((node == rank) || (top_node == rank))
			
 
				 		{
			
 
				 			allocate_block_on_node(&block->boundaries_handle[T][0], &block->boundaries[T][0],
			
@@ -311,7 +311,7 @@ void allocate_memory_on_node(int rank)
 
				 		} 
			
 
				 
			
 
				 		/* Boundary blocks : Bottom */
			
 
				-		unsigned bottom_node = block->boundary_blocks[B]->mpi_node;
			
 
				+		int bottom_node = block->boundary_blocks[B]->mpi_node;
			
 
				 		if ((node == rank) || (bottom_node == rank))
			
 
				 		{
			
 
				 			allocate_block_on_node(&block->boundaries_handle[B][0], &block->boundaries[B][0],
			
@@ -330,7 +330,7 @@ void check(int rank)
 
				 	{
			
 
				 		struct block_description *block = get_block_description(bz);
			
 
				 
			
 
				-		unsigned node = block->mpi_node;
			
 
				+		int node = block->mpi_node;
			
 
				 
			
 
				 		/* Main blocks */
			
 
				 		if (node == rank)
			
--- a/examples/stencil/stencil-tasks.c
+++ b/examples/stencil/stencil-tasks.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -40,7 +40,7 @@
 
				  */
			
 
				 
			
 
				 /* R(z) = R(z+d) = local, just call the save kernel */
			
 
				-static void create_task_save_local(unsigned iter, unsigned z, int dir, unsigned local_rank)
			
 
				+static void create_task_save_local(unsigned iter, unsigned z, int dir, int local_rank)
			
 
				 {
			
 
				 	struct starpu_task *save_task = starpu_task_create();
			
 
				 	struct block_description *descr = get_block_description(z);
			
@@ -81,7 +81,7 @@ static void send_done(void *arg)
 
				 
			
 
				 #ifdef STARPU_USE_MPI
			
 
				 /* Post MPI send */
			
 
				-static void create_task_save_mpi_send(unsigned iter, unsigned z, int dir, unsigned local_rank)
			
 
				+static void create_task_save_mpi_send(unsigned iter, unsigned z, int dir, int local_rank)
			
 
				 {
			
 
				 	struct block_description *descr = get_block_description(z);
			
 
				 	STARPU_ASSERT(descr->mpi_node == local_rank);
			
@@ -108,7 +108,7 @@ static void recv_done(void *arg)
 
				 }
			
 
				 
			
 
				 /* Post MPI recv */
			
 
				-static void create_task_save_mpi_recv(unsigned iter, unsigned z, int dir, unsigned local_rank)
			
 
				+static void create_task_save_mpi_recv(unsigned iter, unsigned z, int dir, int local_rank)
			
 
				 {
			
 
				 	struct block_description *descr = get_block_description(z);
			
 
				 	STARPU_ASSERT(descr->mpi_node != local_rank);
			
@@ -129,10 +129,10 @@ static void create_task_save_mpi_recv(unsigned iter, unsigned z, int dir, unsign
 
				 /*
			
 
				  * Schedule saving boundaries of blocks to communication buffers
			
 
				  */
			
 
				-void create_task_save(unsigned iter, unsigned z, int dir, unsigned local_rank)
			
 
				+void create_task_save(unsigned iter, unsigned z, int dir, int local_rank)
			
 
				 {
			
 
				-	unsigned node_z = get_block_mpi_node(z);
			
 
				-	unsigned node_z_and_d = get_block_mpi_node(z+dir);
			
 
				+	int node_z = get_block_mpi_node(z);
			
 
				+	int node_z_and_d = get_block_mpi_node(z+dir);
			
 
				 
			
 
				 #ifdef STARPU_USE_MPI
			
 
				 	if (node_z == local_rank)
			
@@ -168,7 +168,7 @@ void create_task_save(unsigned iter, unsigned z, int dir, unsigned local_rank)
 
				  * Schedule update computation in computation buffer
			
 
				  */
			
 
				 
			
 
				-void create_task_update(unsigned iter, unsigned z, unsigned local_rank)
			
 
				+void create_task_update(unsigned iter, unsigned z, int local_rank)
			
 
				 {
			
 
				 	STARPU_ASSERT(iter != 0);
			
 
				 
			
@@ -253,8 +253,8 @@ void create_start_task(int z, int dir)
 
				  */
			
 
				 void create_tasks(int rank)
			
 
				 {
			
 
				-	unsigned iter;
			
 
				-	unsigned bz;
			
 
				+	int iter;
			
 
				+	int bz;
			
 
				 	int niter = get_niter();
			
 
				 	int nbz = get_nbz();
			
 
				 
			
@@ -288,7 +288,7 @@ void create_tasks(int rank)
 
				  */
			
 
				 void wait_end_tasks(int rank)
			
 
				 {
			
 
				-	unsigned bz;
			
 
				+	int bz;
			
 
				 	int nbz = get_nbz();
			
 
				 
			
 
				 	for (bz = 0; bz < nbz; bz++)
			
--- a/examples/stencil/stencil.c
+++ b/examples/stencil/stencil.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  * Copyright (C) 2010-2012  Université de Bordeaux 1
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -221,7 +221,7 @@ int main(int argc, char **argv)
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				 #ifdef STARPU_USE_MPI
			
 
				-	starpu_mpi_initialize();
			
 
				+	starpu_mpi_init(NULL, NULL, 0);
			
 
				 #endif
			
 
				 
			
 
				 #ifdef STARPU_USE_OPENCL
			
@@ -312,8 +312,8 @@ int main(int argc, char **argv)
 
				 #if 1
			
 
				 		unsigned nzblocks_per_process = (nbz + world_size - 1) / world_size;
			
 
				 
			
 
				-		unsigned bz, iter;
			
 
				-		unsigned last;
			
 
				+		int iter;
			
 
				+		unsigned last, bz;
			
 
				 		for (iter = 0; iter < who_runs_what_len; iter++)
			
 
				 		{
			
 
				 			last = 1;
			
--- a/examples/stencil/stencil.h
+++ b/examples/stencil/stencil.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -56,8 +56,8 @@ typedef enum
 
				 struct block_description
			
 
				 {
			
 
				 	/* Which MPI node should process that block ? */
			
 
				-	unsigned mpi_node;
			
 
				-	
			
 
				+	int mpi_node;
			
 
				+
			
 
				 	unsigned preferred_worker;
			
 
				 
			
 
				 	unsigned bz;
			
@@ -101,7 +101,7 @@ void check(int rank);
 
				 
			
 
				 void display_memory_consumption(int rank);
			
 
				 
			
 
				-unsigned get_block_mpi_node(int z);
			
 
				+int get_block_mpi_node(int z);
			
 
				 unsigned get_block_size(int z);
			
 
				 unsigned get_bind_tasks(void);
			
 
				 
			
@@ -111,8 +111,8 @@ unsigned get_ticks(void);
 
				 
			
 
				 unsigned global_workerid(unsigned local_workerid);
			
 
				 
			
 
				-void create_task_update(unsigned iter, unsigned z, unsigned local_rank);
			
 
				-void create_task_save(unsigned iter, unsigned z, int dir, unsigned local_rank);
			
 
				+void create_task_update(unsigned iter, unsigned z, int local_rank);
			
 
				+void create_task_save(unsigned iter, unsigned z, int dir, int local_rank);
			
 
				 
			
 
				 extern int starpu_mpi_initialize(void);
			
 
				 extern int starpu_mpi_shutdown(void);
			
--- a/examples/tag_example/tag_example2.c
+++ b/examples/tag_example/tag_example2.c
@@ -76,7 +76,7 @@ static void tag_cleanup_grid(unsigned ni, unsigned iter)
 
				 
			
 
				 static int create_task_grid(unsigned iter)
			
 
				 {
			
 
				-	int i;
			
 
				+	unsigned i;
			
 
				 	int ret;
			
 
				 
			
 
				 /*	FPRINTF(stderr, "start iter %d ni %d...\n", iter, ni); */
			
--- a/include/starpu_cuda.h
+++ b/include/starpu_cuda.h
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010-2012  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -39,7 +39,6 @@ void starpu_cuda_report_error(const char *func, const char *file, int line, cuda
 
				 #define STARPU_CUDA_REPORT_ERROR(status) \
			
 
				 	starpu_cuda_report_error(__starpu_func__, __FILE__, __LINE__, status)
			
 
				 
			
 
				-size_t starpu_cuda_get_global_mem_size(unsigned devid);
			
 
				 cudaStream_t starpu_cuda_get_local_stream(void);
			
 
				 
			
 
				 const struct cudaDeviceProp *starpu_cuda_get_device_properties(unsigned workerid);
			
--- a/include/starpu_data.h
+++ b/include/starpu_data.h
@@ -85,7 +85,7 @@ int starpu_data_acquire_on_node_cb(starpu_data_handle_t handle, unsigned node, e
 
				 void starpu_data_release(starpu_data_handle_t handle);
			
 
				 void starpu_data_release_on_node(starpu_data_handle_t handle, unsigned node);
			
 
				 
			
 
				-void starpu_malloc_set_align(size_t);
			
 
				+void starpu_malloc_set_align(size_t align);
			
 
				 int starpu_malloc(void **A, size_t dim);
			
 
				 int starpu_free(void *A);
			
 
				 void starpu_memory_display_stats();
			
--- a/include/starpu_data_interfaces.h
+++ b/include/starpu_data_interfaces.h
@@ -73,10 +73,15 @@ struct starpu_data_copy_methods
 
				 	int (*opencl_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event);
			
 
				 	int (*opencl_to_opencl_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event);
			
 
				 #endif
			
 
				+
			
 
				+	int (*any_to_any)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
			
 
				 };
			
 
				 
			
 
				+int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, void *async_data);
			
 
				+
			
 
				 enum starpu_data_interface_id
			
 
				 {
			
 
				+	STARPU_UNKNOWN_INTERFACE_ID = -1,
			
 
				 	STARPU_MATRIX_INTERFACE_ID=0,
			
 
				 	STARPU_BLOCK_INTERFACE_ID=1,
			
 
				 	STARPU_VECTOR_INTERFACE_ID=2,
			
@@ -99,7 +104,7 @@ struct starpu_data_interface_ops
 
				 	/* Free data of the interface on a given node. */
			
 
				 	void (*free_data_on_node)(void *data_interface, unsigned node);
			
 
				 	/* ram/cuda/opencl synchronous and asynchronous transfer methods */
			
 
				-	struct starpu_data_copy_methods *copy_methods;
			
 
				+	const struct starpu_data_copy_methods *copy_methods;
			
 
				 	/* Return the current pointer (if any) for the handle on the given node. */
			
 
				 	void * (*handle_to_pointer)(starpu_data_handle_t handle, unsigned node);
			
 
				 	/* Return an estimation of the size of data, for performance models */
			
--- a/include/starpu_opencl.h
+++ b/include/starpu_opencl.h
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010-2013  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -53,7 +53,6 @@ struct starpu_opencl_program
 
				 	cl_program programs[STARPU_MAXOPENCLDEVS];
			
 
				 };
			
 
				 
			
 
				-size_t starpu_opencl_get_global_mem_size(int devid);
			
 
				 void starpu_opencl_get_context(int devid, cl_context *context);
			
 
				 void starpu_opencl_get_device(int devid, cl_device_id *device);
			
 
				 void starpu_opencl_get_queue(int devid, cl_command_queue *queue);
			
@@ -108,7 +107,7 @@ cl_int starpu_opencl_copy_opencl_to_ram(cl_mem buffer, unsigned src_node, void *
 
				 
			
 
				 cl_int starpu_opencl_copy_opencl_to_opencl(cl_mem src, unsigned src_node, size_t src_offset, cl_mem dst, unsigned dst_node, size_t dst_offset, size_t size, cl_event *event, int *ret);
			
 
				 
			
 
				-cl_int starpu_opencl_copy_async_sync(uintptr_t src, unsigned src_node, size_t src_offset, uintptr_t dst, unsigned dst_node, size_t dst_offset, size_t size, cl_event *event);
			
 
				+cl_int starpu_opencl_copy_async_sync(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, cl_event *event);
			
 
				 
			
 
				 #ifdef __cplusplus
			
 
				 }
			
--- a/include/starpu_perfmodel.h
+++ b/include/starpu_perfmodel.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2012  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2013  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
@@ -102,6 +102,8 @@ struct starpu_perfmodel_history_entry
 
				 #else
			
 
				 	size_t size; /* in bytes */
			
 
				 #endif
			
 
				+
			
 
				+	double flops; /* Provided by the application */
			
 
				 };
			
 
				 
			
 
				 struct starpu_perfmodel_history_list
			
@@ -212,6 +214,10 @@ void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct star
 
				 void starpu_bus_print_bandwidth(FILE *f);
			
 
				 void starpu_bus_print_affinity(FILE *f);
			
 
				 
			
 
				+/* use bw & latency to compute the velocity of resources*/
			
 
				+double starpu_get_bandwidth_RAM_CUDA(unsigned cudadev);
			
 
				+double starpu_get_latency_RAM_CUDA(unsigned cudadev);
			
 
				+
			
 
				 #ifdef __cplusplus
			
 
				 }
			
 
				 #endif
			
--- a/include/starpu_sched_ctx.h
+++ b/include/starpu_sched_ctx.h
@@ -24,12 +24,8 @@ extern "C"
 
				 {
			
 
				 #endif
			
 
				 
			
 
				-#ifdef STARPU_DEVEL
			
 
				-#  warning rename all objects to start with starpu_sched_ctx
			
 
				-#endif
			
 
				-
			
 
				-//struct starpu_iterator;
			
 
				-struct starpu_iterator
			
 
				+//struct starpu_sched_ctx_iterator;
			
 
				+struct starpu_sched_ctx_iterator
			
 
				 {
			
 
				 	int cursor;
			
 
				 };
			
@@ -42,12 +38,12 @@ struct starpu_sched_ctx_worker_collection
 
				 	void *workerids;
			
 
				 	/* the number of workers in the collection */
			
 
				 	unsigned nworkers;
			
 
				-	/* the type of structure (STARPU_WORKER_LIST,...) */
			
 
				+	/* the type of structure (STARPU_SCHED_CTX_WORKER_LIST,...) */
			
 
				 	int type;
			
 
				 	/* checks if there is another element in collection */
			
 
				-	unsigned (*has_next)(struct starpu_sched_ctx_worker_collection *workers, struct starpu_iterator *it);
			
 
				+	unsigned (*has_next)(struct starpu_sched_ctx_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
			
 
				 	/* return the next element in the collection */
			
 
				-	int (*get_next)(struct starpu_sched_ctx_worker_collection *workers, struct starpu_iterator *it);
			
 
				+	int (*get_next)(struct starpu_sched_ctx_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
			
 
				 	/* add a new element in the collection */
			
 
				 	int (*add)(struct starpu_sched_ctx_worker_collection *workers, int worker);
			
 
				 	/* remove an element from the collection */
			
@@ -57,26 +53,26 @@ struct starpu_sched_ctx_worker_collection
 
				 	/* free the structure */
			
 
				 	void (*deinit)(struct starpu_sched_ctx_worker_collection *workers);
			
 
				 	/* initialize the cursor if there is one */
			
 
				-	void (*init_iterator)(struct starpu_sched_ctx_worker_collection *workers, struct starpu_iterator *it);
			
 
				+	void (*init_iterator)(struct starpu_sched_ctx_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
			
 
				 };
			
 
				 
			
 
				 /* types of structures the worker collection can implement */
			
 
				-#define STARPU_WORKER_LIST 0
			
 
				+#define STARPU_SCHED_CTX_WORKER_LIST 0
			
 
				 
			
 
				-struct starpu_performance_counters
			
 
				+struct starpu_sched_ctx_performance_counters
			
 
				 {
			
 
				 	void (*notify_idle_cycle)(unsigned sched_ctx_id, int worker, double idle_time);
			
 
				 	void (*notify_idle_end)(unsigned sched_ctx_id, int worker);
			
 
				 	void (*notify_pushed_task)(unsigned sched_ctx_id, int worker);
			
 
				-	void (*notify_poped_task)(unsigned sched_ctx_id, int worker, double flops);
			
 
				+	void (*notify_poped_task)(unsigned sched_ctx_id, int worker, double flops, size_t data_size);
			
 
				 	void (*notify_post_exec_hook)(unsigned sched_ctx_id, int taskid);
			
 
				 	void (*notify_submitted_job)(struct starpu_task *task, uint32_t footprint);
			
 
				 };
			
 
				 
			
 
				 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
			
 
				-void starpu_set_perf_counters(unsigned sched_ctx_id, struct starpu_performance_counters *perf_counters);
			
 
				-void starpu_call_poped_task_cb(int workerid, unsigned sched_ctx_id, double flops);
			
 
				-void starpu_call_pushed_task_cb(int workerid, unsigned sched_ctx_id);
			
 
				+void starpu_sched_ctx_set_perf_counters(unsigned sched_ctx_id, struct starpu_sched_ctx_performance_counters *perf_counters);
			
 
				+void starpu_sched_ctx_call_poped_task_cb(int workerid, unsigned sched_ctx_id, double flops, size_t data_size);
			
 
				+void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id);
			
 
				 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
			
 
				 
			
 
				 unsigned starpu_sched_ctx_create(const char *policy_name, int *workerids_ctx, int nworkers_ctx, const char *sched_ctx_name);
			
@@ -102,16 +98,16 @@ void starpu_sched_ctx_delete_worker_collection(unsigned sched_ctx_id);
 
				 struct starpu_sched_ctx_worker_collection* starpu_sched_ctx_get_worker_collection(unsigned sched_ctx_id);
			
 
				 
			
 
				 #if !defined(_MSC_VER) && !defined(STARPU_SIMGRID)
			
 
				-pthread_mutex_t* starpu_get_changing_ctx_mutex(unsigned sched_ctx_id);
			
 
				+pthread_mutex_t* starpu_sched_ctx_get_changing_ctx_mutex(unsigned sched_ctx_id);
			
 
				 #endif
			
 
				 
			
 
				-void starpu_task_set_context(unsigned *sched_ctx_id);
			
 
				+void starpu_sched_ctx_set_context(unsigned *sched_ctx_id);
			
 
				 
			
 
				-unsigned starpu_task_get_context(void);
			
 
				+unsigned starpu_sched_ctx_get_context(void);
			
 
				 
			
 
				-void starpu_notify_hypervisor_exists(void);
			
 
				+void starpu_sched_ctx_notify_hypervisor_exists(void);
			
 
				 
			
 
				-unsigned starpu_check_if_hypervisor_exists(void);
			
 
				+unsigned starpu_sched_ctx_check_if_hypervisor_exists(void);
			
 
				 
			
 
				 unsigned starpu_sched_ctx_get_nworkers(unsigned sched_ctx_id);
			
 
				 
			
@@ -121,13 +117,13 @@ unsigned starpu_sched_ctx_contains_worker(int workerid, unsigned sched_ctx_id);
 
				 
			
 
				 unsigned starpu_sched_ctx_overlapping_ctxs_on_worker(int workerid);
			
 
				 
			
 
				-unsigned starpu_is_ctxs_turn(int workerid, unsigned sched_ctx_id);
			
 
				+unsigned starpu_sched_ctx_is_ctxs_turn(int workerid, unsigned sched_ctx_id);
			
 
				 
			
 
				-void starpu_set_turn_to_other_ctx(int workerid, unsigned sched_ctx_id);
			
 
				+void starpu_sched_ctx_set_turn_to_other_ctx(int workerid, unsigned sched_ctx_id);
			
 
				 
			
 
				-double starpu_get_max_time_worker_on_ctx(void);
			
 
				+double starpu_sched_ctx_get_max_time_worker_on_ctx(void);
			
 
				 
			
 
				-void starpu_stop_task_submission(void);
			
 
				+void starpu_sched_ctx_stop_task_submission(void);
			
 
				 
			
 
				 void starpu_sched_ctx_set_inheritor(unsigned sched_ctx_id, unsigned inheritor);
			
 
				 
			
--- a/include/starpu_task.h
+++ b/include/starpu_task.h
@@ -321,6 +321,9 @@ int starpu_task_wait_for_all_in_ctx(unsigned sched_ctx_id);
 
				 /* This function waits until there is no more ready task. */
			
 
				 int starpu_task_wait_for_no_ready(void);
			
 
				 
			
 
				+int starpu_task_nready(void);
			
 
				+int starpu_task_nsubmitted(void);
			
 
				+
			
 
				 void starpu_codelet_init(struct starpu_codelet *cl);
			
 
				 
			
 
				 void starpu_display_codelet_stats(struct starpu_codelet *cl);
			
--- a/include/starpu_task_util.h
+++ b/include/starpu_task_util.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2012  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2013  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -45,7 +45,7 @@ void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t
 
				 #define STARPU_DATA_ARRAY       (1<<11) /* Array of data handles */
			
 
				 #define STARPU_TAG       (1<<12) /* Tag */
			
 
				 #define STARPU_HYPERVISOR_TAG	(1<<13)	/* Used to tag a task after whose execution we'll execute  a code */
			
 
				-#define STARPU_HYPERVISOR_FLOPS	(1<<14)	/* Used to specify the number of flops needed to be executed by a task */
			
 
				+#define STARPU_FLOPS	(1<<14)	/* Used to specify the number of flops needed to be executed by a task */
			
 
				 
			
 
				 /* Wrapper to create a task. */
			
 
				 int starpu_insert_task(struct starpu_codelet *cl, ...);
			
--- a/include/starpu_util.h
+++ b/include/starpu_util.h
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010-2012  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -207,17 +207,17 @@ static __inline int starpu_get_env_number(const char *str)
 
				 	if (strval)
			
 
				 	{
			
 
				 		/* the env variable was actually set */
			
 
				-		unsigned val;
			
 
				+		long int val;
			
 
				 		char *check;
			
 
				 
			
 
				-		val = (int)strtol(strval, &check, 10);
			
 
				+		val = strtol(strval, &check, 10);
			
 
				 		if (*check) {
			
 
				 			fprintf(stderr,"The %s environment variable must contain an integer\n", str);
			
 
				 			STARPU_ABORT();
			
 
				 		}
			
 
				 
			
 
				 		/* fprintf(stderr, "ENV %s WAS %d\n", str, val); */
			
 
				-		return val;
			
 
				+		return (int)val;
			
 
				 	}
			
 
				 	else
			
 
				 	{
			
--- a/mpi/examples/Makefile.am
+++ b/mpi/examples/Makefile.am
@@ -75,7 +75,7 @@ NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -
 
				 	$(NVCC) $< -c -o $@ $(NVCCFLAGS)
			
 
				 endif
			
 
				 
			
 
				-AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Werror=implicit
			
 
				+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
			
 
				 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
			
 
				 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include -I$(top_srcdir)/mpi/include
			
 
				 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS)
			
--- a/mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
+++ b/mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
@@ -72,10 +72,9 @@ void dw_cholesky(float ***matA, unsigned size, unsigned ld, unsigned nblocks, in
 
				 	struct timeval start;
			
 
				 	struct timeval end;
			
 
				 	starpu_data_handle_t **data_handles;
			
 
				-	int x, y;
			
 
				+	unsigned x,y,i,j,k;
			
 
				 
			
 
				 	/* create all the DAG nodes */
			
 
				-	unsigned i,j,k;
			
 
				 
			
 
				 	data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
			
 
				 	for(x=0 ; x<nblocks ; x++) data_handles[x] = malloc(nblocks*sizeof(starpu_data_handle_t));
			
--- a/mpi/examples/matrix_decomposition/mpi_decomposition_params.c
+++ b/mpi/examples/matrix_decomposition/mpi_decomposition_params.c
@@ -25,8 +25,8 @@ unsigned nblocks = 16;
 
				 unsigned nbigblocks = 2;
			
 
				 unsigned noprio = 0;
			
 
				 unsigned display = 0;
			
 
				-unsigned dblockx = -1;
			
 
				-unsigned dblocky = -1;
			
 
				+int dblockx = -1;
			
 
				+int dblocky = -1;
			
 
				 
			
 
				 void parse_args(int argc, char **argv, int nodes)
			
 
				 {
			
--- a/mpi/examples/mpi_lu/plu_example.c
+++ b/mpi/examples/mpi_lu/plu_example.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -32,8 +32,8 @@
 
				 static unsigned long size = 4096;
			
 
				 static unsigned nblocks = 16;
			
 
				 static unsigned check = 0;
			
 
				-static unsigned p = 1;
			
 
				-static unsigned q = 1;
			
 
				+static int p = 1;
			
 
				+static int q = 1;
			
 
				 static unsigned display = 0;
			
 
				 
			
 
				 #ifdef STARPU_HAVE_LIBNUMA
			
--- a/mpi/examples/mpi_lu/pxlu.c
+++ b/mpi/examples/mpi_lu/pxlu.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -101,7 +101,7 @@ static void send_data_to_mask(starpu_data_handle_t handle, int *rank_mask, int m
 
				 	int mpi_tag_array[world_size];
			
 
				 	starpu_data_handle_t handle_array[world_size];
			
 
				 
			
 
				-	unsigned r;
			
 
				+	int r;
			
 
				 	for (r = 0; r < world_size; r++)
			
 
				 	{
			
 
				 		if (rank_mask[r]) {
			
--- a/mpi/src/Makefile.am
+++ b/mpi/src/Makefile.am
@@ -21,7 +21,7 @@ BUILT_SOURCES =
 
				 
			
 
				 CLEANFILES = *.gcno *.gcda *.linkinfo
			
 
				 
			
 
				-AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Werror=implicit
			
 
				+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS)
			
 
				 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
			
 
				 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/src/ -I$(top_builddir)/src -I$(top_builddir)/include -I$(top_srcdir)/mpi/include -I$(top_srcdir)/mpi/src
			
 
				 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS)
			
--- a/mpi/src/starpu_mpi.c
+++ b/mpi/src/starpu_mpi.c
@@ -443,12 +443,12 @@ static void _starpu_mpi_test_func(struct _starpu_mpi_req *testing_req)
 
				 	_STARPU_MPI_DEBUG("Test request %p - mpitag %d - TYPE %s %d\n", &req->request, req->mpi_tag, _starpu_mpi_request_type(req->request_type), req->srcdst);
			
 
				 
			
 
				 	TRACE_MPI_UTESTING_BEGIN(req->srcdst, req->mpi_tag);
			
 
				-	
			
 
				+
			
 
				 	req->ret = MPI_Test(&req->request, testing_req->flag, testing_req->status);
			
 
				 	STARPU_ASSERT(req->ret == MPI_SUCCESS);
			
 
				 
			
 
				 	TRACE_MPI_UTESTING_END(req->srcdst, req->mpi_tag);
			
 
				-	
			
 
				+
			
 
				 	if (*testing_req->flag)
			
 
				 	{
			
 
				 		testing_req->ret = req->ret;
			
@@ -841,6 +841,17 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
				 		_starpu_mpi_print_thread_level_support(provided, " has been initialized with");
			
 
				 	}
			
 
				 
			
 
				+	{
			
 
				+	     int rank, worldsize;
			
 
				+	     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	     MPI_Comm_size(MPI_COMM_WORLD, &worldsize);
			
 
				+	     TRACE_MPI_START(rank, worldsize);
			
 
				+#ifdef STARPU_USE_FXT
			
 
				+	     starpu_set_profiling_id(rank);
			
 
				+#endif //STARPU_USE_FXT
			
 
				+	}
			
 
				+
			
 
				+
			
 
				 	/* notify the main thread that the progression thread is ready */
			
 
				 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
			
 
				 	running = 1;
			
@@ -862,7 +873,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
				 			_STARPU_MPI_DEBUG("NO MORE REQUESTS TO HANDLE\n");
			
 
				 
			
 
				 			TRACE_MPI_SLEEP_BEGIN();
			
 
				-			
			
 
				+
			
 
				 			if (barrier_running)
			
 
				 				/* Tell mpi_barrier */
			
 
				 				_STARPU_PTHREAD_COND_SIGNAL(&cond_finished);
			
@@ -967,13 +978,6 @@ int _starpu_mpi_initialize(int *argc, char ***argv, int initialize_mpi)
 
				 	argc_argv->argc = argc;
			
 
				 	argc_argv->argv = argv;
			
 
				 
			
 
				-	int rank, worldsize;
			
 
				-
			
 
				-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-	MPI_Comm_size(MPI_COMM_WORLD, &worldsize);
			
 
				-
			
 
				-	TRACE_MPI_START(rank,worldsize);
			
 
				-
			
 
				 	_STARPU_PTHREAD_CREATE("MPI progress", &progress_thread, NULL, _starpu_mpi_progress_thread_func, argc_argv);
			
 
				 
			
 
				 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
			
@@ -981,10 +985,6 @@ int _starpu_mpi_initialize(int *argc, char ***argv, int initialize_mpi)
 
				 		_STARPU_PTHREAD_COND_WAIT(&cond_progression, &mutex);
			
 
				 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
			
 
				 
			
 
				-#ifdef STARPU_USE_FXT
			
 
				-	starpu_set_profiling_id(rank);
			
 
				-#endif //STARPU_USE_FXT
			
 
				-
			
 
				 #ifdef USE_STARPU_ACTIVITY
			
 
				 	hookid = starpu_progression_hook_register(progression_hook_func, NULL);
			
 
				 	STARPU_ASSERT(hookid >= 0);
			
@@ -1053,4 +1053,3 @@ int starpu_mpi_shutdown(void)
 
				 
			
 
				 	return 0;
			
 
				 }
			
 
				-
			
--- a/mpi/src/starpu_mpi_stats.c
+++ b/mpi/src/starpu_mpi_stats.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -70,7 +70,7 @@ void starpu_mpi_comm_amounts_retrieve(size_t *comm_amounts)
 
				 
			
 
				 void _starpu_mpi_comm_amounts_display(int node)
			
 
				 {
			
 
				-	unsigned dst;
			
 
				+	int dst;
			
 
				 	size_t sum = 0;
			
 
				 
			
 
				 	if (stats_enabled == 0) return;
			
--- a/mpi/tests/Makefile.am
+++ b/mpi/tests/Makefile.am
@@ -64,7 +64,7 @@ NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -
 
				 	$(NVCC) $< -c -o $@ $(NVCCFLAGS)
			
 
				 endif
			
 
				 
			
 
				-AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Werror=implicit
			
 
				+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
			
 
				 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
			
 
				 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include -I$(top_srcdir)/mpi/include -I$(top_srcdir)/src -I$(top_builddir)/src -I$(top_srcdir)/examples/
			
 
				 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS)
			
--- a/mpi/tests/mpi_detached_tag.c
+++ b/mpi/tests/mpi_detached_tag.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -54,8 +54,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
			
 
				 
			
 
				-	unsigned nloops = NITER;
			
 
				-	unsigned loop;
			
 
				+	int nloops = NITER;
			
 
				+	int loop;
			
 
				 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
			
 
				 
			
 
				 	for (loop = 0; loop < nloops; loop++)
			
--- a/mpi/tests/mpi_irecv.c
+++ b/mpi/tests/mpi_irecv.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -54,8 +54,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
			
 
				 
			
 
				-	unsigned nloops = NITER;
			
 
				-	unsigned loop;
			
 
				+	int nloops = NITER;
			
 
				+	int loop;
			
 
				 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
			
 
				 
			
 
				 	for (loop = 0; loop < nloops; loop++)
			
--- a/mpi/tests/mpi_irecv_detached.c
+++ b/mpi/tests/mpi_irecv_detached.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010, 2012  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -69,8 +69,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
			
 
				 
			
 
				-	unsigned nloops = NITER;
			
 
				-	unsigned loop;
			
 
				+	int nloops = NITER;
			
 
				+	int loop;
			
 
				 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
			
 
				 
			
 
				 	for (loop = 0; loop < nloops; loop++)
			
--- a/mpi/tests/mpi_isend.c
+++ b/mpi/tests/mpi_isend.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -54,8 +54,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
			
 
				 
			
 
				-	unsigned nloops = NITER;
			
 
				-	unsigned loop;
			
 
				+	int nloops = NITER;
			
 
				+	int loop;
			
 
				 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
			
 
				 
			
 
				 	for (loop = 0; loop < nloops; loop++)
			
--- a/mpi/tests/mpi_isend_detached.c
+++ b/mpi/tests/mpi_isend_detached.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010, 2012  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -68,8 +68,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
			
 
				 
			
 
				-	unsigned nloops = NITER;
			
 
				-	unsigned loop;
			
 
				+	int nloops = NITER;
			
 
				+	int loop;
			
 
				 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
			
 
				 
			
 
				 	for (loop = 0; loop < nloops; loop++)
			
--- a/mpi/tests/mpi_probe.c
+++ b/mpi/tests/mpi_probe.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010, 2012  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -69,8 +69,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
			
 
				 
			
 
				-	unsigned nloops = NITER;
			
 
				-	unsigned loop;
			
 
				+	int nloops = NITER;
			
 
				+	int loop;
			
 
				 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
			
 
				 
			
 
				 	for (loop = 0; loop < nloops; loop++)
			
--- a/mpi/tests/mpi_test.c
+++ b/mpi/tests/mpi_test.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -55,8 +55,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
			
 
				 
			
 
				-	unsigned nloops = NITER;
			
 
				-	unsigned loop;
			
 
				+	int nloops = NITER;
			
 
				+	int loop;
			
 
				 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
			
 
				 
			
 
				 	for (loop = 0; loop < nloops; loop++)
			
--- a/mpi/tests/pingpong.c
+++ b/mpi/tests/pingpong.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -55,8 +55,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
			
 
				 
			
 
				-	unsigned nloops = NITER;
			
 
				-	unsigned loop;
			
 
				+	int nloops = NITER;
			
 
				+	int loop;
			
 
				 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
			
 
				 
			
 
				 	for (loop = 0; loop < nloops; loop++)
			
--- a/mpi/tests/ring.c
+++ b/mpi/tests/ring.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -24,7 +24,7 @@
 
				 #  define NITER	2048
			
 
				 #endif
			
 
				 
			
 
				-unsigned token = 42;
			
 
				+int token = 42;
			
 
				 starpu_data_handle_t token_handle;
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
@@ -33,7 +33,7 @@ extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args);
 
				 
			
 
				 void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
			
 
				 {
			
 
				-	unsigned *tokenptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				+	int *tokenptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				 	(*tokenptr)++;
			
 
				 }
			
 
				 
			
@@ -82,13 +82,13 @@ int main(int argc, char **argv)
 
				 	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 
			
 
				-	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
			
 
				+	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(token));
			
 
				 
			
 
				-	unsigned nloops = NITER;
			
 
				-	unsigned loop;
			
 
				+	int nloops = NITER;
			
 
				+	int loop;
			
 
				 
			
 
				-	unsigned last_loop = nloops - 1;
			
 
				-	unsigned last_rank = size - 1;
			
 
				+	int last_loop = nloops - 1;
			
 
				+	int last_rank = size - 1;
			
 
				 
			
 
				 	for (loop = 0; loop < nloops; loop++)
			
 
				 	{
			
--- a/mpi/tests/ring_async.c
+++ b/mpi/tests/ring_async.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -24,7 +24,7 @@
 
				 #  define NITER	2048
			
 
				 #endif
			
 
				 
			
 
				-unsigned token = 42;
			
 
				+int token = 42;
			
 
				 starpu_data_handle_t token_handle;
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
@@ -33,7 +33,7 @@ extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args);
 
				 
			
 
				 void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
			
 
				 {
			
 
				-	unsigned *tokenptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				+	int *tokenptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				 	(*tokenptr)++;
			
 
				 }
			
 
				 
			
@@ -82,13 +82,13 @@ int main(int argc, char **argv)
 
				 	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 
			
 
				-	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
			
 
				+	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(token));
			
 
				 
			
 
				-	unsigned nloops = NITER;
			
 
				-	unsigned loop;
			
 
				+	int nloops = NITER;
			
 
				+	int loop;
			
 
				 
			
 
				-	unsigned last_loop = nloops - 1;
			
 
				-	unsigned last_rank = size - 1;
			
 
				+	int last_loop = nloops - 1;
			
 
				+	int last_rank = size - 1;
			
 
				 
			
 
				 	for (loop = 0; loop < nloops; loop++)
			
 
				 	{
			
--- a/mpi/tests/ring_async_implicit.c
+++ b/mpi/tests/ring_async_implicit.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -24,7 +24,7 @@
 
				 #  define NITER	2048
			
 
				 #endif
			
 
				 
			
 
				-unsigned token = 42;
			
 
				+int token = 42;
			
 
				 starpu_data_handle_t token_handle;
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
@@ -33,7 +33,7 @@ extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args);
 
				 
			
 
				 void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
			
 
				 {
			
 
				-	unsigned *tokenptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				+	int *tokenptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				 	(*tokenptr)++;
			
 
				 }
			
 
				 
			
@@ -80,13 +80,13 @@ int main(int argc, char **argv)
 
				 	}
			
 
				 
			
 
				 
			
 
				-	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
			
 
				+	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(token));
			
 
				 
			
 
				-	unsigned nloops = NITER;
			
 
				-	unsigned loop;
			
 
				+	int nloops = NITER;
			
 
				+	int loop;
			
 
				 
			
 
				-	unsigned last_loop = nloops - 1;
			
 
				-	unsigned last_rank = size - 1;
			
 
				+	int last_loop = nloops - 1;
			
 
				+	int last_rank = size - 1;
			
 
				 
			
 
				 	for (loop = 0; loop < nloops; loop++)
			
 
				 	{
			
--- a/sched_ctx_hypervisor/examples/Makefile.am
+++ b/sched_ctx_hypervisor/examples/Makefile.am
@@ -13,9 +13,9 @@
 
				 #
			
 
				 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				 
			
 
				-AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS)
			
 
				+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS)
			
 
				 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(top_builddir)/sched_ctx_hypervisor/src/libsched_ctx_hypervisor.la
			
 
				-AM_CPPFLAGS = -I$(top_srcdir)/include -I$(top_srcdir)/examples -I$(top_builddir)/include -I$(top_srcdir)/sched_ctx_hypervisor/include
			
 
				+AM_CPPFLAGS = -I$(top_srcdir)/include -I$(top_srcdir)/examples -I$(top_builddir)/include -I$(top_srcdir)/sched_ctx_hypervisor/include -I$(top_srcdir)/sched_ctx_hypervisor/examples
			
 
				 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_GLPK_LDFLAGS)
			
 
				 
			
 
				 if !NO_BLAS_LIB
			
--- a/sched_ctx_hypervisor/examples/app_driven_test/app_driven_test.c
+++ b/sched_ctx_hypervisor/examples/app_driven_test/app_driven_test.c
@@ -48,7 +48,7 @@ int tag = 1;
 
				 void* start_thread(void *arg)
			
 
				 {
			
 
				 	unsigned sched_ctx = *((unsigned*)arg);
			
 
				-	starpu_task_set_context(&sched_ctx);
			
 
				+	starpu_sched_ctx_set_context(&sched_ctx);
			
 
				 
			
 
				 	struct starpu_task *task[10];
			
 
				 	struct params params[10];
			
@@ -115,8 +115,8 @@ int main()
 
				 	policy.name = "app_driven";
			
 
				 	void *perf_counters = sched_ctx_hypervisor_init(&policy);
			
 
				 
			
 
				-	starpu_set_perf_counters(sched_ctx1, (struct starpu_performance_counters*)perf_counters);
			
 
				-	starpu_set_perf_counters(sched_ctx2, (struct starpu_performance_counters*)perf_counters);
			
 
				+	starpu_sched_ctx_set_perf_counters(sched_ctx1, (struct starpu_sched_ctx_performance_counters*)perf_counters);
			
 
				+	starpu_sched_ctx_set_perf_counters(sched_ctx2, (struct starpu_sched_ctx_performance_counters*)perf_counters);
			
 
				 	sched_ctx_hypervisor_register_ctx(sched_ctx1, 0.0);
			
 
				 	sched_ctx_hypervisor_register_ctx(sched_ctx2, 0.0);
			
 
				 
			
--- a/sched_ctx_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.c
+++ b/sched_ctx_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.c
@@ -97,7 +97,7 @@ void* start_bench(void *val)
 
				 	pthread_setspecific(key, &p->id);
			
 
				 
			
 
				 	if(p->ctx != 0)
			
 
				-		starpu_task_set_context(&p->ctx);
			
 
				+		starpu_sched_ctx_set_context(&p->ctx);
			
 
				 
			
 
				 	for(i = 0; i < NSAMPLES; i++)
			
 
				 		p->bench(p->mat[i], p->size, p->nblocks);
			
@@ -241,7 +241,7 @@ void construct_contexts(void (*bench)(float*, unsigned, unsigned))
 
				 	struct sched_ctx_hypervisor_policy policy;
			
 
				 	policy.custom = 0;
			
 
				 	policy.name = "idle";
			
 
				-	struct starpu_performance_counters *perf_counters = sched_ctx_hypervisor_init(&policy);
			
 
				+	struct starpu_sched_ctx_performance_counters *perf_counters = sched_ctx_hypervisor_init(&policy);
			
 
				 	int nworkers1 = cpu1 + gpu + gpu1;
			
 
				 	int nworkers2 = cpu2 + gpu + gpu2;
			
 
				 	unsigned n_all_gpus = gpu + gpu1 + gpu2;
			
@@ -267,7 +267,7 @@ void construct_contexts(void (*bench)(float*, unsigned, unsigned))
 
				 		p1.workers[i] = i;
			
 
				 
			
 
				 	p1.ctx = starpu_sched_ctx_create("heft", p1.workers, nworkers1, "sched_ctx1");
			
 
				-	starpu_set_perf_counters(p1.ctx, perf_counters);
			
 
				+	starpu_sched_ctx_set_perf_counters(p1.ctx, perf_counters);
			
 
				 	p2.the_other_ctx = (int)p1.ctx;
			
 
				 	p1.nworkers = nworkers1;
			
 
				 	sched_ctx_hypervisor_register_ctx(p1.ctx, 0.0);
			
@@ -303,7 +303,7 @@ void construct_contexts(void (*bench)(float*, unsigned, unsigned))
 
				 	/* 	p2.workers[k++] = i; */
			
 
				 
			
 
				 	p2.ctx = starpu_sched_ctx_create("heft", p2.workers, 0, "sched_ctx2");
			
 
				-	starpu_set_perf_counters(p2.ctx, perf_counters);
			
 
				+	starpu_sched_ctx_set_perf_counters(p2.ctx, perf_counters);
			
 
				 	p1.the_other_ctx = (int)p2.ctx;
			
 
				 	p2.nworkers = 0;
			
 
				 	sched_ctx_hypervisor_register_ctx(p2.ctx, 0.0);
			
--- a/sched_ctx_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.h
+++ b/sched_ctx_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.h
@@ -30,3 +30,4 @@ void end_contexts(void);
 
				 void start_2benchs(void (*bench)(float *mat, unsigned size, unsigned nblocks));
			
 
				 void start_1stbench(void (*bench)(float *mat, unsigned size, unsigned nblocks));
			
 
				 void start_2ndbench(void (*bench)(float *mat, unsigned size, unsigned nblocks));
			
 
				+void set_hypervisor_conf(int event, int task_tag);
			
--- a/sched_ctx_hypervisor/include/sched_ctx_hypervisor.h
+++ b/sched_ctx_hypervisor/include/sched_ctx_hypervisor.h
@@ -127,6 +127,12 @@ struct sched_ctx_hypervisor_wrapper
 
				 	/* number of flops executed since last resizing */
			
 
				 	double elapsed_flops[STARPU_NMAXWORKERS];
			
 
				 
			
 
				+	/* data quantity executed on each worker in this ctx */
			
 
				+	size_t elapsed_data[STARPU_NMAXWORKERS];
			
 
				+
			
 
				+	/* nr of tasks executed on each worker in this ctx */
			
 
				+	int elapsed_tasks[STARPU_NMAXWORKERS];
			
 
				+
			
 
				 	/* the average speed of workers when they belonged to this context */
			
 
				 	double ref_velocity[STARPU_NMAXWORKERS];
			
 
				 
			
@@ -168,7 +174,7 @@ struct sched_ctx_hypervisor_policy
 
				 	void (*end_ctx)(unsigned sched_ctx);
			
 
				 };
			
 
				 
			
 
				-struct starpu_performance_counters *sched_ctx_hypervisor_init(struct sched_ctx_hypervisor_policy *policy);
			
 
				+struct starpu_sched_ctx_performance_counters *sched_ctx_hypervisor_init(struct sched_ctx_hypervisor_policy *policy);
			
 
				 
			
 
				 void sched_ctx_hypervisor_shutdown(void);
			
 
				 
			
--- a/sched_ctx_hypervisor/src/Makefile.am
+++ b/sched_ctx_hypervisor/src/Makefile.am
@@ -12,11 +12,9 @@
 
				 #
			
 
				 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				 
			
 
				-AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
			
 
				+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS)
			
 
				 LIBS = $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la
			
 
				-
			
 
				-AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include/starpu/$(STARPU_EFFECTIVE_VERSION)/ -I$(top_builddir)/src/ -I$(top_srcdir)/src/ -I$(top_srcdir)/sched_ctx_hypervisor/include/
			
 
				-
			
 
				+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/src/ -I$(top_builddir)/src/ -I$(top_builddir)/include -I$(top_srcdir)/sched_ctx_hypervisor/include/ -I$(top_srcdir)/sched_ctx_hypervisor/src
			
 
				 AM_LDFLAGS = $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS)
			
 
				 
			
 
				 lib_LTLIBRARIES = libsched_ctx_hypervisor.la
			
--- a/sched_ctx_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c
+++ b/sched_ctx_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c
@@ -32,10 +32,12 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 
				 	int *sched_ctxs = in_sched_ctxs == NULL ? sched_ctx_hypervisor_get_sched_ctxs() : in_sched_ctxs;
			
 
				 	
			
 
				 	int w,s;
			
 
				-	struct sched_ctx_hypervisor_wrapper* sc_w = NULL;
			
 
				 
			
 
				+	struct sched_ctx_hypervisor_wrapper* sc_w = NULL;
			
 
				+	double total_flops = 0.0;
			
 
				 	for(s = 0; s < ns; s++)
			
 
				 	{
			
 
				+		sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[s]);
			
 
				 		for(w = 0; w < nw; w++)
			
 
				 		{
			
 
				 			w_in_s[s][w] = 0.0;
			
@@ -44,7 +46,6 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 
				 			draft_flops_on_w[s][w] = 0.0;
			
 
				 			int worker = workers == NULL ? w : workers[w];
			
 
				 
			
 
				-			sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[s]);
			
 
				 			velocity[s][w] = _get_velocity_per_worker(sc_w, worker);
			
 
				 			if(velocity[s][w] == -1.0)
			
 
				 			{
			
@@ -53,21 +54,20 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 
				 				if(velocity[s][w] == -1.0)
			
 
				 					velocity[s][w] = sc_w->ref_velocity[worker];
			
 
				 				if(velocity[s][w] == -1.0)
			
 
				-					velocity[s][w] = arch == STARPU_CPU_WORKER ? 5.0 : 150.0;
			
 
				+					velocity[s][w] = arch == STARPU_CPU_WORKER ? 5.0 : 100.0;
			
 
				 			}
			
 
				 			
			
 
				-			printf("v[w%d][s%d] = %lf\n",w, s, velocity[s][w]);
			
 
				+//			printf("v[w%d][s%d] = %lf\n",w, s, velocity[s][w]);
			
 
				 		}
			
 
				 		struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sched_ctxs[s]);
			
 
				 		flops[s] = config->ispeed_ctx_sample/1000000000; /* in gflops */
			
 
				 	}
			
 
				-
			
 
				-
			
 
				+	
			
 
				 	/* take the exec time of the slowest ctx 
			
 
				 	   as starting point and then try to minimize it
			
 
				 	   as increasing it a little for the faster ctxs */
			
 
				 	double tmax = _get_slowest_ctx_exec_time();
			
 
				-	double smallest_tmax = _get_fastest_ctx_exec_time(); //tmax - 0.5*tmax;
			
 
				+ 	double smallest_tmax = _get_fastest_ctx_exec_time(); //tmax - 0.5*tmax; 
			
 
				 //	printf("tmax %lf smallest %lf\n", tmax, smallest_tmax);
			
 
				 
			
 
				 	double res = 1.0;
			
@@ -413,8 +413,8 @@ static void ispeed_lp_end_ctx(unsigned sched_ctx)
 
				 {
			
 
				 	struct sched_ctx_hypervisor_wrapper* sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctx);
			
 
				 	int worker;
			
 
				-	for(worker = 0; worker < 12; worker++)
			
 
				-		printf("%d/%d: speed %lf\n", worker, sched_ctx, sc_w->ref_velocity[worker]);
			
 
				+/* 	for(worker = 0; worker < 12; worker++) */
			
 
				+/* 		printf("%d/%d: speed %lf\n", worker, sched_ctx, sc_w->ref_velocity[worker]); */
			
 
				 
			
 
				 	return;
			
 
				 }
			
--- a/sched_ctx_hypervisor/src/hypervisor_policies/ispeed_policy.c
+++ b/sched_ctx_hypervisor/src/hypervisor_policies/ispeed_policy.c
@@ -77,7 +77,7 @@ static int* _get_slowest_workers(unsigned sched_ctx, int *nworkers, enum starpu_
 
				 	int worker;
			
 
				 	int considered = 0;
			
 
				 
			
 
				-	struct starpu_iterator it;
			
 
				+	struct starpu_sched_ctx_iterator it;
			
 
				 	if(workers->init_iterator)
			
 
				 		workers->init_iterator(workers, &it);
			
 
				 
			
--- a/sched_ctx_hypervisor/src/hypervisor_policies/lp_tools.c
+++ b/sched_ctx_hypervisor/src/hypervisor_policies/lp_tools.c
@@ -283,6 +283,12 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 
				 		{
			
 
				 			tmp_nw_move[w] = 0;
			
 
				 			tmp_nw_add[w] = 0;
			
 
				+			int i;
			
 
				+			for(i = 0; i < STARPU_NMAXWORKERS; i++)
			
 
				+			{
			
 
				+				tmp_workers_move[w][i] = -1;
			
 
				+				tmp_workers_add[w][i] = -1;
			
 
				+			}
			
 
				 		}
			
 
				 
			
 
				 		/* find workers that ctx s has to give away */
			
@@ -363,6 +369,7 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 
				 				int nw_add = 0;
			
 
				 
			
 
				 				int w;
			
 
				+				int j = 0, k = 0;
			
 
				 				for(w = 0; w < nw; w++)
			
 
				 				{
			
 
				 					enum starpu_archtype arch = STARPU_ANY_WORKER;
			
@@ -375,7 +382,7 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 
				 					if( nw_needed > 0 && tmp_nw_move[w] > 0)
			
 
				 					{
			
 
				 						nw_move += nw_needed >= tmp_nw_move[w] ? tmp_nw_move[w] : nw_needed;
			
 
				-						int i = 0, j = 0;
			
 
				+						int i = 0;
			
 
				 						for(i = 0; i < STARPU_NMAXWORKERS; i++)
			
 
				 						{
			
 
				 							if(tmp_workers_move[w][i] != -1)
			
@@ -395,14 +402,14 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 
				 					if(diff > 0.3 && tmp_nw_add[w] != 0)
			
 
				 					{
			
 
				 						nw_add = tmp_nw_add[w];
			
 
				-						int i = 0, j = 0;
			
 
				+						int i = 0;
			
 
				 						for(i = 0; i < STARPU_NMAXWORKERS; i++)
			
 
				 						{
			
 
				 							if(tmp_workers_add[w][i] != -1)
			
 
				 							{
			
 
				-								workers_add[j++] = tmp_workers_add[w][i];
			
 
				+								workers_add[k++] = tmp_workers_add[w][i];
			
 
				 								tmp_workers_add[w][i] = -1;
			
 
				-								if(j == nw_add)
			
 
				+								if(k == nw_add)
			
 
				 									break;
			
 
				 							}
			
 
				 						}
			
@@ -413,7 +420,7 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 
				 				
			
 
				 				if(nw_move > 0)
			
 
				 				{
			
 
				-					sched_ctx_hypervisor_move_workers(sched_ctxs[s], sched_ctxs[s2], workers_move, nw_move, 1);
			
 
				+					sched_ctx_hypervisor_move_workers(sched_ctxs[s], sched_ctxs[s2], workers_move, nw_move, 0);
			
 
				 					nw_move = 0;
			
 
				 				}
			
 
				 
			
@@ -452,7 +459,7 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 
				 		}
			
 
				 
			
 
				 		if(nw_move > 0)
			
 
				-			sched_ctx_hypervisor_remove_workers_from_sched_ctx(workers_move, nw_move, sched_ctxs[s], 1);
			
 
				+			sched_ctx_hypervisor_remove_workers_from_sched_ctx(workers_move, nw_move, sched_ctxs[s], 0);
			
 
				 	}
			
 
				 }
			
 
				 
			
--- a/sched_ctx_hypervisor/src/hypervisor_policies/policy_tools.c
+++ b/sched_ctx_hypervisor/src/hypervisor_policies/policy_tools.c
@@ -28,7 +28,7 @@ static int _compute_priority(unsigned sched_ctx)
 
				 	struct starpu_sched_ctx_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx);
			
 
				 	int worker;
			
 
				 
			
 
				-	struct starpu_iterator it;
			
 
				+	struct starpu_sched_ctx_iterator it;
			
 
				 	if(workers->init_iterator)
			
 
				 		workers->init_iterator(workers, &it);
			
 
				 
			
@@ -113,7 +113,7 @@ int* _get_first_workers(unsigned sched_ctx, int *nworkers, enum starpu_archtype
 
				 	int worker;
			
 
				 	int considered = 0;
			
 
				 
			
 
				-	struct starpu_iterator it;
			
 
				+	struct starpu_sched_ctx_iterator it;
			
 
				 	if(workers->init_iterator)
			
 
				 		workers->init_iterator(workers, &it);
			
 
				 
			
@@ -169,7 +169,6 @@ int* _get_first_workers(unsigned sched_ctx, int *nworkers, enum starpu_archtype
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-
			
 
				 	return curr_workers;
			
 
				 }
			
 
				 
			
@@ -181,7 +180,7 @@ unsigned _get_potential_nworkers(struct sched_ctx_hypervisor_policy_config *conf
 
				 	unsigned potential_workers = 0;
			
 
				 	int worker;
			
 
				 
			
 
				-	struct starpu_iterator it;
			
 
				+	struct starpu_sched_ctx_iterator it;
			
 
				 	if(workers->init_iterator)
			
 
				 		workers->init_iterator(workers, &it);
			
 
				 	while(workers->has_next(workers, &it))
			
@@ -304,7 +303,7 @@ static double _get_best_elapsed_flops(struct sched_ctx_hypervisor_wrapper* sc_w,
 
				 	struct starpu_sched_ctx_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sc_w->sched_ctx);
			
 
				         int worker;
			
 
				 
			
 
				-	struct starpu_iterator it;
			
 
				+	struct starpu_sched_ctx_iterator it;
			
 
				 	if(workers->init_iterator)
			
 
				                 workers->init_iterator(workers, &it);
			
 
				 
			
@@ -330,7 +329,7 @@ static double _get_ispeed_sample_for_type_of_worker(struct sched_ctx_hypervisor_
 
				 
			
 
				 	double avg = 0.0;
			
 
				 	int n = 0;
			
 
				-	struct starpu_iterator it;
			
 
				+	struct starpu_sched_ctx_iterator it;
			
 
				 	if(workers->init_iterator)
			
 
				                 workers->init_iterator(workers, &it);
			
 
				 
			
@@ -356,7 +355,7 @@ static double _get_ispeed_sample_for_sched_ctx(unsigned sched_ctx)
 
				         
			
 
				 	int worker;
			
 
				 	double ispeed_sample = 0.0;
			
 
				-	struct starpu_iterator it;
			
 
				+	struct starpu_sched_ctx_iterator it;
			
 
				 
			
 
				 	if(workers->init_iterator)
			
 
				                 workers->init_iterator(workers, &it);
			
@@ -401,7 +400,7 @@ double _get_slowest_ctx_exec_time(void)
 
				 	int *sched_ctxs = sched_ctx_hypervisor_get_sched_ctxs();
			
 
				 	int nsched_ctxs = sched_ctx_hypervisor_get_nsched_ctxs();
			
 
				 
			
 
				-	double curr_time = starpu_timing_now();
			
 
				+/* 	double curr_time = starpu_timing_now(); */
			
 
				 	double slowest_time = 0.0;
			
 
				 
			
 
				 	int s;
			
@@ -410,18 +409,13 @@ double _get_slowest_ctx_exec_time(void)
 
				 	{
			
 
				 		sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[s]);
			
 
				 
			
 
				-/*                 double elapsed_time = curr_time - sc_w->start_time; */
			
 
				-/* 		if(elapsed_time > slowest_time) */
			
 
				-/* 			slowest_time = elapsed_time; */
			
 
				-
			
 
				-//		double elapsed_flops = sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(sc_w);
			
 
				+//		double elapsed_time  = (curr_time - sc_w->start_time)/1000000;
			
 
				 		struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sc_w->sched_ctx);
			
 
				-                double elapsed_time = (config->ispeed_ctx_sample/1000000000.0)/_get_ctx_velocity(sc_w);
			
 
				+		double elapsed_time = (config->ispeed_ctx_sample/1000000000.0)/_get_ctx_velocity(sc_w);
			
 
				 		if(elapsed_time > slowest_time)
			
 
				 			slowest_time = elapsed_time;
			
 
				 
			
 
				         }
			
 
				-//	return slowest_time / 1000000.0;
			
 
				 	return slowest_time;
			
 
				 }
			
 
				 
			
@@ -431,7 +425,7 @@ double _get_fastest_ctx_exec_time(void)
 
				 	int nsched_ctxs = sched_ctx_hypervisor_get_nsched_ctxs();
			
 
				 
			
 
				 	double curr_time = starpu_timing_now();
			
 
				-	double fastest_time = curr_time;
			
 
				+ 	double fastest_time = curr_time;
			
 
				 
			
 
				 	int s;
			
 
				 	struct sched_ctx_hypervisor_wrapper* sc_w;		
			
@@ -440,13 +434,13 @@ double _get_fastest_ctx_exec_time(void)
 
				 		sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[s]);
			
 
				 
			
 
				 		struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sc_w->sched_ctx);
			
 
				-                double elapsed_time = (config->ispeed_ctx_sample/1000000000.0)/_get_ctx_velocity(sc_w);
			
 
				-
			
 
				+		double elapsed_time = (config->ispeed_ctx_sample/1000000000.0)/_get_ctx_velocity(sc_w);
			
 
				+		
			
 
				 		if(elapsed_time < fastest_time)
			
 
				 			fastest_time = elapsed_time;
			
 
				 
			
 
				         }
			
 
				-//	return fastest_time / 1000000.0;
			
 
				+
			
 
				 	return fastest_time;
			
 
				 }
			
 
				 
			
@@ -457,6 +451,8 @@ double _get_velocity_per_worker(struct sched_ctx_hypervisor_wrapper *sc_w, unsig
 
				 		return -1.0;
			
 
				 
			
 
				         double elapsed_flops = sc_w->elapsed_flops[worker] / 1000000000.0; /*in gflops */
			
 
				+	size_t elapsed_data_used = sc_w->elapsed_data[worker];
			
 
				+	int elapsed_tasks = sc_w->elapsed_tasks[worker];
			
 
				 	struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sc_w->sched_ctx);
			
 
				 	double sample = config->ispeed_w_sample[worker] / 1000000000.0; /*in gflops */
			
 
				 
			
@@ -479,6 +475,17 @@ double _get_velocity_per_worker(struct sched_ctx_hypervisor_wrapper *sc_w, unsig
 
				         {
			
 
				                 double curr_time = starpu_timing_now();
			
 
				                 double elapsed_time = (curr_time - sc_w->start_time) / 1000000.0; /* in seconds */
			
 
				+ 		enum starpu_archtype arch = starpu_worker_get_type(worker);
			
 
				+		if(arch == STARPU_CUDA_WORKER)
			
 
				+		{
			
 
				+			double transfer_velocity = starpu_get_bandwidth_RAM_CUDA(worker);
			
 
				+			elapsed_time +=  (elapsed_data_used / transfer_velocity) / 1000000 ;
			
 
				+			double latency = starpu_get_latency_RAM_CUDA(worker);
			
 
				+//			printf("%d/%d: latency %lf elapsed_time before %lf ntasks %d\n", worker, sc_w->sched_ctx, latency, elapsed_time, elapsed_tasks);
			
 
				+			elapsed_time += (elapsed_tasks * latency)/1000000;
			
 
				+//			printf("elapsed time after %lf \n", elapsed_time);
			
 
				+		}
			
 
				+			
			
 
				                 double vel  = (elapsed_flops/elapsed_time);/* in Gflops/s */
			
 
				 		sc_w->ref_velocity[worker] = sc_w->ref_velocity[worker] > 0.0 ? (sc_w->ref_velocity[worker] + vel) / 2 : vel; 
			
 
				                 return vel;
			
--- a/sched_ctx_hypervisor/src/sched_ctx_hypervisor.c
+++ b/sched_ctx_hypervisor/src/sched_ctx_hypervisor.c
@@ -19,11 +19,11 @@
 
				 #include <starpu_config.h>
			
 
				 
			
 
				 unsigned imposed_resize = 0;
			
 
				-struct starpu_performance_counters* perf_counters = NULL;
			
 
				+struct starpu_sched_ctx_performance_counters* perf_counters = NULL;
			
 
				 
			
 
				 static void notify_idle_cycle(unsigned sched_ctx, int worker, double idle_time);
			
 
				 static void notify_pushed_task(unsigned sched_ctx, int worker);
			
 
				-static void notify_poped_task(unsigned sched_ctx, int worker, double flops);
			
 
				+static void notify_poped_task(unsigned sched_ctx, int worker, double flops, size_t data_size);
			
 
				 static void notify_post_exec_hook(unsigned sched_ctx, int taskid);
			
 
				 static void notify_idle_end(unsigned sched_ctx, int  worker);
			
 
				 static void notify_submitted_job(struct starpu_task *task, unsigned footprint);
			
@@ -125,7 +125,7 @@ static struct sched_ctx_hypervisor_policy *_select_hypervisor_policy(struct sche
 
				 
			
 
				 
			
 
				 /* initializez the performance counters that starpu will use to retrive hints for resizing */
			
 
				-struct starpu_performance_counters* sched_ctx_hypervisor_init(struct sched_ctx_hypervisor_policy *hypervisor_policy)
			
 
				+struct starpu_sched_ctx_performance_counters* sched_ctx_hypervisor_init(struct sched_ctx_hypervisor_policy *hypervisor_policy)
			
 
				 {
			
 
				 	hypervisor.min_tasks = 0;
			
 
				 	hypervisor.nsched_ctxs = 0;
			
@@ -158,6 +158,8 @@ struct starpu_performance_counters* sched_ctx_hypervisor_init(struct sched_ctx_h
 
				 			hypervisor.sched_ctx_w[i].pushed_tasks[j] = 0;
			
 
				 			hypervisor.sched_ctx_w[i].poped_tasks[j] = 0;
			
 
				 			hypervisor.sched_ctx_w[i].elapsed_flops[j] = 0.0;
			
 
				+			hypervisor.sched_ctx_w[i].elapsed_data[j] = 0;
			
 
				+			hypervisor.sched_ctx_w[i].elapsed_tasks[j] = 0;
			
 
				 			hypervisor.sched_ctx_w[i].total_elapsed_flops[j] = 0.0;
			
 
				 			hypervisor.sched_ctx_w[i].worker_to_be_removed[j] = 0;
			
 
				 			hypervisor.sched_ctx_w[i].ref_velocity[j] = -1.0;
			
@@ -167,7 +169,7 @@ struct starpu_performance_counters* sched_ctx_hypervisor_init(struct sched_ctx_h
 
				 	struct sched_ctx_hypervisor_policy *selected_hypervisor_policy = _select_hypervisor_policy(hypervisor_policy);
			
 
				 	_load_hypervisor_policy(selected_hypervisor_policy);
			
 
				 
			
 
				-	perf_counters = (struct starpu_performance_counters*)malloc(sizeof(struct starpu_performance_counters));
			
 
				+	perf_counters = (struct starpu_sched_ctx_performance_counters*)malloc(sizeof(struct starpu_sched_ctx_performance_counters));
			
 
				 	perf_counters->notify_idle_cycle = notify_idle_cycle;
			
 
				 	perf_counters->notify_pushed_task = notify_pushed_task;
			
 
				 	perf_counters->notify_poped_task = notify_poped_task;
			
@@ -175,7 +177,7 @@ struct starpu_performance_counters* sched_ctx_hypervisor_init(struct sched_ctx_h
 
				 	perf_counters->notify_idle_end = notify_idle_end;
			
 
				 	perf_counters->notify_submitted_job = notify_submitted_job;
			
 
				 
			
 
				-	starpu_notify_hypervisor_exists();
			
 
				+	starpu_sched_ctx_notify_hypervisor_exists();
			
 
				 
			
 
				 	return perf_counters;
			
 
				 }
			
@@ -346,7 +348,7 @@ int sched_ctx_hypervisor_get_nworkers_ctx(unsigned sched_ctx, enum starpu_archty
 
				 	struct starpu_sched_ctx_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx);
			
 
				 	int worker;
			
 
				 
			
 
				-	struct starpu_iterator it;
			
 
				+	struct starpu_sched_ctx_iterator it;
			
 
				 	if(workers->init_iterator)
			
 
				 		workers->init_iterator(workers, &it);
			
 
				 
			
@@ -364,7 +366,14 @@ static void _set_elapsed_flops_per_sched_ctx(unsigned sched_ctx, double val)
 
				 {
			
 
				 	int i;
			
 
				 	for(i = 0; i < STARPU_NMAXWORKERS; i++)
			
 
				+	{
			
 
				 		hypervisor.sched_ctx_w[sched_ctx].elapsed_flops[i] = val;
			
 
				+		if(val == 0)
			
 
				+		{
			
 
				+			hypervisor.sched_ctx_w[sched_ctx].elapsed_data[i] = 0;
			
 
				+			hypervisor.sched_ctx_w[sched_ctx].elapsed_tasks[i] = 0;
			
 
				+		}
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				 double sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(struct sched_ctx_hypervisor_wrapper* sc_w)
			
@@ -396,7 +405,7 @@ void _reset_resize_sample_info(unsigned sender_sched_ctx, unsigned receiver_sche
 
				 	sender_sc_w->start_time = start_time;
			
 
				 	sender_sc_w->remaining_flops = sender_sc_w->remaining_flops - sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(sender_sc_w);
			
 
				 	_set_elapsed_flops_per_sched_ctx(sender_sched_ctx, 0.0);
			
 
				-	
			
 
				+
			
 
				 	receiver_sc_w->start_time = start_time;
			
 
				 	receiver_sc_w->remaining_flops = receiver_sc_w->remaining_flops - sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(receiver_sc_w);
			
 
				 	_set_elapsed_flops_per_sched_ctx(receiver_sched_ctx, 0.0);
			
@@ -410,19 +419,11 @@ void sched_ctx_hypervisor_move_workers(unsigned sender_sched_ctx, unsigned recei
 
				 	{
			
 
				 		_print_current_time();
			
 
				 		int j;
			
 
				-		printf("resize ctx %d with", sender_sched_ctx);
			
 
				+		printf("resize ctx %d with %d workers", sender_sched_ctx, nworkers_to_move);
			
 
				 		for(j = 0; j < nworkers_to_move; j++)
			
 
				 			printf(" %d", workers_to_move[j]);
			
 
				 		printf("\n");
			
 
				 
			
 
				-/* 		int *cpus = (int*) malloc(nworkers_to_move * sizeof(int)); */
			
 
				-/* 		int ncpus; */
			
 
				-
			
 
				-/* 		_get_cpus(workers_to_move, nworkers_to_move, cpus, &ncpus); */
			
 
				-
			
 
				-/* //		if(ncpus != 0) */
			
 
				-/* //			starpu_sched_ctx_remove_workers(cpus, ncpus, sender_sched_ctx); */
			
 
				-
			
 
				 		starpu_sched_ctx_add_workers(workers_to_move, nworkers_to_move, receiver_sched_ctx);
			
 
				 
			
 
				 		if(now)
			
@@ -622,11 +623,11 @@ static unsigned _ack_resize_completed(unsigned sched_ctx, int worker)
 
				 			   whatever the application says */
			
 
				 			if(!((hypervisor.resize[sender_sched_ctx] == 0 || hypervisor.resize[receiver_sched_ctx] == 0) && imposed_resize))
			
 
				 			{
			
 
				-				int j;
			
 
				-				printf("remove after ack from ctx %d:", sender_sched_ctx);
			
 
				-				for(j = 0; j < nmoved_workers; j++)
			
 
				-					printf(" %d", moved_workers[j]);
			
 
				-				printf("\n");
			
 
				+/* 				int j; */
			
 
				+/* 				printf("remove after ack from ctx %d:", sender_sched_ctx); */
			
 
				+/* 				for(j = 0; j < nmoved_workers; j++) */
			
 
				+/* 					printf(" %d", moved_workers[j]); */
			
 
				+/* 				printf("\n"); */
			
 
				 
			
 
				 				starpu_sched_ctx_remove_workers(moved_workers, nmoved_workers, sender_sched_ctx);
			
 
				 
			
@@ -715,10 +716,12 @@ static void notify_pushed_task(unsigned sched_ctx, int worker)
 
				 }
			
 
				 
			
 
				 /* notifies the hypervisor that a task was poped from the queue of the worker */
			
 
				-static void notify_poped_task(unsigned sched_ctx, int worker, double elapsed_flops)
			
 
				+static void notify_poped_task(unsigned sched_ctx, int worker, double elapsed_flops, size_t data_size)
			
 
				 {
			
 
				 	hypervisor.sched_ctx_w[sched_ctx].poped_tasks[worker]++;
			
 
				 	hypervisor.sched_ctx_w[sched_ctx].elapsed_flops[worker] += elapsed_flops;
			
 
				+	hypervisor.sched_ctx_w[sched_ctx].elapsed_data[worker] += data_size ;
			
 
				+	hypervisor.sched_ctx_w[sched_ctx].elapsed_tasks[worker]++ ;
			
 
				 	hypervisor.sched_ctx_w[sched_ctx].total_elapsed_flops[worker] += elapsed_flops;
			
 
				 	hypervisor.sched_ctx_w[sched_ctx].remaining_flops -= elapsed_flops; //sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(&hypervisor.sched_ctx_w[sched_ctx]);
			
 
				 
			
--- a/socl/examples/Makefile.am
+++ b/socl/examples/Makefile.am
@@ -14,7 +14,7 @@
 
				 #
			
 
				 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				 
			
 
				-AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
			
 
				+AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(GLOBAL_AM_CFLAGS)
			
 
				 LIBS = $(top_builddir)/socl/src/libsocl-@STARPU_EFFECTIVE_VERSION@.la $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la
			
 
				 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS)
			
 
				 
			
--- a/socl/src/cl_enqueuendrangekernel.c
+++ b/socl/src/cl_enqueuendrangekernel.c
@@ -164,9 +164,6 @@ soclEnqueueNDRangeKernel(cl_command_queue cq,
 
				       cl_event beforeEvent, afterEvent, totalEvent;
			
 
				 
			
 
				       totalEvent = event_create();
			
 
				-      totalEvent->prof_start = _socl_nanotime();
			
 
				-      totalEvent->prof_submit = totalEvent->prof_start;
			
 
				-      totalEvent->prof_queued = totalEvent->prof_start;
			
 
				       gc_entity_store(&totalEvent->cq, cq);
			
 
				 
			
 
				       command_marker cmd = command_marker_create();
			
@@ -197,7 +194,7 @@ soclEnqueueNDRangeKernel(cl_command_queue cq,
 
				          /* Store perf */
			
 
				          cl_ulong start,end;
			
 
				          soclGetEventProfilingInfo(beforeEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &start, NULL);
			
 
				-         soclGetEventProfilingInfo(afterEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &end, NULL);
			
 
				+         soclGetEventProfilingInfo(afterEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
			
 
				          soclReleaseEvent(afterEvent);
			
 
				 
			
 
				          kernel->split_perfs[iter] = end-start;
			
@@ -205,6 +202,12 @@ soclEnqueueNDRangeKernel(cl_command_queue cq,
 
				          pthread_mutex_unlock(&kernel->split_lock);
			
 
				 
			
 
				          event_complete(totalEvent);
			
 
				+
			
 
				+         totalEvent->prof_start = start;
			
 
				+         totalEvent->prof_submit = start;
			
 
				+         totalEvent->prof_queued = start;
			
 
				+         totalEvent->prof_end = end;
			
 
				+
			
 
				          RETURN_EVENT(totalEvent,event);
			
 
				       } else {
			
 
				          soclReleaseEvent(totalEvent);
			
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -49,7 +49,7 @@ endif STARPU_HAVE_WINDOWS
 
				 
			
 
				 lib_LTLIBRARIES = libstarpu-@STARPU_EFFECTIVE_VERSION@.la
			
 
				 
			
 
				-libstarpu_@STARPU_EFFECTIVE_VERSION@_la_CPPFLAGS = -I$(top_srcdir)/include/ -DBUILDING_STARPU -Werror=implicit
			
 
				+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_CPPFLAGS = -I$(top_srcdir)/include/ -DBUILDING_STARPU
			
 
				 
			
 
				 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_CFLAGS = $(GLOBAL_AM_CFLAGS) $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS)
			
 
				 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_LIBADD = -lm $(HWLOC_LIBS) $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LIBS) $(STARPU_GLPK_LDFLAGS)
			
--- a/src/core/jobs.c
+++ b/src/core/jobs.c
@@ -144,6 +144,15 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 
				 
			
 
				 	_STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
			
 
				 
			
 
				+#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
			
 
				+	int workerid = starpu_worker_get_id();
			
 
				+	int i;
			
 
				+	size_t data_size = 0;
			
 
				+	for(i = 0; i < STARPU_NMAXBUFS; i++)
			
 
				+		if(task->handles[i] != NULL)
			
 
				+			data_size += _starpu_data_get_size(task->handles[i]);
			
 
				+#endif //STARPU_USE_SCHED_CTX_HYPERVISOR
			
 
				+
			
 
				 	/* We release handle reference count */
			
 
				 	if (task->cl)
			
 
				 	{
			
@@ -210,8 +219,7 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 
				 	{
			
 
				 		_starpu_sched_post_exec_hook(task);
			
 
				 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
			
 
				-		int workerid = starpu_worker_get_id();
			
 
				-		starpu_call_poped_task_cb(workerid, task->sched_ctx, task->flops);
			
 
				+		starpu_sched_ctx_call_poped_task_cb(workerid, task->sched_ctx, task->flops, data_size);
			
 
				 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
			
 
				 	}
			
 
				 
			
--- a/src/core/perfmodel/perfmodel_bus.c
+++ b/src/core/perfmodel/perfmodel_bus.c
@@ -1344,6 +1344,16 @@ static void write_bus_bandwidth_file_content(void)
 
				 }
			
 
				 #endif /* STARPU_SIMGRID */
			
 
				 
			
 
				+double starpu_get_bandwidth_RAM_CUDA(unsigned cudadev)
			
 
				+{
			
 
				+	return bandwidth_matrix[0][cudadev+1];
			
 
				+}
			
 
				+
			
 
				+double starpu_get_latency_RAM_CUDA(unsigned cudadev)
			
 
				+{
			
 
				+	return latency_matrix[0][cudadev+1];
			
 
				+}
			
 
				+
			
 
				 void starpu_bus_print_bandwidth(FILE *f)
			
 
				 {
			
 
				 	unsigned src, dst, maxnode;
			
@@ -1397,14 +1407,14 @@ void starpu_bus_print_bandwidth(FILE *f)
 
				 	{
			
 
				 		struct dev_timing *timing;
			
 
				 		struct _starpu_machine_config *config = _starpu_get_machine_config();
			
 
				-		int ncpus = _starpu_topology_get_nhwcpu(config);
			
 
				-		int cpu;
			
 
				+		unsigned config_ncpus = _starpu_topology_get_nhwcpu(config);
			
 
				+		unsigned cpu;
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 		if (src <= ncuda)
			
 
				 		{
			
 
				 			fprintf(f, "CUDA %d\t", src-1);
			
 
				-			for (cpu = 0; cpu < ncpus; cpu++)
			
 
				+			for (cpu = 0; cpu < config_ncpus; cpu++)
			
 
				 			{
			
 
				 				timing = &cudadev_timing_per_cpu[src*STARPU_MAXCPUS+cpu];
			
 
				 				if (timing->timing_htod)
			
@@ -1420,7 +1430,7 @@ void starpu_bus_print_bandwidth(FILE *f)
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 		{
			
 
				 			fprintf(f, "OpenCL%d\t", src-ncuda-1);
			
 
				-			for (cpu = 0; cpu < ncpus; cpu++)
			
 
				+			for (cpu = 0; cpu < config_ncpus; cpu++)
			
 
				 			{
			
 
				 				timing = &opencldev_timing_per_cpu[(src-ncuda)*STARPU_MAXCPUS+cpu];
			
 
				 				if (timing->timing_htod)
			
--- a/src/core/perfmodel/perfmodel_history.c
+++ b/src/core/perfmodel/perfmodel_history.c
@@ -180,7 +180,7 @@ static void scan_reg_model(FILE *f, struct starpu_perfmodel_regression_model *re
 
				 
			
 
				 static void dump_history_entry(FILE *f, struct starpu_perfmodel_history_entry *entry)
			
 
				 {
			
 
				-	fprintf(f, "%08x\t%-15lu\t%-15le\t%-15le\t%-15le\t%-15le\t%u\n", entry->footprint, (unsigned long) entry->size, entry->mean, entry->deviation, entry->sum, entry->sum2, entry->nsample);
			
 
				+	fprintf(f, "%08x\t%-15lu\t%-15le\t%-15le\t%-15le\t%-15le\t%-15le\t%u\n", entry->footprint, (unsigned long) entry->size, entry->flops, entry->mean, entry->deviation, entry->sum, entry->sum2, entry->nsample);
			
 
				 }
			
 
				 
			
 
				 static void scan_history_entry(FILE *f, struct starpu_perfmodel_history_entry *entry)
			
@@ -192,28 +192,36 @@ static void scan_history_entry(FILE *f, struct starpu_perfmodel_history_entry *e
 
				 	/* In case entry is NULL, we just drop these values */
			
 
				 	unsigned nsample;
			
 
				 	uint32_t footprint;
			
 
				-#ifdef STARPU_HAVE_WINDOWS
			
 
				-	unsigned size; /* in bytes */
			
 
				-#else
			
 
				-	size_t size; /* in bytes */
			
 
				-#endif
			
 
				+	unsigned long size; /* in bytes */
			
 
				+	double flops;
			
 
				 	double mean;
			
 
				 	double deviation;
			
 
				 	double sum;
			
 
				 	double sum2;
			
 
				 
			
 
				+	char line[256];
			
 
				+	char *ret;
			
 
				+
			
 
				+	ret = fgets(line, sizeof(line), f);
			
 
				+	STARPU_ASSERT(ret);
			
 
				+	STARPU_ASSERT(strchr(line, '\n'));
			
 
				+
			
 
				 	/* Read the values from the file */
			
 
				-	res = fscanf(f, "%x\t%"
			
 
				-#ifndef STARPU_HAVE_WINDOWS
			
 
				-	"z"
			
 
				-#endif
			
 
				-	"u\t%le\t%le\t%le\t%le\t%u\n", &footprint, &size, &mean, &deviation, &sum, &sum2, &nsample);
			
 
				-	STARPU_ASSERT_MSG(res == 7, "Incorrect performance model file");
			
 
				+	res = sscanf(line, "%x\t%lu\t%le\t%le\t%le\t%le\t%le\t%u", &footprint, &size, &flops, &mean, &deviation, &sum, &sum2, &nsample);
			
 
				+
			
 
				+	if (res != 8)
			
 
				+	{
			
 
				+		flops = 0.;
			
 
				+		/* Read the values from the file */
			
 
				+		res = sscanf(line, "%x\t%lu\t%le\t%le\t%le\t%le\t%u", &footprint, &size, &mean, &deviation, &sum, &sum2, &nsample);
			
 
				+		STARPU_ASSERT_MSG(res == 7, "Incorrect performance model file");
			
 
				+	}
			
 
				 
			
 
				 	if (entry)
			
 
				 	{
			
 
				 		entry->footprint = footprint;
			
 
				 		entry->size = size;
			
 
				+		entry->flops = flops;
			
 
				 		entry->mean = mean;
			
 
				 		entry->deviation = deviation;
			
 
				 		entry->sum = sum;
			
@@ -393,7 +401,7 @@ static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel *model, un
 
				 	/* Dump the history into the model file in case it is necessary */
			
 
				 	if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
			
 
				 	{
			
 
				-		fprintf(f, "# hash\t\tsize\t\tmean\t\tdev\t\tsum\t\tsum2\t\tn\n");
			
 
				+		fprintf(f, "# hash\t\tsize\t\tflops\t\tmean\t\tdev\t\tsum\t\tsum2\t\tn\n");
			
 
				 		ptr = per_arch_model->list;
			
 
				 		while (ptr)
			
 
				 		{
			
@@ -956,7 +964,7 @@ int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *mo
 
				 			char *symbol2 = strdup(symbol);
			
 
				 			symbol2[dot-symbol] = '\0';
			
 
				 			int ret;
			
 
				-			fprintf(stderr,"note: loading history from %s instead of %s\n", symbol2, symbol);
			
 
				+			_STARPU_DISP("note: loading history from %s instead of %s\n", symbol2, symbol);
			
 
				 			ret = starpu_perfmodel_load_symbol(symbol2,model);
			
 
				 			free(symbol2);
			
 
				 			return ret;
			
@@ -1152,6 +1160,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 
				 				entry->sum2 = measured*measured;
			
 
				 
			
 
				 				entry->size = _starpu_job_get_data_size(model, arch, nimpl, j);
			
 
				+				entry->flops = j->task->flops;
			
 
				 
			
 
				 				entry->footprint = key;
			
 
				 				entry->nsample = 1;
			
@@ -1168,6 +1177,14 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 
				 				unsigned n = entry->nsample;
			
 
				 				entry->mean = entry->sum / n;
			
 
				 				entry->deviation = sqrt((entry->sum2 - (entry->sum*entry->sum)/n)/n);
			
 
				+				if (j->task->flops != 0.)
			
 
				+				{
			
 
				+					if (entry->flops == 0.)
			
 
				+						entry->flops = j->task->flops;
			
 
				+					else if (entry->flops != j->task->flops)
			
 
				+						/* Incoherent flops! forget about trying to record flops */
			
 
				+						entry->flops = NAN;
			
 
				+				}
			
 
				 			}
			
 
				 
			
 
				 			STARPU_ASSERT(entry);
			
--- a/src/core/perfmodel/perfmodel_print.c
+++ b/src/core/perfmodel/perfmodel_print.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2011, 2013  Université de Bordeaux 1
			
 
				  * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
@@ -38,8 +38,8 @@ void _starpu_perfmodel_print_history_based(struct starpu_perfmodel_per_arch *per
 
				 			if (!parameter)
			
 
				 			{
			
 
				 				/* There isn't a parameter that is explicitely requested, so we display all parameters */
			
 
				-				printf("%08x\t%-15lu\t%-15le\t%-15le\t%u\n", entry->footprint,
			
 
				-					(unsigned long) entry->size, entry->mean, entry->deviation, entry->nsample);
			
 
				+				printf("%08x\t%-15lu\t%-15le\t%-15le\t%-15le\t%u\n", entry->footprint,
			
 
				+					(unsigned long) entry->size, entry->flops, entry->mean, entry->deviation, entry->nsample);
			
 
				 			}
			
 
				 			else
			
 
				 			{
			
@@ -230,7 +230,7 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 
				 		nmatched = sscanf(arch, "cuda_%d", &gpuid);
			
 
				 		if (nmatched == 1)
			
 
				 		{
			
 
				-			unsigned archid = STARPU_CUDA_DEFAULT+ gpuid;
			
 
				+			int archid = STARPU_CUDA_DEFAULT+ gpuid;
			
 
				 			unsigned implid;
			
 
				 			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
			
 
				 				starpu_perfmodel_print(model, (enum starpu_perf_archtype) archid, implid, parameter, footprint, output);
			
--- a/src/core/sched_ctx.c
+++ b/src/core/sched_ctx.c
@@ -40,9 +40,9 @@ static void _starpu_worker_gets_into_ctx(unsigned sched_ctx_id, struct _starpu_w
 
				 	unsigned worker_sched_ctx_id = _starpu_worker_get_sched_ctx_id(worker, sched_ctx_id);
			
 
				 	/* the worker was planning to go away in another ctx but finally he changed his mind & 
			
 
				 	   he's staying */
			
 
				-	if(worker_sched_ctx_id  == STARPU_NMAX_SCHED_CTXS)
			
 
				+	if (worker_sched_ctx_id  == STARPU_NMAX_SCHED_CTXS)
			
 
				 	{
			
 
				-		unsigned worker_sched_ctx_id = _starpu_worker_get_first_free_sched_ctx(worker);
			
 
				+		worker_sched_ctx_id = _starpu_worker_get_first_free_sched_ctx(worker);
			
 
				 		struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
			
 
				 		/* add context to worker */
			
 
				 		worker->sched_ctx[worker_sched_ctx_id] = sched_ctx;
			
@@ -124,7 +124,7 @@ static void _starpu_update_workers_without_ctx(int *workerids, int nworkers, int
 
				 	return;
			
 
				 }
			
 
				 
			
 
				-void starpu_stop_task_submission()
			
 
				+void starpu_sched_ctx_stop_task_submission()
			
 
				 {
			
 
				 	_starpu_exclude_task_from_dag(&stop_submission_task);
			
 
				 	_starpu_task_submit_internally(&stop_submission_task);
			
@@ -442,7 +442,7 @@ unsigned starpu_sched_ctx_create(const char *policy_name, int *workerids,
 
				 }
			
 
				 
			
 
				 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
			
 
				-void starpu_set_perf_counters(unsigned sched_ctx_id, struct starpu_performance_counters *perf_counters)
			
 
				+void starpu_sched_ctx_set_perf_counters(unsigned sched_ctx_id, struct starpu_sched_ctx_performance_counters *perf_counters)
			
 
				 {
			
 
				 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
			
 
				 	sched_ctx->perf_counters = perf_counters;
			
@@ -716,6 +716,8 @@ void _starpu_decrement_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
 
				 {
			
 
				 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
			
 
				 	int finished = _starpu_barrier_counter_decrement_until_empty_counter(&sched_ctx->tasks_barrier);
			
 
				+/*when finished decrementing the tasks if the user signaled he will not submit tasks anymore
			
 
				+  we can move all its workers to the inheritor context */
			
 
				 	if(finished && sched_ctx->inheritor != STARPU_NMAX_SCHED_CTXS)
			
 
				 	{
			
 
				 		_STARPU_PTHREAD_MUTEX_LOCK(&finished_submit_mutex);
			
@@ -723,17 +725,20 @@ void _starpu_decrement_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
 
				 		{
			
 
				 			_STARPU_PTHREAD_MUTEX_UNLOCK(&finished_submit_mutex);
			
 
				 
			
 
				-			_STARPU_PTHREAD_MUTEX_LOCK(&changing_ctx_mutex[sched_ctx->id]);
			
 
				-			int *workerids = NULL;
			
 
				-			unsigned nworkers = _get_workers_list(sched_ctx, &workerids);
			
 
				-
			
 
				-			if(nworkers > 0)
			
 
				+			/* take care the context is not deleted or changed at the same time */
			
 
				+			_STARPU_PTHREAD_MUTEX_LOCK(&changing_ctx_mutex[sched_ctx_id]);
			
 
				+			if(sched_ctx->id != STARPU_NMAX_SCHED_CTXS)
			
 
				 			{
			
 
				-				starpu_sched_ctx_add_workers(workerids, nworkers, sched_ctx->inheritor);
			
 
				-				free(workerids);
			
 
				+				int *workerids = NULL;
			
 
				+				unsigned nworkers = _get_workers_list(sched_ctx, &workerids);
			
 
				+				
			
 
				+				if(nworkers > 0)
			
 
				+				{
			
 
				+					starpu_sched_ctx_add_workers(workerids, nworkers, sched_ctx->inheritor);
			
 
				+					free(workerids);
			
 
				+				}
			
 
				 			}
			
 
				-
			
 
				-			_STARPU_PTHREAD_MUTEX_UNLOCK(&changing_ctx_mutex[sched_ctx->id]);
			
 
				+			_STARPU_PTHREAD_MUTEX_UNLOCK(&changing_ctx_mutex[sched_ctx_id]);
			
 
				 
			
 
				 			return;
			
 
				 		}
			
@@ -748,12 +753,12 @@ void _starpu_increment_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
 
				 	_starpu_barrier_counter_increment(&sched_ctx->tasks_barrier);
			
 
				 }
			
 
				 
			
 
				-void starpu_task_set_context(unsigned *sched_ctx)
			
 
				+void starpu_sched_ctx_set_context(unsigned *sched_ctx)
			
 
				 {
			
 
				 	pthread_setspecific(sched_ctx_key, (void*)sched_ctx);
			
 
				 }
			
 
				 
			
 
				-unsigned starpu_task_get_context()
			
 
				+unsigned starpu_sched_ctx_get_context()
			
 
				 {
			
 
				 	unsigned *sched_ctx = (unsigned*)pthread_getspecific(sched_ctx_key);
			
 
				 	if(sched_ctx == NULL)
			
@@ -762,12 +767,12 @@ unsigned starpu_task_get_context()
 
				 	return *sched_ctx;
			
 
				 }
			
 
				 
			
 
				-void starpu_notify_hypervisor_exists()
			
 
				+void starpu_sched_ctx_notify_hypervisor_exists()
			
 
				 {
			
 
				 	with_hypervisor = 1;
			
 
				 }
			
 
				 
			
 
				-unsigned starpu_check_if_hypervisor_exists()
			
 
				+unsigned starpu_sched_ctx_check_if_hypervisor_exists()
			
 
				 {
			
 
				 	return with_hypervisor;
			
 
				 }
			
@@ -797,7 +802,7 @@ struct starpu_sched_ctx_worker_collection* starpu_sched_ctx_create_worker_collec
 
				 
			
 
				 	switch(worker_collection_type)
			
 
				 	{
			
 
				-	case STARPU_WORKER_LIST:
			
 
				+	case STARPU_SCHED_CTX_WORKER_LIST:
			
 
				 		sched_ctx->workers->has_next = worker_list.has_next;
			
 
				 		sched_ctx->workers->get_next = worker_list.get_next;
			
 
				 		sched_ctx->workers->add = worker_list.add;
			
@@ -805,7 +810,7 @@ struct starpu_sched_ctx_worker_collection* starpu_sched_ctx_create_worker_collec
 
				 		sched_ctx->workers->init = worker_list.init;
			
 
				 		sched_ctx->workers->deinit = worker_list.deinit;
			
 
				 		sched_ctx->workers->init_iterator = worker_list.init_iterator;
			
 
				-		sched_ctx->workers->type = STARPU_WORKER_LIST;
			
 
				+		sched_ctx->workers->type = STARPU_SCHED_CTX_WORKER_LIST;
			
 
				 		break;
			
 
				 	}
			
 
				 
			
@@ -818,7 +823,7 @@ static unsigned _get_workers_list(struct _starpu_sched_ctx *sched_ctx, int **wor
 
				 	*workerids = (int*)malloc(workers->nworkers*sizeof(int));
			
 
				 	int worker;
			
 
				 	unsigned nworkers = 0;
			
 
				-	struct starpu_iterator it;
			
 
				+	struct starpu_sched_ctx_iterator it;
			
 
				 	if(workers->init_iterator)
			
 
				 		workers->init_iterator(workers, &it);
			
 
				 
			
@@ -851,7 +856,7 @@ int starpu_get_workers_of_sched_ctx(unsigned sched_ctx_id, int *pus, enum starpu
 
				 	int worker;
			
 
				 
			
 
				 	int npus = 0;
			
 
				-	struct starpu_iterator it;
			
 
				+	struct starpu_sched_ctx_iterator it;
			
 
				 	if(workers->init_iterator)
			
 
				 		workers->init_iterator(workers, &it);
			
 
				 
			
@@ -866,7 +871,7 @@ int starpu_get_workers_of_sched_ctx(unsigned sched_ctx_id, int *pus, enum starpu
 
				 	return npus;
			
 
				 }
			
 
				 
			
 
				-_starpu_pthread_mutex_t* starpu_get_changing_ctx_mutex(unsigned sched_ctx_id)
			
 
				+_starpu_pthread_mutex_t* starpu_sched_ctx_get_changing_ctx_mutex(unsigned sched_ctx_id)
			
 
				 {
			
 
				 	return &changing_ctx_mutex[sched_ctx_id];
			
 
				 }
			
@@ -891,7 +896,7 @@ unsigned starpu_sched_ctx_get_nshared_workers(unsigned sched_ctx_id, unsigned sc
 
				         int worker, worker2;
			
 
				         int shared_workers = 0;
			
 
				 
			
 
				-	struct starpu_iterator it1, it2;
			
 
				+	struct starpu_sched_ctx_iterator it1, it2;
			
 
				         if(workers->init_iterator)
			
 
				                 workers->init_iterator(workers, &it1);
			
 
				 
			
@@ -926,7 +931,7 @@ unsigned starpu_sched_ctx_contains_worker(int workerid, unsigned sched_ctx_id)
 
				         struct starpu_sched_ctx_worker_collection *workers = sched_ctx->workers;
			
 
				         int worker;
			
 
				 
			
 
				-	struct starpu_iterator it;
			
 
				+	struct starpu_sched_ctx_iterator it;
			
 
				         if(workers->init_iterator)
			
 
				                 workers->init_iterator(workers, &it);
			
 
				 
			
@@ -963,7 +968,7 @@ unsigned starpu_sched_ctx_overlapping_ctxs_on_worker(int workerid)
 
				 	return worker->nsched_ctxs > 1;
			
 
				 }
			
 
				 
			
 
				-unsigned starpu_is_ctxs_turn(int workerid, unsigned sched_ctx_id)
			
 
				+unsigned starpu_sched_ctx_is_ctxs_turn(int workerid, unsigned sched_ctx_id)
			
 
				 {
			
 
				 	if(max_time_worker_on_ctx == -1.0) return 1;
			
 
				 
			
@@ -971,7 +976,7 @@ unsigned starpu_is_ctxs_turn(int workerid, unsigned sched_ctx_id)
 
				 	return worker->active_ctx == sched_ctx_id;
			
 
				 }
			
 
				 
			
 
				-void starpu_set_turn_to_other_ctx(int workerid, unsigned sched_ctx_id)
			
 
				+void starpu_sched_ctx_set_turn_to_other_ctx(int workerid, unsigned sched_ctx_id)
			
 
				 {
			
 
				 	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
			
 
				 
			
@@ -996,7 +1001,7 @@ void starpu_set_turn_to_other_ctx(int workerid, unsigned sched_ctx_id)
 
				 	}
			
 
				 }
			
 
				 
			
 
				-double starpu_get_max_time_worker_on_ctx(void)
			
 
				+double starpu_sched_ctx_get_max_time_worker_on_ctx(void)
			
 
				 {
			
 
				 	return max_time_worker_on_ctx;
			
 
				 }
			
@@ -1020,15 +1025,15 @@ void starpu_sched_ctx_finished_submit(unsigned sched_ctx_id)
 
				 
			
 
				 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
			
 
				 
			
 
				-void starpu_call_poped_task_cb(int workerid, unsigned sched_ctx_id, double flops)
			
 
				+void starpu_sched_ctx_call_poped_task_cb(int workerid, unsigned sched_ctx_id, double flops, size_t data_size)
			
 
				 {
			
 
				 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
			
 
				 	if(sched_ctx != NULL && sched_ctx_id != 0 && sched_ctx_id != STARPU_NMAX_SCHED_CTXS
			
 
				 	   && sched_ctx->perf_counters != NULL)
			
 
				-		sched_ctx->perf_counters->notify_poped_task(sched_ctx_id, workerid, flops);
			
 
				+		sched_ctx->perf_counters->notify_poped_task(sched_ctx_id, workerid, flops, data_size);
			
 
				 }
			
 
				 
			
 
				-void starpu_call_pushed_task_cb(int workerid, unsigned sched_ctx_id)
			
 
				+void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id)
			
 
				 {
			
 
				 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
			
 
				 
			
--- a/src/core/sched_ctx.h
+++ b/src/core/sched_ctx.h
@@ -91,7 +91,7 @@ struct _starpu_sched_ctx
 
				 
			
 
				 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
			
 
				 	/* a structure containing a series of performance counters determining the resize procedure */
			
 
				-	struct starpu_performance_counters *perf_counters;
			
 
				+	struct starpu_sched_ctx_performance_counters *perf_counters;
			
 
				 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
			
 
				 };
			
 
				 
			
@@ -139,7 +139,7 @@ void _starpu_worker_gets_out_of_ctx(unsigned sched_ctx_id, struct _starpu_worker
 
				 unsigned _starpu_worker_belongs_to_a_sched_ctx(int workerid, unsigned sched_ctx_id);
			
 
				 
			
 
				 #if defined(_MSC_VER) || defined(STARPU_SIMGRID)
			
 
				-_starpu_pthread_mutex_t* starpu_get_changing_ctx_mutex(unsigned sched_ctx_id);
			
 
				+_starpu_pthread_mutex_t* starpu_sched_ctx_get_changing_ctx_mutex(unsigned sched_ctx_id);
			
 
				 #endif
			
 
				 
			
 
				 #endif // __SCHED_CONTEXT_H__
			
--- a/src/core/sched_policy.c
+++ b/src/core/sched_policy.c
@@ -225,7 +225,7 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 
				 	}
			
 
				 
			
 
				 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
			
 
				-	starpu_call_pushed_task_cb(workerid, task->sched_ctx);
			
 
				+	starpu_sched_ctx_call_pushed_task_cb(workerid, task->sched_ctx);
			
 
				 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
			
 
				 
			
 
				 	if (is_basic_worker)
			
@@ -233,7 +233,6 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 
				 		unsigned node = starpu_worker_get_memory_node(workerid);
			
 
				 		if (_starpu_task_uses_multiformat_handles(task))
			
 
				 		{
			
 
				-			unsigned i;
			
 
				 			for (i = 0; i < task->cl->nbuffers; i++)
			
 
				 			{
			
 
				 				struct starpu_task *conversion_task;
			
@@ -269,24 +268,24 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 
				 
			
 
				 		int ret = 0;
			
 
				 
			
 
				-		struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
			
 
				-		j->task_size = worker_size;
			
 
				-		j->combined_workerid = workerid;
			
 
				-		j->active_task_alias_count = 0;
			
 
				+		struct _starpu_job *job = _starpu_get_job_associated_to_task(task);
			
 
				+		job->task_size = worker_size;
			
 
				+		job->combined_workerid = workerid;
			
 
				+		job->active_task_alias_count = 0;
			
 
				 
			
 
				-		_STARPU_PTHREAD_BARRIER_INIT(&j->before_work_barrier, NULL, worker_size);
			
 
				-		_STARPU_PTHREAD_BARRIER_INIT(&j->after_work_barrier, NULL, worker_size);
			
 
				+		_STARPU_PTHREAD_BARRIER_INIT(&job->before_work_barrier, NULL, worker_size);
			
 
				+		_STARPU_PTHREAD_BARRIER_INIT(&job->after_work_barrier, NULL, worker_size);
			
 
				 
			
 
				 		/* Note: we have to call that early, or else the task may have
			
 
				 		 * disappeared already */
			
 
				 		_starpu_push_task_end(task);
			
 
				 
			
 
				-		int i;
			
 
				-		for (i = 0; i < worker_size; i++)
			
 
				+		int j;
			
 
				+		for (j = 0; j < worker_size; j++)
			
 
				 		{
			
 
				 			struct starpu_task *alias = _starpu_create_task_alias(task);
			
 
				 
			
 
				-			worker = _starpu_get_worker_struct(combined_workerid[i]);
			
 
				+			worker = _starpu_get_worker_struct(combined_workerid[j]);
			
 
				 			ret |= _starpu_push_local_task(worker, alias, 0);
			
 
				 		}
			
 
				 
			
@@ -299,14 +298,14 @@ static int _starpu_nworkers_able_to_execute_task(struct starpu_task *task, struc
 
				 	int worker = -1, nworkers = 0;
			
 
				 	struct starpu_sched_ctx_worker_collection *workers = sched_ctx->workers;
			
 
				 
			
 
				-	struct starpu_iterator it;
			
 
				+	struct starpu_sched_ctx_iterator it;
			
 
				 	if(workers->init_iterator)
			
 
				 		workers->init_iterator(workers, &it);
			
 
				 
			
 
				 	while(workers->has_next(workers, &it))
			
 
				 	{
			
 
				 		worker = workers->get_next(workers, &it);
			
 
				-		if (starpu_worker_can_execute_task(worker, task, 0) && starpu_is_ctxs_turn(worker, sched_ctx->id))
			
 
				+		if (starpu_worker_can_execute_task(worker, task, 0) && starpu_sched_ctx_is_ctxs_turn(worker, sched_ctx->id))
			
 
				 			nworkers++;
			
 
				 	}
			
 
				 
			
@@ -563,7 +562,7 @@ pick:
 
				 	{
			
 
				 		struct _starpu_sched_ctx *sched_ctx;
			
 
				 
			
 
				-		unsigned lucky_ctx = STARPU_NMAX_SCHED_CTXS;
			
 
				+		//unsigned lucky_ctx = STARPU_NMAX_SCHED_CTXS;
			
 
				 
			
 
				 		int been_here[STARPU_NMAX_SCHED_CTXS];
			
 
				 		int i;
			
@@ -582,7 +581,7 @@ pick:
 
				 				if (sched_ctx->sched_policy && sched_ctx->sched_policy->pop_task)
			
 
				 				{
			
 
				 					task = sched_ctx->sched_policy->pop_task(sched_ctx->id);
			
 
				-					lucky_ctx = sched_ctx->id;
			
 
				+					//lucky_ctx = sched_ctx->id;
			
 
				 				}
			
 
				 			}
			
 
				 
			
@@ -605,7 +604,7 @@ pick:
 
				 
			
 
				 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
			
 
				 	struct _starpu_sched_ctx *sched_ctx = NULL;
			
 
				-	struct starpu_performance_counters *perf_counters = NULL;
			
 
				+	struct starpu_sched_ctx_performance_counters *perf_counters = NULL;
			
 
				 	int j;
			
 
				 	for(j = 0; j < STARPU_NMAX_SCHED_CTXS; j++)
			
 
				 	{
			
--- a/src/core/task.c
+++ b/src/core/task.c
@@ -365,7 +365,7 @@ int starpu_task_submit(struct starpu_task *task)
 
				 
			
 
				 	if (task->sched_ctx == 0 && nsched_ctxs != 1 && !j->exclude_from_dag)
			
 
				 	{
			
 
				-		set_sched_ctx = starpu_task_get_context();
			
 
				+		set_sched_ctx = starpu_sched_ctx_get_context();
			
 
				 		if (set_sched_ctx != STARPU_NMAX_SCHED_CTXS)
			
 
				 			task->sched_ctx = set_sched_ctx;
			
 
				 	}
			
@@ -633,7 +633,7 @@ void starpu_display_codelet_stats(struct starpu_codelet *cl)
 
				 int starpu_task_wait_for_all(void)
			
 
				 {
			
 
				 	unsigned nsched_ctxs = _starpu_get_nsched_ctxs();
			
 
				-	unsigned sched_ctx_id = nsched_ctxs == 1 ? 0 : starpu_task_get_context();
			
 
				+	unsigned sched_ctx_id = nsched_ctxs == 1 ? 0 : starpu_sched_ctx_get_context();
			
 
				 
			
 
				 	/* if there is no indication about which context to wait,
			
 
				 	   we wait for all tasks submitted to starpu */
			
@@ -745,6 +745,11 @@ static void _starpu_increment_nsubmitted_tasks(void)
 
				 	_STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
			
 
				 }
			
 
				 
			
 
				+int starpu_task_nsubmitted(void)
			
 
				+{
			
 
				+	return nsubmitted;
			
 
				+}
			
 
				+
			
 
				 void _starpu_increment_nready_tasks(void)
			
 
				 {
			
 
				 	_STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
			
@@ -765,6 +770,11 @@ void _starpu_decrement_nready_tasks(void)
 
				 
			
 
				 }
			
 
				 
			
 
				+int starpu_task_nready(void)
			
 
				+{
			
 
				+	return nready;
			
 
				+}
			
 
				+
			
 
				 void _starpu_initialize_current_task_key(void)
			
 
				 {
			
 
				 	_STARPU_PTHREAD_KEY_CREATE(&current_task_key, NULL);
			
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -84,15 +84,15 @@ static uint32_t _starpu_worker_exists_and_can_execute(struct starpu_task *task,
 
				 			switch (arch)
			
 
				 			{
			
 
				 			case STARPU_CPU_WORKER:
			
 
				-				if (task->cl->cpu_funcs[i] != NULL)
			
 
				+				if (task->cl->cpu_funcs[impl] != NULL)
			
 
				 					test_implementation = 1;
			
 
				 				break;
			
 
				 			case STARPU_CUDA_WORKER:
			
 
				-				if (task->cl->cuda_funcs[i] != NULL)
			
 
				+				if (task->cl->cuda_funcs[impl] != NULL)
			
 
				 					test_implementation = 1;
			
 
				 				break;
			
 
				 			case STARPU_OPENCL_WORKER:
			
 
				-				if (task->cl->opencl_funcs[i] != NULL)
			
 
				+				if (task->cl->opencl_funcs[impl] != NULL)
			
 
				 					test_implementation = 1;
			
 
				 				break;
			
 
				 			default:
			
@@ -340,14 +340,14 @@ void _starpu_worker_init(struct _starpu_worker *worker, unsigned fut_key)
 
				 
			
 
				 }
			
 
				 
			
 
				-static void _starpu_launch_drivers(struct _starpu_machine_config *config)
			
 
				+static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
			
 
				 {
			
 
				-	config->running = 1;
			
 
				-	config->submitting = 1;
			
 
				+	pconfig->running = 1;
			
 
				+	pconfig->submitting = 1;
			
 
				 
			
 
				 	_STARPU_PTHREAD_KEY_CREATE(&worker_key, NULL);
			
 
				 
			
 
				-	unsigned nworkers = config->topology.nworkers;
			
 
				+	unsigned nworkers = pconfig->topology.nworkers;
			
 
				 
			
 
				 	/* Launch workers asynchronously */
			
 
				 	unsigned cpu = 0, cuda = 0;
			
@@ -368,9 +368,9 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
				 
			
 
				 	for (worker = 0; worker < nworkers; worker++)
			
 
				 	{
			
 
				-		struct _starpu_worker *workerarg = &config->workers[worker];
			
 
				+		struct _starpu_worker *workerarg = &pconfig->workers[worker];
			
 
				 
			
 
				-		workerarg->config = config;
			
 
				+		workerarg->config = pconfig;
			
 
				 
			
 
				 		_starpu_barrier_counter_init(&workerarg->tasks_barrier, 0);
			
 
				 
			
@@ -388,7 +388,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
				 		workerarg->run_by_starpu = 1;
			
 
				 		workerarg->worker_is_running = 0;
			
 
				 		workerarg->worker_is_initialized = 0;
			
 
				-		
			
 
				+
			
 
				 		int ctx;
			
 
				 		for(ctx = 0; ctx < STARPU_NMAX_SCHED_CTXS; ctx++)
			
 
				 			workerarg->removed_from_ctx[ctx] = 0;
			
@@ -419,7 +419,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
				 			case STARPU_CPU_WORKER:
			
 
				 				workerarg->set = NULL;
			
 
				 				driver.id.cpu_id = cpu;
			
 
				-				if (_starpu_may_launch_driver(config->conf, &driver))
			
 
				+				if (_starpu_may_launch_driver(pconfig->conf, &driver))
			
 
				 				{
			
 
				 					_STARPU_PTHREAD_CREATE_ON(
			
 
				 						workerarg->name,
			
@@ -446,7 +446,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
				 			case STARPU_CUDA_WORKER:
			
 
				 				workerarg->set = NULL;
			
 
				 				driver.id.cuda_id = cuda;
			
 
				-				if (_starpu_may_launch_driver(config->conf, &driver))
			
 
				+				if (_starpu_may_launch_driver(pconfig->conf, &driver))
			
 
				 				{
			
 
				 					_STARPU_PTHREAD_CREATE_ON(
			
 
				 						workerarg->name,
			
@@ -473,7 +473,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
				 			case STARPU_OPENCL_WORKER:
			
 
				 #ifndef STARPU_SIMGRID
			
 
				 				starpu_opencl_get_device(workerarg->devid, &driver.id.opencl_id);
			
 
				-				if (!_starpu_may_launch_driver(config->conf, &driver))
			
 
				+				if (!_starpu_may_launch_driver(pconfig->conf, &driver))
			
 
				 				{
			
 
				 					workerarg->run_by_starpu = 0;
			
 
				 					break;
			
@@ -504,7 +504,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
				 	cuda = 0;
			
 
				 	for (worker = 0; worker < nworkers; worker++)
			
 
				 	{
			
 
				-		struct _starpu_worker *workerarg = &config->workers[worker];
			
 
				+		struct _starpu_worker *workerarg = &pconfig->workers[worker];
			
 
				 		struct starpu_driver driver;
			
 
				 		driver.type = workerarg->arch;
			
 
				 
			
@@ -512,7 +512,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
				 		{
			
 
				 			case STARPU_CPU_WORKER:
			
 
				 				driver.id.cpu_id = cpu;
			
 
				-				if (!_starpu_may_launch_driver(config->conf, &driver))
			
 
				+				if (!_starpu_may_launch_driver(pconfig->conf, &driver))
			
 
				 				{
			
 
				 					cpu++;
			
 
				 					break;
			
@@ -526,7 +526,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
				 				break;
			
 
				 			case STARPU_CUDA_WORKER:
			
 
				 				driver.id.cuda_id = cuda;
			
 
				-				if (!_starpu_may_launch_driver(config->conf, &driver))
			
 
				+				if (!_starpu_may_launch_driver(pconfig->conf, &driver))
			
 
				 				{
			
 
				 					cuda++;
			
 
				 					break;
			
@@ -542,7 +542,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
				 			case STARPU_OPENCL_WORKER:
			
 
				 #ifndef STARPU_SIMGRID
			
 
				 				starpu_opencl_get_device(workerarg->devid, &driver.id.opencl_id);
			
 
				-				if (!_starpu_may_launch_driver(config->conf, &driver))
			
 
				+				if (!_starpu_may_launch_driver(pconfig->conf, &driver))
			
 
				 					break;
			
 
				 #endif
			
 
				 				_STARPU_DEBUG("waiting for worker %u initialization\n", worker);
			
@@ -817,19 +817,19 @@ void starpu_profiling_init()
 
				  * Handle runtime termination
			
 
				  */
			
 
				 
			
 
				-static void _starpu_terminate_workers(struct _starpu_machine_config *config)
			
 
				+static void _starpu_terminate_workers(struct _starpu_machine_config *pconfig)
			
 
				 {
			
 
				 	int status STARPU_ATTRIBUTE_UNUSED;
			
 
				 	unsigned workerid;
			
 
				 
			
 
				-	for (workerid = 0; workerid < config->topology.nworkers; workerid++)
			
 
				+	for (workerid = 0; workerid < pconfig->topology.nworkers; workerid++)
			
 
				 	{
			
 
				 		starpu_wake_all_blocked_workers();
			
 
				 
			
 
				 		_STARPU_DEBUG("wait for worker %u\n", workerid);
			
 
				 
			
 
				-		struct _starpu_worker_set *set = config->workers[workerid].set;
			
 
				-		struct _starpu_worker *worker = &config->workers[workerid];
			
 
				+		struct _starpu_worker_set *set = pconfig->workers[workerid].set;
			
 
				+		struct _starpu_worker *worker = &pconfig->workers[workerid];
			
 
				 
			
 
				 		/* in case StarPU termination code is called from a callback,
			
 
				  		 * we have to check if pthread_self() is the worker itself */
			
@@ -914,10 +914,10 @@ unsigned _starpu_worker_can_block(unsigned memnode STARPU_ATTRIBUTE_UNUSED)
 
				 #endif
			
 
				 }
			
 
				 
			
 
				-static void _starpu_kill_all_workers(struct _starpu_machine_config *config)
			
 
				+static void _starpu_kill_all_workers(struct _starpu_machine_config *pconfig)
			
 
				 {
			
 
				 	/* set the flag which will tell workers to stop */
			
 
				-	config->running = 0;
			
 
				+	pconfig->running = 0;
			
 
				 	/* running is just protected by a memory barrier */
			
 
				 	STARPU_WMB();
			
 
				 	starpu_wake_all_blocked_workers();
			
@@ -1299,7 +1299,7 @@ int starpu_worker_get_nids_ctx_free_by_type(enum starpu_archtype type, int *work
 
				 				if(config.sched_ctxs[s].id != STARPU_NMAX_SCHED_CTXS)
			
 
				 				{
			
 
				 					struct starpu_sched_ctx_worker_collection *workers = config.sched_ctxs[s].workers;
			
 
				-					struct starpu_iterator it;
			
 
				+					struct starpu_sched_ctx_iterator it;
			
 
				 					if(workers->init_iterator)
			
 
				 						workers->init_iterator(workers, &it);
			
 
				 
			
--- a/src/datawizard/coherency.c
+++ b/src/datawizard/coherency.c
@@ -458,6 +458,7 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 
				 			}
			
 
				 		}
			
 
				 		else
			
 
				+			/* The last request will perform the callback after termination */
			
 
				 			_starpu_data_request_append_callback(r, callback_func, callback_arg);
			
 
				 
			
 
				 
			
--- a/src/datawizard/copy_driver.c
+++ b/src/datawizard/copy_driver.c
@@ -134,8 +134,10 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 
				 	{
			
 
				 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CPU_RAM):
			
 
				 		/* STARPU_CPU_RAM -> STARPU_CPU_RAM */
			
 
				-		STARPU_ASSERT(copy_methods->ram_to_ram);
			
 
				-		copy_methods->ram_to_ram(src_interface, src_node, dst_interface, dst_node);
			
 
				+		if (copy_methods->ram_to_ram)
			
 
				+			copy_methods->ram_to_ram(src_interface, src_node, dst_interface, dst_node);
			
 
				+		else
			
 
				+			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			
 
				 		break;
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CPU_RAM):
			
@@ -143,11 +145,15 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 
				 #if !defined(HAVE_CUDA_MEMCPY_PEER)
			
 
				 		STARPU_ASSERT(_starpu_memory_node_get_local_key() == src_node);
			
 
				 #endif
			
 
				-		STARPU_ASSERT(copy_methods->cuda_to_ram);
			
 
				-		if (!req || !copy_methods->cuda_to_ram_async)
			
 
				+		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
			
 
				+				!(copy_methods->cuda_to_ram_async || copy_methods->any_to_any))
			
 
				 		{
			
 
				 			/* this is not associated to a request so it's synchronous */
			
 
				-			copy_methods->cuda_to_ram(src_interface, src_node, dst_interface, dst_node);
			
 
				+			STARPU_ASSERT(copy_methods->cuda_to_ram || copy_methods->any_to_any);
			
 
				+			if (copy_methods->cuda_to_ram)
			
 
				+				copy_methods->cuda_to_ram(src_interface, src_node, dst_interface, dst_node);
			
 
				+			else
			
 
				+				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				 		}
			
 
				 		else
			
 
				 		{
			
@@ -156,7 +162,13 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 
				 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 
			
 
				 			stream = starpu_cuda_get_local_out_transfer_stream();
			
 
				-			ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
			
 
				+			if (copy_methods->cuda_to_ram_async)
			
 
				+				ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
			
 
				+			else
			
 
				+			{
			
 
				+				STARPU_ASSERT(copy_methods->any_to_any);
			
 
				+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			
 
				+			}
			
 
				 
			
 
				 			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
			
 
				 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
			
@@ -168,11 +180,15 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 
				 #if !defined(HAVE_CUDA_MEMCPY_PEER)
			
 
				 		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node);
			
 
				 #endif
			
 
				-		STARPU_ASSERT(copy_methods->ram_to_cuda);
			
 
				-		if (!req || !copy_methods->ram_to_cuda_async)
			
 
				+		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
			
 
				+				!(copy_methods->ram_to_cuda_async || copy_methods->any_to_any))
			
 
				 		{
			
 
				 			/* this is not associated to a request so it's synchronous */
			
 
				-			copy_methods->ram_to_cuda(src_interface, src_node, dst_interface, dst_node);
			
 
				+			STARPU_ASSERT(copy_methods->ram_to_cuda || copy_methods->any_to_any);
			
 
				+			if (copy_methods->ram_to_cuda)
			
 
				+				copy_methods->ram_to_cuda(src_interface, src_node, dst_interface, dst_node);
			
 
				+			else
			
 
				+				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				 		}
			
 
				 		else
			
 
				 		{
			
@@ -182,7 +198,13 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 
				 				STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 
			
 
				 			stream = starpu_cuda_get_local_in_transfer_stream();
			
 
				-			ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
			
 
				+			if (copy_methods->ram_to_cuda_async)
			
 
				+				ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
			
 
				+			else
			
 
				+			{
			
 
				+				STARPU_ASSERT(copy_methods->any_to_any);
			
 
				+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			
 
				+			}
			
 
				 
			
 
				 			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
			
 
				 			if (STARPU_UNLIKELY(cures != cudaSuccess))
			
@@ -191,12 +213,15 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 
				 		break;
			
 
				 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):
			
 
				 		/* CUDA - CUDA transfer */
			
 
				-		STARPU_ASSERT(copy_methods->cuda_to_cuda || copy_methods->cuda_to_cuda_async);
			
 
				-		if (!req || !copy_methods->cuda_to_cuda_async)
			
 
				+		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
			
 
				+				!(copy_methods->cuda_to_cuda_async || copy_methods->any_to_any))
			
 
				 		{
			
 
				-			STARPU_ASSERT(copy_methods->cuda_to_cuda);
			
 
				+			STARPU_ASSERT(copy_methods->cuda_to_cuda || copy_methods->any_to_any);
			
 
				 			/* this is not associated to a request so it's synchronous */
			
 
				-			copy_methods->cuda_to_cuda(src_interface, src_node, dst_interface, dst_node);
			
 
				+			if (copy_methods->cuda_to_cuda)
			
 
				+				copy_methods->cuda_to_cuda(src_interface, src_node, dst_interface, dst_node);
			
 
				+			else
			
 
				+				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				 		}
			
 
				 		else
			
 
				 		{
			
@@ -205,7 +230,13 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 
				 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 
			
 
				 			stream = starpu_cuda_get_local_peer_transfer_stream();
			
 
				-			ret = copy_methods->cuda_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
			
 
				+			if (copy_methods->cuda_to_cuda_async)
			
 
				+				ret = copy_methods->cuda_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
			
 
				+			else
			
 
				+			{
			
 
				+				STARPU_ASSERT(copy_methods->any_to_any);
			
 
				+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			
 
				+			}
			
 
				 
			
 
				 			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
			
 
				 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
			
@@ -215,54 +246,77 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_CPU_RAM):
			
 
				 		/* OpenCL -> RAM */
			
 
				-		if (_starpu_memory_node_get_local_key() == src_node)
			
 
				+		STARPU_ASSERT(_starpu_memory_node_get_local_key() == src_node);
			
 
				+		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
			
 
				+				!(copy_methods->opencl_to_ram_async || copy_methods->any_to_any))
			
 
				 		{
			
 
				-			STARPU_ASSERT(copy_methods->opencl_to_ram);
			
 
				-			if (!req || !copy_methods->opencl_to_ram_async)
			
 
				-			{
			
 
				-				/* this is not associated to a request so it's synchronous */
			
 
				+			STARPU_ASSERT(copy_methods->opencl_to_ram || copy_methods->any_to_any);
			
 
				+			/* this is not associated to a request so it's synchronous */
			
 
				+			if (copy_methods->opencl_to_ram)
			
 
				 				copy_methods->opencl_to_ram(src_interface, src_node, dst_interface, dst_node);
			
 
				-			}
			
 
				 			else
			
 
				-			{
			
 
				-				req->async_channel.type = STARPU_OPENCL_RAM;
			
 
				-				ret = copy_methods->opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
			
 
				-			}
			
 
				+				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				 		}
			
 
				 		else
			
 
				 		{
			
 
				-			/* we should not have a blocking call ! */
			
 
				-			STARPU_ABORT();
			
 
				+			req->async_channel.type = STARPU_OPENCL_RAM;
			
 
				+			if (copy_methods->opencl_to_ram_async)
			
 
				+				ret = copy_methods->opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
			
 
				+			else
			
 
				+			{
			
 
				+				STARPU_ASSERT(copy_methods->any_to_any);
			
 
				+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			
 
				+			}
			
 
				 		}
			
 
				 		break;
			
 
				 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_OPENCL_RAM):
			
 
				 		/* STARPU_CPU_RAM -> STARPU_OPENCL_RAM */
			
 
				 		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node);
			
 
				-		STARPU_ASSERT(copy_methods->ram_to_opencl);
			
 
				-		if (!req || !copy_methods->ram_to_opencl_async)
			
 
				+		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
			
 
				+				!(copy_methods->ram_to_opencl_async || copy_methods->any_to_any))
			
 
				 		{
			
 
				+			STARPU_ASSERT(copy_methods->ram_to_opencl || copy_methods->any_to_any);
			
 
				 			/* this is not associated to a request so it's synchronous */
			
 
				-			copy_methods->ram_to_opencl(src_interface, src_node, dst_interface, dst_node);
			
 
				+			if (copy_methods->ram_to_opencl)
			
 
				+				copy_methods->ram_to_opencl(src_interface, src_node, dst_interface, dst_node);
			
 
				+			else
			
 
				+				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				 		}
			
 
				 		else
			
 
				 		{
			
 
				 			req->async_channel.type = STARPU_OPENCL_RAM;
			
 
				-			ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
			
 
				+			if (copy_methods->ram_to_opencl_async)
			
 
				+				ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
			
 
				+			else
			
 
				+			{
			
 
				+				STARPU_ASSERT(copy_methods->any_to_any);
			
 
				+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			
 
				+			}
			
 
				 		}
			
 
				 		break;
			
 
				 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_OPENCL_RAM):
			
 
				 		/* STARPU_OPENCL_RAM -> STARPU_OPENCL_RAM */
			
 
				 		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node || _starpu_memory_node_get_local_key() == src_node);
			
 
				-		STARPU_ASSERT(copy_methods->opencl_to_opencl);
			
 
				-		if (!req || !copy_methods->opencl_to_opencl_async)
			
 
				+		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
			
 
				+				!(copy_methods->opencl_to_opencl_async || copy_methods->any_to_any))
			
 
				 		{
			
 
				+			STARPU_ASSERT(copy_methods->opencl_to_opencl || copy_methods->any_to_any);
			
 
				 			/* this is not associated to a request so it's synchronous */
			
 
				-			copy_methods->opencl_to_opencl(src_interface, src_node, dst_interface, dst_node);
			
 
				+			if (copy_methods->opencl_to_opencl)
			
 
				+				copy_methods->opencl_to_opencl(src_interface, src_node, dst_interface, dst_node);
			
 
				+			else
			
 
				+				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				 		}
			
 
				 		else
			
 
				 		{
			
 
				 			req->async_channel.type = STARPU_OPENCL_RAM;
			
 
				-			ret = copy_methods->opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
			
 
				+			if (copy_methods->opencl_to_opencl_async)
			
 
				+				ret = copy_methods->opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
			
 
				+			else
			
 
				+			{
			
 
				+				STARPU_ASSERT(copy_methods->any_to_any);
			
 
				+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			
 
				+			}
			
 
				 		}
			
 
				 		break;
			
 
				 #endif
			
@@ -331,6 +385,64 @@ int __attribute__((warn_unused_result)) _starpu_driver_copy_data_1_to_1(starpu_d
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+/* This can be used by interfaces to easily transfer a piece of data without
			
 
				+ * caring about the particular CUDA/OpenCL methods.  */
			
 
				+
			
 
				+int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, void *async_data)
			
 
				+{
			
 
				+	struct _starpu_async_channel *async_channel = async_data;
			
 
				+	enum starpu_node_kind src_kind = starpu_node_get_kind(src_node);
			
 
				+	enum starpu_node_kind dst_kind = starpu_node_get_kind(dst_node);
			
 
				+
			
 
				+	switch (_STARPU_MEMORY_NODE_TUPLE(src_kind,dst_kind))
			
 
				+	{
			
 
				+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CPU_RAM):
			
 
				+		memcpy((void *) dst + dst_offset, (void *) src + src_offset, size);
			
 
				+		return 0;
			
 
				+
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CPU_RAM):
			
 
				+		return starpu_cuda_copy_async_sync(
			
 
				+				(void*) src + src_offset, src_node,
			
 
				+				(void*) dst + dst_offset, dst_node,
			
 
				+				size,
			
 
				+				async_channel?starpu_cuda_get_local_out_transfer_stream():NULL,
			
 
				+				cudaMemcpyDeviceToHost);
			
 
				+
			
 
				+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CUDA_RAM):
			
 
				+		return starpu_cuda_copy_async_sync(
			
 
				+				(void*) src + src_offset, src_node,
			
 
				+				(void*) dst + dst_offset, dst_node,
			
 
				+				size,
			
 
				+				async_channel?starpu_cuda_get_local_in_transfer_stream():NULL,
			
 
				+				cudaMemcpyHostToDevice);
			
 
				+
			
 
				+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):
			
 
				+		return starpu_cuda_copy_async_sync(
			
 
				+				(void*) src + src_offset, src_node,
			
 
				+				(void*) dst + dst_offset, dst_node,
			
 
				+				size,
			
 
				+				async_channel?starpu_cuda_get_local_peer_transfer_stream():NULL,
			
 
				+				cudaMemcpyDeviceToDevice);
			
 
				+
			
 
				+#endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_CPU_RAM):
			
 
				+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_OPENCL_RAM):
			
 
				+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_OPENCL_RAM):
			
 
				+		return starpu_opencl_copy_async_sync(
			
 
				+				src, src_offset, src_node,
			
 
				+				dst, dst_offset, dst_node,
			
 
				+				size,
			
 
				+				&async_channel->event.opencl_event);
			
 
				+#endif
			
 
				+	default:
			
 
				+		STARPU_ABORT();
			
 
				+		return -1;
			
 
				+	}
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				 void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_channel)
			
 
				 {
			
 
				 #ifdef STARPU_SIMGRID
			
--- a/src/datawizard/interfaces/bcsr_interface.c
+++ b/src/datawizard/interfaces/bcsr_interface.c
@@ -31,31 +31,11 @@
 
				  * BCSR : blocked CSR, we use blocks of size (r x c)
			
 
				  */
			
 
				 
			
 
				-static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-#endif
			
 
				-
			
 
				-static struct starpu_data_copy_methods bcsr_copy_data_methods_s =
			
 
				+static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
			
 
				+
			
 
				+static const struct starpu_data_copy_methods bcsr_copy_data_methods_s =
			
 
				 {
			
 
				-	.ram_to_ram = copy_ram_to_ram,
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-	.ram_to_cuda = copy_ram_to_cuda,
			
 
				-	.cuda_to_ram = copy_cuda_to_ram,
			
 
				-	.cuda_to_cuda = copy_cuda_to_cuda,
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-	.ram_to_opencl = copy_ram_to_opencl,
			
 
				-	.opencl_to_ram = copy_opencl_to_ram,
			
 
				-	.opencl_to_opencl = copy_opencl_to_opencl,
			
 
				-#endif
			
 
				+	.any_to_any = copy_any_to_any,
			
 
				 };
			
 
				 
			
 
				 static void register_bcsr_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface);
			
@@ -315,105 +295,7 @@ static void free_bcsr_buffer_on_node(void *data_interface, unsigned node)
 
				 	starpu_free_buffer_on_node(node, (uintptr_t) bcsr_interface->rowptr, (nrow+1)*sizeof(uint32_t));
			
 
				 }
			
 
				 
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
			
 
				-{
			
 
				-	struct starpu_bcsr_interface *src_bcsr = src_interface;
			
 
				-	struct starpu_bcsr_interface *dst_bcsr = dst_interface;
			
 
				-
			
 
				-	uint32_t nnz = src_bcsr->nnz;
			
 
				-	uint32_t nrow = src_bcsr->nrow;
			
 
				-	size_t elemsize = src_bcsr->elemsize;
			
 
				-
			
 
				-	uint32_t r = src_bcsr->r;
			
 
				-	uint32_t c = src_bcsr->c;
			
 
				-
			
 
				-	cudaError_t cures;
			
 
				-
			
 
				-	cures = cudaMemcpy((char *)dst_bcsr->nzval, (char *)src_bcsr->nzval, nnz*r*c*elemsize, kind);
			
 
				-	if (STARPU_UNLIKELY(cures))
			
 
				-		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				-
			
 
				-	cures = cudaMemcpy((char *)dst_bcsr->colind, (char *)src_bcsr->colind, nnz*sizeof(uint32_t), kind);
			
 
				-	if (STARPU_UNLIKELY(cures))
			
 
				-		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				-
			
 
				-	cures = cudaMemcpy((char *)dst_bcsr->rowptr, (char *)src_bcsr->rowptr, (nrow+1)*sizeof(uint32_t), kind);
			
 
				-	if (STARPU_UNLIKELY(cures))
			
 
				-		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				-
			
 
				-	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
			
 
				-
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
			
 
				-}
			
 
				-
			
 
				-static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
			
 
				-}
			
 
				-#endif // STARPU_USE_CUDA
			
 
				-
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-static int copy_opencl_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-	struct starpu_bcsr_interface *src_bcsr = src_interface;
			
 
				-	struct starpu_bcsr_interface *dst_bcsr = dst_interface;
			
 
				-
			
 
				-	uint32_t nnz = src_bcsr->nnz;
			
 
				-	uint32_t nrow = src_bcsr->nrow;
			
 
				-	size_t elemsize = src_bcsr->elemsize;
			
 
				-
			
 
				-	uint32_t r = src_bcsr->r;
			
 
				-	uint32_t c = src_bcsr->c;
			
 
				-
			
 
				-        int err;
			
 
				-
			
 
				-	err = starpu_opencl_copy_async_sync(src_bcsr->nzval, src_node, 0, dst_bcsr->nzval, dst_node, 0, nnz*r*c*elemsize, NULL);
			
 
				-	if (STARPU_UNLIKELY(err))
			
 
				-		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-
			
 
				-	err = starpu_opencl_copy_async_sync((uintptr_t)src_bcsr->colind, src_node, 0, (uintptr_t)dst_bcsr->colind, dst_node, 0, nnz*sizeof(uint32_t), NULL);
			
 
				-	if (STARPU_UNLIKELY(err))
			
 
				-		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-
			
 
				-	err = starpu_opencl_copy_async_sync((uintptr_t)src_bcsr->rowptr, src_node, 0, (uintptr_t)dst_bcsr->rowptr, dst_node, 0, (nrow+1)*sizeof(uint32_t), NULL);
			
 
				-	if (STARPU_UNLIKELY(err))
			
 
				-		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-
			
 
				-	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
			
 
				-
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				-static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
			
 
				-}
			
 
				-
			
 
				-static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
			
 
				-}
			
 
				-
			
 
				-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
			
 
				-}
			
 
				-
			
 
				-#endif // STARPU_USE_OPENCL
			
 
				-
			
 
				-/* as not all platform easily have a BLAS lib installed ... */
			
 
				-static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				+static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data)
			
 
				 {
			
 
				 	struct starpu_bcsr_interface *src_bcsr = (struct starpu_bcsr_interface *) src_interface;
			
 
				 	struct starpu_bcsr_interface *dst_bcsr = (struct starpu_bcsr_interface *) dst_interface;
			
@@ -425,13 +307,18 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBU
 
				 	uint32_t r = src_bcsr->r;
			
 
				 	uint32_t c = src_bcsr->c;
			
 
				 
			
 
				-	memcpy((void *)dst_bcsr->nzval, (void *)src_bcsr->nzval, nnz*elemsize*r*c);
			
 
				+	int ret = 0;
			
 
				+
			
 
				+	if (starpu_interface_copy(src_bcsr->nzval, 0, src_node, dst_bcsr->nzval, 0, dst_node, nnz*elemsize*r*c, async_data))
			
 
				+		ret = -EAGAIN;
			
 
				 
			
 
				-	memcpy((void *)dst_bcsr->colind, (void *)src_bcsr->colind, nnz*sizeof(uint32_t));
			
 
				+	if (starpu_interface_copy((uintptr_t)src_bcsr->colind, 0, src_node, (uintptr_t)dst_bcsr->colind, 0, dst_node, nnz*sizeof(uint32_t), async_data))
			
 
				+		ret = -EAGAIN;
			
 
				 
			
 
				-	memcpy((void *)dst_bcsr->rowptr, (void *)src_bcsr->rowptr, (nrow+1)*sizeof(uint32_t));
			
 
				+	if (starpu_interface_copy((uintptr_t)src_bcsr->rowptr, 0, src_node, (uintptr_t)dst_bcsr->rowptr, 0, dst_node, (nrow+1)*sizeof(uint32_t), async_data))
			
 
				+		ret = -EAGAIN;
			
 
				 
			
 
				 	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize*r*c + (nnz+nrow+1)*sizeof(uint32_t));
			
 
				 
			
 
				-	return 0;
			
 
				+	return ret;
			
 
				 }
			
--- a/src/datawizard/interfaces/block_interface.c
+++ b/src/datawizard/interfaces/block_interface.c
@@ -44,7 +44,7 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARP
 
				 static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
			
 
				 #endif
			
 
				 
			
 
				-static struct starpu_data_copy_methods block_copy_data_methods_s =
			
 
				+static const struct starpu_data_copy_methods block_copy_data_methods_s =
			
 
				 {
			
 
				 	.ram_to_ram = copy_ram_to_ram,
			
 
				 #ifdef STARPU_USE_CUDA
			
@@ -350,7 +350,7 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 
				 	}
			
 
				 	else
			
 
				 	{
			
 
				-		/* Default case: we transfer all lines one by one: ny*nz transfers */
			
 
				+		/* Default case: we transfer all blocks one by one: nz transfers */
			
 
				 		unsigned layer;
			
 
				 		for (layer = 0; layer < src_block->nz; layer++)
			
 
				 		{
			
@@ -420,7 +420,7 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 
				 	}
			
 
				 	else
			
 
				 	{
			
 
				-		/* Default case: we transfer all lines one by one: ny*nz transfers */
			
 
				+		/* Default case: we transfer all blocks one by one: nz 2D transfers */
			
 
				 		unsigned layer;
			
 
				 		for (layer = 0; layer < src_block->nz; layer++)
			
 
				 		{
			
@@ -514,8 +514,8 @@ static int copy_opencl_common(void *src_interface, unsigned src_node, void *dst_
 
				 		/* Is that a single contiguous buffer ? */
			
 
				 		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
			
 
				 		{
			
 
				-			ret = starpu_opencl_copy_async_sync(src_block->dev_handle, src_node, src_block->offset,
			
 
				-								dst_block->dev_handle, dst_node, dst_block->offset,
			
 
				+			ret = starpu_opencl_copy_async_sync(src_block->dev_handle, src_block->offset, src_node,
			
 
				+								dst_block->dev_handle, dst_block->offset, dst_node,
			
 
				 							       src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
			
 
				 							       event);
			
 
				                 }
			
@@ -535,10 +535,12 @@ static int copy_opencl_common(void *src_interface, unsigned src_node, void *dst_
 
				                         unsigned j;
			
 
				                         for(j=0 ; j<src_block->ny ; j++)
			
 
				 			{
			
 
				-				ret = starpu_opencl_copy_async_sync(src_block->dev_handle, src_node,
			
 
				+				ret = starpu_opencl_copy_async_sync(src_block->dev_handle,
			
 
				 								    src_block->offset + layer*src_block->ldz*src_block->elemsize + j*src_block->ldy*src_block->elemsize,
			
 
				-								    dst_block->dev_handle, dst_node,
			
 
				+								    src_node,
			
 
				+								    dst_block->dev_handle,
			
 
				 								    dst_block->offset + layer*dst_block->ldz*dst_block->elemsize + j*dst_block->ldy*dst_block->elemsize,
			
 
				+								    dst_node,
			
 
				 								       src_block->nx*src_block->elemsize,
			
 
				 								       event);
			
 
				                         }
			
--- a/src/datawizard/interfaces/coo_interface.c
+++ b/src/datawizard/interfaces/coo_interface.c
@@ -19,190 +19,36 @@
 
				 #include <datawizard/memalloc.h>
			
 
				 
			
 
				 static int
			
 
				-copy_ram_to_ram(void *src_interface, STARPU_ATTRIBUTE_UNUSED unsigned src_node,
			
 
				-		void *dst_interface, STARPU_ATTRIBUTE_UNUSED unsigned dst_node)
			
 
				+copy_any_to_any(void *src_interface, unsigned src_node,
			
 
				+		void *dst_interface, unsigned dst_node, void *async_data)
			
 
				 {
			
 
				 	size_t size = 0;
			
 
				 	struct starpu_coo_interface *src_coo, *dst_coo;
			
 
				-
			
 
				-	src_coo = (struct starpu_coo_interface *) src_interface;
			
 
				-	dst_coo = (struct starpu_coo_interface *) dst_interface;
			
 
				-
			
 
				-	size = src_coo->n_values * sizeof(src_coo->columns[0]);
			
 
				-	memcpy((void *) dst_coo->columns, (void *) src_coo->columns, size);
			
 
				-
			
 
				-	/* sizeof(src_coo->columns[0]) == sizeof(src_coo->rows[0]) */
			
 
				-	memcpy((void *) dst_coo->rows, (void *) src_coo->rows, size);
			
 
				-
			
 
				-	size = src_coo->n_values * src_coo->elemsize;
			
 
				-	memcpy((void *) dst_coo->values, (void *) src_coo->values, size);
			
 
				-
			
 
				-	_STARPU_TRACE_DATA_COPY(src_node, dst_node,
			
 
				-		src_coo->n_values *
			
 
				-		(2 * sizeof(src_coo->rows[0]) + src_coo->elemsize));
			
 
				-
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-static int
			
 
				-copy_cuda_async_sync(void *src_interface, unsigned src_node,
			
 
				-		     void *dst_interface, unsigned dst_node,
			
 
				-		     cudaStream_t stream, enum cudaMemcpyKind kind)
			
 
				-{
			
 
				-	int ret;
			
 
				-	size_t size = 0;
			
 
				-	struct starpu_coo_interface *src_coo, *dst_coo;
			
 
				-
			
 
				-	src_coo = (struct starpu_coo_interface *) src_interface;
			
 
				-	dst_coo = (struct starpu_coo_interface *) dst_interface;
			
 
				-
			
 
				-	size = src_coo->n_values * sizeof(src_coo->columns[0]);
			
 
				-	ret = starpu_cuda_copy_async_sync(
			
 
				-		(void *) src_coo->columns,
			
 
				-		src_node,
			
 
				-		(void *) dst_coo->columns,
			
 
				-		dst_node,
			
 
				-		size,
			
 
				-		stream,
			
 
				-		kind);
			
 
				-	if (ret == 0)
			
 
				-		stream = NULL;
			
 
				-
			
 
				-	/* sizeof(src_coo->columns[0]) == sizeof(src_coo->rows[0]) */
			
 
				-	ret = starpu_cuda_copy_async_sync(
			
 
				-		(void *) src_coo->rows,
			
 
				-		src_node,
			
 
				-		(void *) dst_coo->rows,
			
 
				-		dst_node,
			
 
				-		size,
			
 
				-		stream,
			
 
				-		kind);
			
 
				-	if (ret == 0)
			
 
				-		stream = NULL;
			
 
				-
			
 
				-	size = src_coo->n_values * src_coo->elemsize;
			
 
				-	ret = starpu_cuda_copy_async_sync(
			
 
				-		(void *) src_coo->values,
			
 
				-		src_node,
			
 
				-		(void *) dst_coo->values,
			
 
				-		dst_node,
			
 
				-		size,
			
 
				-		stream,
			
 
				-		kind);
			
 
				-
			
 
				-	_STARPU_TRACE_DATA_COPY(src_node, dst_node,
			
 
				-		src_coo->n_values *
			
 
				-		(2 * sizeof(src_coo->rows[0]) + src_coo->elemsize));
			
 
				-	return ret;
			
 
				-}
			
 
				-
			
 
				-static int
			
 
				-copy_ram_to_cuda(void *src_interface, unsigned src_node,
			
 
				-		 void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node,
			
 
				-				    dst_interface, dst_node,
			
 
				-				    NULL, cudaMemcpyHostToDevice);
			
 
				-}
			
 
				-
			
 
				-static int
			
 
				-copy_cuda_to_ram(void *src_interface, unsigned src_node,
			
 
				-		 void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node,
			
 
				-				    dst_interface, dst_node,
			
 
				-				    NULL, cudaMemcpyDeviceToHost);
			
 
				-}
			
 
				-
			
 
				-static int
			
 
				-copy_ram_to_cuda_async(void *src_interface, unsigned src_node,
			
 
				-		       void *dst_interface, unsigned dst_node,
			
 
				-		       cudaStream_t stream)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node,
			
 
				-				    dst_interface, dst_node,
			
 
				-				    stream, cudaMemcpyHostToDevice);
			
 
				-}
			
 
				-
			
 
				-static int
			
 
				-copy_cuda_to_ram_async(void *src_interface, unsigned src_node,
			
 
				-		       void *dst_interface, unsigned dst_node,
			
 
				-		       cudaStream_t stream)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node,
			
 
				-				    dst_interface, dst_node,
			
 
				-				    stream, cudaMemcpyDeviceToHost);
			
 
				-}
			
 
				-
			
 
				-static int
			
 
				-copy_cuda_to_cuda(void *src_interface, unsigned src_node,
			
 
				-		  void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node,
			
 
				-				    dst_interface, dst_node,
			
 
				-				    NULL, cudaMemcpyDeviceToDevice);
			
 
				-}
			
 
				-
			
 
				-#ifdef NO_STRIDE
			
 
				-static int
			
 
				-copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,
			
 
				-			void *dst_interface, unsigned dst_node,
			
 
				-			cudaStream_t stream)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node,
			
 
				-				    dst_interface, dst_node,
			
 
				-				    stream, cudaMemcpyDeviceToDevice);
			
 
				-}
			
 
				-#endif /* !NO_STRIDE */
			
 
				-#endif /* !STARPU_USE_CUDA */
			
 
				-
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-static int
			
 
				-copy_opencl_common(void *src_interface, unsigned src_node,
			
 
				-			 void *dst_interface, unsigned dst_node,
			
 
				-			 cl_event *event)
			
 
				-{
			
 
				 	int ret = 0;
			
 
				-	size_t size = 0;
			
 
				-	struct starpu_coo_interface *src_coo, *dst_coo;
			
 
				 
			
 
				 	src_coo = (struct starpu_coo_interface *) src_interface;
			
 
				 	dst_coo = (struct starpu_coo_interface *) dst_interface;
			
 
				 
			
 
				-
			
 
				 	size = src_coo->n_values * sizeof(src_coo->columns[0]);
			
 
				-	ret = starpu_opencl_copy_async_sync(
			
 
				-		(uintptr_t) src_coo->columns,
			
 
				-		src_node,
			
 
				-		0,
			
 
				-		(uintptr_t) dst_coo->columns,
			
 
				-		dst_node,
			
 
				-		0,
			
 
				-		size,
			
 
				-		NULL);
			
 
				+	if (starpu_interface_copy(
			
 
				+		(uintptr_t) src_coo->columns, 0, src_node,
			
 
				+		(uintptr_t) dst_coo->columns, 0, dst_node,
			
 
				+		size, async_data))
			
 
				+		ret = -EAGAIN;
			
 
				 
			
 
				 	/* sizeof(src_coo->columns[0]) == sizeof(src_coo->rows[0]) */
			
 
				-	ret = starpu_opencl_copy_async_sync(
			
 
				-		(uintptr_t) src_coo->rows,
			
 
				-		src_node,
			
 
				-		0,
			
 
				-		(uintptr_t) dst_coo->rows,
			
 
				-		dst_node,
			
 
				-		0,
			
 
				-		size,
			
 
				-		NULL);
			
 
				+	if (starpu_interface_copy(
			
 
				+		(uintptr_t) src_coo->rows, 0, src_node,
			
 
				+		(uintptr_t) dst_coo->rows, 0, dst_node,
			
 
				+		size, async_data))
			
 
				+		ret = -EAGAIN;
			
 
				 
			
 
				 	size = src_coo->n_values * src_coo->elemsize;
			
 
				-	ret = starpu_opencl_copy_async_sync(
			
 
				-		src_coo->values,
			
 
				-		src_node,
			
 
				-		0,
			
 
				-		(uintptr_t) dst_coo->values,
			
 
				-		dst_node,
			
 
				-		0,
			
 
				-		size,
			
 
				-		event);
			
 
				+	if (starpu_interface_copy(
			
 
				+		src_coo->values, 0, src_node,
			
 
				+		dst_coo->values, 0, dst_node,
			
 
				+		size, async_data))
			
 
				+		ret = -EAGAIN;
			
 
				 
			
 
				 	_STARPU_TRACE_DATA_COPY(src_node, dst_node,
			
 
				 		src_coo->n_values *
			
@@ -211,83 +57,9 @@ copy_opencl_common(void *src_interface, unsigned src_node,
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-static int
			
 
				-copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
			
 
				-			 void *dst_interface, unsigned dst_node,
			
 
				-			 cl_event *event)
			
 
				-{
			
 
				-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
			
 
				-}
			
 
				-
			
 
				-static int
			
 
				-copy_opencl_to_ram_async(void *src_interface, unsigned src_node,
			
 
				-			 void *dst_interface, unsigned dst_node,
			
 
				-			 cl_event *event)
			
 
				-{
			
 
				-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
			
 
				-}
			
 
				-
			
 
				-static int
			
 
				-copy_opencl_to_opencl_async(void *src_interface, unsigned src_node,
			
 
				-			 void *dst_interface, unsigned dst_node,
			
 
				-			 cl_event *event)
			
 
				-{
			
 
				-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
			
 
				-}
			
 
				-
			
 
				-static int
			
 
				-copy_ram_to_opencl(void *src_interface, unsigned src_node,
			
 
				-		   void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_ram_to_opencl_async(src_interface, src_node,
			
 
				-					dst_interface, dst_node,
			
 
				-					NULL);
			
 
				-}
			
 
				-static int
			
 
				-copy_opencl_to_ram(void *src_interface, unsigned src_node,
			
 
				-		   void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_opencl_to_ram_async(src_interface, src_node,
			
 
				-					dst_interface, dst_node,
			
 
				-					NULL);
			
 
				-}
			
 
				-static int
			
 
				-copy_opencl_to_opencl(void *src_interface, unsigned src_node,
			
 
				-		   void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_opencl_to_opencl_async(src_interface, src_node,
			
 
				-					dst_interface, dst_node,
			
 
				-					NULL);
			
 
				-}
			
 
				-#endif /* !STARPU_USE_OPENCL */
			
 
				-
			
 
				-static struct starpu_data_copy_methods coo_copy_data_methods =
			
 
				+static const struct starpu_data_copy_methods coo_copy_data_methods =
			
 
				 {
			
 
				-	.ram_to_ram          = copy_ram_to_ram,
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-	.ram_to_cuda         = copy_ram_to_cuda,
			
 
				-	.cuda_to_ram         = copy_cuda_to_ram,
			
 
				-	.ram_to_cuda_async   = copy_ram_to_cuda_async,
			
 
				-	.cuda_to_ram_async   = copy_cuda_to_ram_async,
			
 
				-	.cuda_to_cuda        = copy_cuda_to_cuda,
			
 
				-#ifdef NO_STRIDE
			
 
				-	.cuda_to_cuda_async  = copy_cuda_to_cuda_async,
			
 
				-#endif
			
 
				-#else
			
 
				-#ifdef STARPU_SIMGRID
			
 
				-#ifdef NO_STRIDE
			
 
				-	/* Enable GPU-GPU transfers in simgrid */
			
 
				-	.cuda_to_cuda_async = 1,
			
 
				-#endif
			
 
				-#endif
			
 
				-#endif /* !STARPU_USE_CUDA */
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-	.ram_to_opencl       = copy_ram_to_opencl,
			
 
				-	.opencl_to_ram       = copy_opencl_to_ram,
			
 
				-	.opencl_to_opencl    = copy_opencl_to_opencl,
			
 
				-	.ram_to_opencl_async = copy_ram_to_opencl_async,
			
 
				-	.opencl_to_opencl_async = copy_opencl_to_opencl_async,
			
 
				-#endif /* !STARPU_USE_OPENCL */
			
 
				+	.any_to_any          = copy_any_to_any,
			
 
				 };
			
 
				 
			
 
				 static void
			
--- a/src/datawizard/interfaces/csr_interface.c
+++ b/src/datawizard/interfaces/csr_interface.c
@@ -28,42 +28,11 @@
 
				 #include <starpu_opencl.h>
			
 
				 #include <drivers/opencl/driver_opencl.h>
			
 
				 
			
 
				-static int copy_ram_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
 
				-static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
 
				-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
 
				-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
			
 
				-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
			
 
				-static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
 
				-static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
 
				-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
 
				-#endif
			
 
				-
			
 
				-static struct starpu_data_copy_methods csr_copy_data_methods_s =
			
 
				+static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
			
 
				+
			
 
				+static const struct starpu_data_copy_methods csr_copy_data_methods_s =
			
 
				 {
			
 
				-	.ram_to_ram = copy_ram_to_ram,
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-	.ram_to_cuda = copy_ram_to_cuda,
			
 
				-	.cuda_to_ram = copy_cuda_to_ram,
			
 
				-	.cuda_to_cuda = copy_cuda_to_cuda,
			
 
				-	.ram_to_cuda_async = copy_ram_to_cuda_async,
			
 
				-	.cuda_to_ram_async = copy_cuda_to_ram_async,
			
 
				-	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
			
 
				-#else
			
 
				-#ifdef STARPU_SIMGRID
			
 
				-	/* Enable GPU-GPU transfers in simgrid */
			
 
				-	.cuda_to_cuda_async = 1,
			
 
				-#endif
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-	.ram_to_opencl = copy_ram_to_opencl,
			
 
				-	.opencl_to_ram = copy_opencl_to_ram,
			
 
				-	.opencl_to_opencl = copy_opencl_to_opencl,
			
 
				-#endif
			
 
				+	.any_to_any = copy_any_to_any,
			
 
				 };
			
 
				 
			
 
				 static void register_csr_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface);
			
@@ -293,188 +262,8 @@ static void free_csr_buffer_on_node(void *data_interface, unsigned node)
 
				 	starpu_free_buffer_on_node(node, (uintptr_t) csr_interface->rowptr, (nrow+1)*sizeof(uint32_t));
			
 
				 }
			
 
				 
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-static int copy_cuda_async_sync(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind, cudaStream_t stream)
			
 
				-{
			
 
				-	struct starpu_csr_interface *src_csr = src_interface;
			
 
				-	struct starpu_csr_interface *dst_csr = dst_interface;
			
 
				-
			
 
				-	uint32_t nnz = src_csr->nnz;
			
 
				-	uint32_t nrow = src_csr->nrow;
			
 
				-	size_t elemsize = src_csr->elemsize;
			
 
				-
			
 
				-	cudaStream_t sstream = stream;
			
 
				-	int ret;
			
 
				-
			
 
				-	ret = starpu_cuda_copy_async_sync((void *)src_csr->nzval, src_node, (void *)dst_csr->nzval, dst_node, nnz*elemsize, sstream, kind);
			
 
				-	if (ret == 0) sstream = NULL;
			
 
				-
			
 
				-	ret = starpu_cuda_copy_async_sync((void *)src_csr->colind, src_node, (void *)dst_csr->colind, dst_node, nnz*sizeof(uint32_t), sstream, kind);
			
 
				-	if (ret == 0) sstream = NULL;
			
 
				-
			
 
				-	ret = starpu_cuda_copy_async_sync((void *)src_csr->rowptr, src_node, (void *)dst_csr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), sstream, kind);
			
 
				-
			
 
				-	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
			
 
				-	return ret;
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_peer_async(void *src_interface STARPU_ATTRIBUTE_UNUSED, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
			
 
				-				void *dst_interface STARPU_ATTRIBUTE_UNUSED, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-#ifdef HAVE_CUDA_MEMCPY_PEER
			
 
				-	struct starpu_csr_interface *src_csr = src_interface;
			
 
				-	struct starpu_csr_interface *dst_csr = dst_interface;
			
 
				-
			
 
				-	uint32_t nnz = src_csr->nnz;
			
 
				-	uint32_t nrow = src_csr->nrow;
			
 
				-	size_t elemsize = src_csr->elemsize;
			
 
				-
			
 
				-	cudaError_t cures;
			
 
				-
			
 
				-	int src_dev = _starpu_memory_node_get_devid(src_node);
			
 
				-	int dst_dev = _starpu_memory_node_get_devid(dst_node);
			
 
				-
			
 
				-	int synchronous_fallback = 0;
			
 
				-
			
 
				-	_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
			
 
				-	cures = cudaMemcpyPeerAsync((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize, stream);
			
 
				-	if (cures)
			
 
				-	{
			
 
				-		synchronous_fallback = 1;
			
 
				-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
			
 
				-		cures = cudaMemcpyPeer((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize);
			
 
				-		if (STARPU_UNLIKELY(cures))
			
 
				-			STARPU_CUDA_REPORT_ERROR(cures);
			
 
				-	}
			
 
				-
			
 
				-	if (!synchronous_fallback)
			
 
				-	{
			
 
				-		cures = cudaMemcpyPeerAsync((char *)dst_csr->colind, dst_dev, (char *)src_csr->colind, src_dev, nnz*sizeof(uint32_t), stream);
			
 
				-	}
			
 
				-
			
 
				-	if (synchronous_fallback || cures != cudaSuccess)
			
 
				-	{
			
 
				-		synchronous_fallback = 1;
			
 
				-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
			
 
				-		cures = cudaMemcpyPeer((char *)dst_csr->colind, dst_dev, (char *)src_csr->colind, src_dev, nnz*sizeof(uint32_t));
			
 
				-		if (STARPU_UNLIKELY(cures))
			
 
				-			STARPU_CUDA_REPORT_ERROR(cures);
			
 
				-	}
			
 
				-
			
 
				-	if (!synchronous_fallback)
			
 
				-	{
			
 
				-		cures = cudaMemcpyPeerAsync((char *)dst_csr->rowptr, dst_dev, (char *)src_csr->rowptr, src_dev, (nrow+1)*sizeof(uint32_t), stream);
			
 
				-	}
			
 
				-
			
 
				-	if (synchronous_fallback || cures != cudaSuccess)
			
 
				-	{
			
 
				-		synchronous_fallback = 1;
			
 
				-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
			
 
				-		cures = cudaMemcpyPeer((char *)dst_csr->rowptr, dst_dev, (char *)src_csr->rowptr, src_dev, (nrow+1)*sizeof(uint32_t));
			
 
				-		if (STARPU_UNLIKELY(cures))
			
 
				-			STARPU_CUDA_REPORT_ERROR(cures);
			
 
				-	}
			
 
				-
			
 
				-	if (synchronous_fallback)
			
 
				-	{
			
 
				-		_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
			
 
				-		return 0;
			
 
				-	}
			
 
				-	else
			
 
				-	{
			
 
				-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
			
 
				-		return -EAGAIN;
			
 
				-	}
			
 
				-#else
			
 
				-	/* Illegal without Peer tranfers */
			
 
				-	STARPU_ABORT();
			
 
				-	return 0;
			
 
				-#endif
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, NULL);
			
 
				-}
			
 
				-
			
 
				-static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, NULL);
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, NULL);
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, stream);
			
 
				-}
			
 
				-
			
 
				-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, stream);
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
			
 
				-{
			
 
				-	if (src_node == dst_node)
			
 
				-		return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, stream);
			
 
				-	else
			
 
				-		return copy_cuda_peer_async(src_interface, src_node, dst_interface, dst_node, stream);
			
 
				-}
			
 
				-
			
 
				-#endif // STARPU_USE_CUDA
			
 
				-
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-static int copy_opencl_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-	struct starpu_csr_interface *src_csr = src_interface;
			
 
				-	struct starpu_csr_interface *dst_csr = dst_interface;
			
 
				-
			
 
				-	uint32_t nnz = src_csr->nnz;
			
 
				-	uint32_t nrow = src_csr->nrow;
			
 
				-	size_t elemsize = src_csr->elemsize;
			
 
				-
			
 
				-        int err;
			
 
				-
			
 
				-	err = starpu_opencl_copy_async_sync(src_csr->nzval, src_node, 0, dst_csr->nzval, dst_node, 0, nnz*elemsize, NULL);
			
 
				-	if (STARPU_UNLIKELY(err))
			
 
				-                STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-
			
 
				-	err = starpu_opencl_copy_async_sync((uintptr_t)src_csr->colind, src_node, 0, (uintptr_t)dst_csr->colind, dst_node, 0, nnz*sizeof(uint32_t), NULL);
			
 
				-        if (STARPU_UNLIKELY(err))
			
 
				-                STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-
			
 
				-	err = starpu_opencl_copy_async_sync((uintptr_t)src_csr->rowptr, src_node, 0, (uintptr_t)dst_csr->rowptr, dst_node, 0, (nrow+1)*sizeof(uint32_t), NULL);
			
 
				-	if (STARPU_UNLIKELY(err))
			
 
				-                STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-
			
 
				-	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
			
 
				-
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				-static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
			
 
				-}
			
 
				-
			
 
				-static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
			
 
				-}
			
 
				-
			
 
				-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
			
 
				-}
			
 
				-
			
 
				-#endif // STARPU_USE_OPENCL
			
 
				-
			
 
				 /* as not all platform easily have a BLAS lib installed ... */
			
 
				-static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				+static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data)
			
 
				 {
			
 
				 	struct starpu_csr_interface *src_csr = (struct starpu_csr_interface *) src_interface;
			
 
				 	struct starpu_csr_interface *dst_csr = (struct starpu_csr_interface *) dst_interface;
			
@@ -482,14 +271,18 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBU
 
				 	uint32_t nnz = src_csr->nnz;
			
 
				 	uint32_t nrow = src_csr->nrow;
			
 
				 	size_t elemsize = src_csr->elemsize;
			
 
				+	int ret = 0;
			
 
				 
			
 
				-	memcpy((void *)dst_csr->nzval, (void *)src_csr->nzval, nnz*elemsize);
			
 
				+	if (starpu_interface_copy(src_csr->nzval, 0, src_node, dst_csr->nzval, 0, dst_node, nnz*elemsize, async_data))
			
 
				+		ret = -EAGAIN;
			
 
				 
			
 
				-	memcpy((void *)dst_csr->colind, (void *)src_csr->colind, nnz*sizeof(uint32_t));
			
 
				+	if (starpu_interface_copy((uintptr_t)src_csr->colind, 0, src_node, (uintptr_t)dst_csr->colind, 0, dst_node, nnz*sizeof(uint32_t), async_data))
			
 
				+		ret = -EAGAIN;
			
 
				 
			
 
				-	memcpy((void *)dst_csr->rowptr, (void *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t));
			
 
				+	if (starpu_interface_copy((uintptr_t)src_csr->rowptr, 0, src_node, (uintptr_t)dst_csr->rowptr, 0, dst_node, (nrow+1)*sizeof(uint32_t), async_data))
			
 
				+		ret = -EAGAIN;
			
 
				 
			
 
				 	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
			
 
				 
			
 
				-	return 0;
			
 
				+	return ret;
			
 
				 }
			
--- a/src/datawizard/interfaces/data_interface.c
+++ b/src/datawizard/interfaces/data_interface.c
@@ -290,41 +290,6 @@ void starpu_data_register(starpu_data_handle_t *handleptr, unsigned home_node,
 
				 	STARPU_ASSERT(handleptr);
			
 
				 	*handleptr = handle;
			
 
				 
			
 
				-	int asynchronous_copy_disabled = starpu_asynchronous_copy_disabled();
			
 
				-	if (STARPU_UNLIKELY(asynchronous_copy_disabled))
			
 
				-	{
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-		ops->copy_methods->ram_to_cuda_async = NULL;
			
 
				-		ops->copy_methods->cuda_to_ram_async = NULL;
			
 
				-		ops->copy_methods->cuda_to_cuda_async = NULL;
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-		ops->copy_methods->ram_to_opencl_async = NULL;
			
 
				-		ops->copy_methods->opencl_to_ram_async = NULL;
			
 
				-		ops->copy_methods->opencl_to_opencl_async = NULL;
			
 
				-#endif
			
 
				-	}
			
 
				-
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-	int asynchronous_cuda_copy_disabled = starpu_asynchronous_cuda_copy_disabled();
			
 
				-	if (STARPU_UNLIKELY(asynchronous_cuda_copy_disabled))
			
 
				-	{
			
 
				-		ops->copy_methods->ram_to_cuda_async = NULL;
			
 
				-		ops->copy_methods->cuda_to_ram_async = NULL;
			
 
				-		ops->copy_methods->cuda_to_cuda_async = NULL;
			
 
				-	}
			
 
				-#endif
			
 
				-
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-	int asynchronous_opencl_copy_disabled = starpu_asynchronous_opencl_copy_disabled();
			
 
				-	if (STARPU_UNLIKELY(asynchronous_opencl_copy_disabled))
			
 
				-	{
			
 
				-		ops->copy_methods->ram_to_opencl_async = NULL;
			
 
				-		ops->copy_methods->opencl_to_ram_async = NULL;
			
 
				-		ops->copy_methods->opencl_to_opencl_async = NULL;
			
 
				-	}
			
 
				-#endif
			
 
				-
			
 
				 	/* fill the interface fields with the appropriate method */
			
 
				 	STARPU_ASSERT(ops->register_data_handle);
			
 
				 	ops->register_data_handle(handle, home_node, data_interface);
			
@@ -618,7 +583,8 @@ void starpu_data_unregister_no_coherency(starpu_data_handle_t handle)
 
				 	_starpu_data_unregister(handle, 0);
			
 
				 }
			
 
				 
			
 
				-void starpu_data_unregister_submit(starpu_data_handle_t handle) {
			
 
				+void starpu_data_unregister_submit(starpu_data_handle_t handle)
			
 
				+{
			
 
				 	_starpu_spin_lock(&handle->header_lock);
			
 
				 	STARPU_ASSERT_MSG(!handle->lazy_unregister, "data must not be unregistered twice");
			
 
				 	handle->lazy_unregister = 1;
			
--- a/src/datawizard/interfaces/matrix_interface.c
+++ b/src/datawizard/interfaces/matrix_interface.c
@@ -48,7 +48,7 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARP
 
				 static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
			
 
				 #endif
			
 
				 
			
 
				-static struct starpu_data_copy_methods matrix_copy_data_methods_s =
			
 
				+static const struct starpu_data_copy_methods matrix_copy_data_methods_s =
			
 
				 {
			
 
				 	.ram_to_ram = copy_ram_to_ram,
			
 
				 #ifdef STARPU_USE_CUDA
			
@@ -516,8 +516,8 @@ static int copy_opencl_common(void *src_interface, unsigned src_node, void *dst_
 
				 
			
 
				 	STARPU_ASSERT_MSG((src_matrix->ld == src_matrix->nx) && (dst_matrix->ld == dst_matrix->nx), "XXX non contiguous buffers are not properly supported in OpenCL yet. (TODO)");
			
 
				 
			
 
				-	ret = starpu_opencl_copy_async_sync(src_matrix->dev_handle, src_node, src_matrix->offset,
			
 
				-					    dst_matrix->dev_handle, dst_node, dst_matrix->offset,
			
 
				+	ret = starpu_opencl_copy_async_sync(src_matrix->dev_handle, src_matrix->offset, src_node,
			
 
				+					    dst_matrix->dev_handle, dst_matrix->offset, dst_node,
			
 
				 					    src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
			
 
				 					    event);
			
 
				 
			
--- a/src/datawizard/interfaces/multiformat_interface.c
+++ b/src/datawizard/interfaces/multiformat_interface.c
@@ -42,7 +42,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 
				 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cl_event *event);
			
 
				 #endif
			
 
				 
			
 
				-static struct starpu_data_copy_methods multiformat_copy_data_methods_s =
			
 
				+static const struct starpu_data_copy_methods multiformat_copy_data_methods_s =
			
 
				 {
			
 
				 	.ram_to_ram = copy_ram_to_ram,
			
 
				 #ifdef STARPU_USE_CUDA
			
--- a/src/datawizard/interfaces/variable_interface.c
+++ b/src/datawizard/interfaces/variable_interface.c