13 lat temu · f4fb7d745d
--- a/ChangeLog
+++ b/ChangeLog
@@ -93,6 +93,18 @@ New features:
 
																   * Introduce new variables STARPU_LIMIT_CUDA_devid_MEM and
															
 
																     STARPU_LIMIT_OPENCL_devid_MEM to limit memory per specific device
															
 
																+Small features:
															
 
																+  * Add starpu_worker_get_by_type and starpu_worker_get_by_devid
															
 
																+  * Add starpu_fxt_stop_profiling/starpu_fxt_start_profiling which permits to
															
 
																+    pause trace recording.
															
 
																+  * Add trace_buffer_size configuration field to permit to specify the tracing
															
 
																+    buffer size.
															
 
																+  * Add starpu_codelet_profile and starpu_codelet_histo_profile, tools which draw
															
 
																+    the profile of a codelet.
															
 
																+  * File STARPU-REVISION --- containing the SVN revision number from which
															
 
																+    StarPU was compiled --- is installed in the share/doc/starpu directory
															
 
																+  * starpu_perfmodel_plot can now directly draw GFlops curves.
															
 
																+
															
 
																 Changes:
															
 
																   * Fix the block filter functions.
															
 
																   * Fix StarPU-MPI on Darwin.
															
@@ -127,15 +139,6 @@ Changes:
 
																   * StarPU can now use poti to generate paje traces.
															
 
																   * Rename scheduling policy "parallel greedy" to "parallel eager"
															
 
																-Small features:
															
 
																-  * Add starpu_worker_get_by_type and starpu_worker_get_by_devid
															
 
																-  * Add starpu_fxt_stop_profiling/starpu_fxt_start_profiling which permits to
															
 
																-  pause trace recording.
															
 
																-  * Add trace_buffer_size configuration field to permit to specify the tracing
															
 
																-  buffer size.
															
 
																-  * Add starpu_codelet_profile and starpu_codelet_histo_profile, tools which draw
															
 
																-  the profile of a codelet.
															
 
																-
															
 
																 Small changes:
															
 
																   * STARPU_NCPU should now be used instead of STARPU_NCPUS. STARPU_NCPUS is
															
 
																 	still available for compatibility reasons.
															
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,7 +1,7 @@
 
																 # StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																 #
															
 
																 # Copyright (C) 2009-2012  Université de Bordeaux 1
															
 
																-# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
															
 
																+# Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
															
 
																 #
															
 
																 # StarPU is free software; you can redistribute it and/or modify
															
 
																 # it under the terms of the GNU Lesser General Public License as published by
															
@@ -91,7 +91,7 @@ all-local:
 
																 	cd starpu-top ; $(QMAKE) ; $(MAKE)
															
 
																 clean-local:
															
 
																 	cd starpu-top ; $(QMAKE) ; $(MAKE) clean ; $(RM) Makefile
															
 
																-	$(RM) starpu_top.1 starpu-top/starpu_top
															
 
																+	$(RM) starpu-top/starpu_top.1 starpu-top/starpu_top
															
 
																 # TODO: resources
															
 
																 install-exec-local:
															
 
																 	$(MKDIR_P) $(DESTDIR)$(bindir)
															
@@ -102,10 +102,10 @@ uninstall-local:
 
																 	$(RM) starpu-top/Makefile
															
 
																 if STARPU_HAVE_HELP2MAN
															
 
																-starpu_top.1: starpu-top/starpu_top$(EXEEXT)
															
 
																+starpu-top/starpu_top.1: starpu-top/starpu_top$(EXEEXT)
															
 
																 	help2man --no-discard-stderr -N --output=$@ starpu-top/starpu_top$(EXEEXT)
															
 
																 dist_man1_MANS =\
															
 
																-	starpu_top.1
															
 
																+	starpu-top/starpu_top.1
															
 
																 endif
															
 
																 endif
															
@@ -114,8 +114,8 @@ txtdir = ${prefix}
 
																 else
															
 
																 txtdir = ${docdir}
															
 
																 endif
															
 
																-txt_DATA = AUTHORS COPYING.LGPL README
															
 
																-EXTRA_DIST = AUTHORS COPYING.LGPL README STARPU-VERSION build-aux/svn2cl.xsl
															
 
																+txt_DATA = AUTHORS COPYING.LGPL README STARPU-REVISION
															
 
																+EXTRA_DIST = AUTHORS COPYING.LGPL README STARPU-VERSION STARPU-REVISION build-aux/svn2cl.xsl
															
 
																 include starpu-top/extradist
															
--- a/configure.ac
+++ b/configure.ac
@@ -81,6 +81,21 @@ AC_C_RESTRICT
 
																 # Check if bash is available
															
 
																 AC_CHECK_PROGS([BASH], [bash])
															
 
																+# Check whether subversion is installed
															
 
																+AC_PATH_PROG(svnversioncommand, svnversion)
															
 
																+
															
 
																+# use svnversion to record the current repository revision only if
															
 
																+# subversion is installed and we are in a working copy
															
 
																+if test "$svnversioncommand" = "" || test `LC_ALL=C $svnversioncommand -n $srcdir` = "exported" ; then
															
 
																+   if test -f $srcdir/STARPU-REVISION ; then
															
 
																+      cp $srcdir/STARPU-REVISION .
															
 
																+   else
															
 
																+      echo "unknown" > ./STARPU-REVISION
															
 
																+   fi
															
 
																+else
															
 
																+   LC_ALL=C svnversion $srcdir > ./STARPU-REVISION
															
 
																+fi
															
 
																+
															
 
																 AC_PATH_PROGS([STARPU_MS_LIB], [lib])
															
 
																 AC_ARG_VAR([STARPU_MS_LIB], [Path to Microsoft's Visual Studio `lib' tool])
															
 
																 AM_CONDITIONAL([STARPU_HAVE_MS_LIB], [test "x$STARPU_MS_LIB" != "x"])
															
@@ -1327,12 +1342,17 @@ AC_DEFUN([IS_SUPPORTED_CFLAG],
 
																 IS_SUPPORTED_CFLAG(-W)
															
 
																 IS_SUPPORTED_CFLAG(-Wall)
															
 
																 IS_SUPPORTED_CFLAG(-Wextra)
															
 
																-AC_SUBST(GLOBAL_AM_CFLAGS)
															
 
																+IS_SUPPORTED_CFLAG(-Werror=implicit)
															
 
																 if test "x$STARPU_DEVEL" != x; then
															
 
																 	AC_DEFINE(STARPU_DEVEL, [1], [enable developer warnings])
															
 
																+	IS_SUPPORTED_CFLAG(-Wunused)
															
 
																+	IS_SUPPORTED_CFLAG(-Wundef)
															
 
																+	IS_SUPPORTED_CFLAG(-Wshadow)
															
 
																 fi
															
 
																+AC_SUBST(GLOBAL_AM_CFLAGS)
															
 
																+
															
 
																 # Same value as Automake's, for use in other places.
															
 
																 pkglibdir="\${libdir}/$PACKAGE"
															
 
																 AC_SUBST([pkglibdir])
															
--- a/doc/Makefile.am
+++ b/doc/Makefile.am
@@ -1,7 +1,7 @@
 
																 # StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																 #
															
 
																 # Copyright (C) 2009, 2011  Université de Bordeaux 1
															
 
																-# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
															
 
																+# Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
															
 
																 #
															
 
																 # Permission is granted to copy, distribute and/or modify this document
															
 
																 # under the terms of the GNU Free Documentation License, Version 1.3
															
@@ -12,7 +12,7 @@
 
																 info_TEXINFOS = starpu.texi
															
 
																-starpu_TEXINFOS = chapters/advanced-api.texi \
															
 
																+chapters =	chapters/advanced-api.texi \
															
 
																 	chapters/benchmarks.texi \
															
 
																 	chapters/configuration.texi \
															
 
																 	chapters/perf-feedback.texi \
															
@@ -35,9 +35,12 @@ starpu_TEXINFOS = chapters/advanced-api.texi \
 
																 	chapters/using.texi \
															
 
																 	chapters/vector_scal_opencl.texi \
															
 
																 	chapters/socl.texi \
															
 
																-	chapters/version.texi \
															
 
																 	chapters/sched_ctx_hypervisor.texi
															
 
																+starpu_TEXINFOS = 		\
															
 
																+	chapters/version.texi 	\
															
 
																+	$(chapters)
															
 
																+
															
 
																 MAINTAINERCLEANFILES = starpu.pdf starpu.html
															
 
																 EXTRA_DIST = starpu.css
															
@@ -50,7 +53,7 @@ AM_MAKEINFOHTMLFLAGS = --css-include=$(top_srcdir)/doc/starpu.css --no-headers -
 
																 uninstall-local:
															
 
																 	$(RM) $(DESTDIR)$(infodir)/dir
															
 
																-chapters/version.texi:
															
 
																+chapters/version.texi: $(chapters)
															
 
																 	@-for f in $(starpu_TEXINFOS) ; do \
															
 
																                 if test -f $(top_srcdir)/doc/$$f ; then stat --format=%Y $(top_srcdir)/doc/$$f 2>/dev/null ; fi \
															
 
																         done | sort -r | head -1 > timestamp
															
--- a/doc/chapters/advanced-api.texi
+++ b/doc/chapters/advanced-api.texi
@@ -38,7 +38,7 @@ The arguments following the codelets can be of the following types:
 
																 @item
															
 
																 the specific values @code{STARPU_VALUE}, @code{STARPU_CALLBACK},
															
 
																 @code{STARPU_CALLBACK_ARG}, @code{STARPU_CALLBACK_WITH_ARG},
															
 
																-@code{STARPU_PRIORITY}, @code{STARPU_TAG}, followed by the appropriated objects
															
 
																+@code{STARPU_PRIORITY}, @code{STARPU_TAG}, @code{STARPU_FLOPS}, followed by the appropriated objects
															
 
																 as defined below.
															
 
																 @end itemize
															
@@ -85,6 +85,12 @@ this macro is used when calling @code{starpu_insert_task}, and must be
 
																 followed by a tag.
															
 
																 @end defmac
															
 
																+@defmac STARPU_FLOPS
															
 
																+this macro is used when calling @code{starpu_insert_task}, and must be followed
															
 
																+by an amount of floating point operations, as a double. The user may have to
															
 
																+explicitly cast into double, otherwise parameter passing will not work.
															
 
																+@end defmac
															
 
																+
															
 
																 @deftypefun void starpu_codelet_pack_args ({char **}@var{arg_buffer}, {size_t *}@var{arg_buffer_size}, ...)
															
 
																 Pack arguments of type @code{STARPU_VALUE} into a buffer which can be
															
 
																 given to a codelet and later unpacked with the function
															
@@ -165,24 +171,6 @@ to the world size. Communications statistics must be enabled
 
																 @node Communication
															
 
																 @subsection Communication
															
 
																-The standard point to point communications of MPI have been
															
 
																-implemented. The semantic is similar to the MPI one, but adapted to
															
 
																-the DSM provided by StarPU. A MPI request will only be submitted when
															
 
																-the data is available in the main memory of the node submitting the
															
 
																-request.
															
 
																-
															
 
																-There is two types of asynchronous communications: the classic
															
 
																-asynchronous communications and the detached communications. The
															
 
																-classic asynchronous communications (@code{starpu_mpi_isend} and
															
 
																-@code{starpu_mpi_irecv}) need to be followed by a call to
															
 
																-@code{starpu_mpi_wait} or to @code{starpu_mpi_test} to wait for or to
															
 
																-test the completion of the communication. Waiting for or testing the
															
 
																-completion of detached communications is not possible, this is done
															
 
																-internally by StarPU-MPI, on completion, the resources are
															
 
																-automatically released. This mechanism is similar to the pthread
															
 
																-detach state attribute which determines whether a thread will be
															
 
																-created in a joinable or a detached state.
															
 
																-
															
 
																 @deftypefun int starpu_mpi_send (starpu_data_handle_t @var{data_handle}, int @var{dest}, int @var{mpi_tag}, MPI_Comm @var{comm})
															
 
																 Performs a standard-mode, blocking send of @var{data_handle} to the
															
 
																 node @var{dest} using the message tag @code{mpi_tag} within the
															
@@ -354,51 +342,56 @@ Unpack the data handle from the contiguous buffer at the address @code{ptr} of s
 
																 @end deftp
															
 
																 @deftp {Data Type} {struct starpu_data_copy_methods}
															
 
																-Defines the per-interface methods.
															
 
																+Defines the per-interface methods. If the @code{any_to_any} method is provided,
															
 
																+it will be used by default if no more specific method is provided. It can still
															
 
																+be useful to provide more specific method in case of e.g. available particular
															
 
																+CUDA or OpenCL support.
															
 
																+
															
 
																 @table @asis
															
 
																-@item @code{int @{ram,cuda,opencl@}_to_@{ram,cuda,opencl@}(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)}
															
 
																+@item @code{int (*@{ram,cuda,opencl@}_to_@{ram,cuda,opencl@})(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)}
															
 
																 These 12 functions define how to copy data from the @var{src_interface}
															
 
																 interface on the @var{src_node} node to the @var{dst_interface} interface
															
 
																 on the @var{dst_node} node. They return 0 on success.
															
 
																-@item @code{int (*ram_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)}
															
 
																-Define how to copy data from the @var{src_interface} interface on the
															
 
																-@var{src_node} node (in RAM) to the @var{dst_interface} interface on the
															
 
																-@var{dst_node} node (on a CUDA device), using the given @var{stream}. Return 0
															
 
																-on success.
															
 
																-
															
 
																-@item @code{int (*cuda_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)}
															
 
																-Define how to copy data from the @var{src_interface} interface on the
															
 
																-@var{src_node} node (on a CUDA device) to the @var{dst_interface} interface on the
															
 
																-@var{dst_node} node (in RAM), using the given @var{stream}. Return 0
															
 
																-on success.
															
 
																-
															
 
																-@item @code{int (*cuda_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)}
															
 
																+@item @code{int (*@{ram,cuda@}_to_@{ram,cuda@}_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)}
															
 
																+These 3 functions (@code{ram_to_ram} is not among these) define how to copy
															
 
																+data from the @var{src_interface} interface on the @var{src_node} node to the
															
 
																+@var{dst_interface} interface on the @var{dst_node} node, using the given
															
 
																+@var{stream}. Must return 0 if the transfer was actually completed completely
															
 
																+synchronously, or -EAGAIN if at least some transfers are still ongoing and
															
 
																+should be awaited for by the core.
															
 
																+
															
 
																+@item @code{int (*@{ram,opencl@}_to_@{ram,opencl@}_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, /* cl_event * */ void *event)}
															
 
																+These 3 functions (@code{ram_to_ram} is not among them) define how to copy
															
 
																+data from the @var{src_interface} interface on the @var{src_node} node to the
															
 
																+@var{dst_interface} interface on the @var{dst_node} node, by recording in
															
 
																+@var{event}, a pointer to a cl_event, the event of the last submitted transfer.
															
 
																+Must return 0 if the transfer was actually completed completely synchronously,
															
 
																+or -EAGAIN if at least some transfers are still ongoing and should be awaited
															
 
																+for by the core.
															
 
																+
															
 
																+@item @code{int (*any_to_any)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data)}
															
 
																 Define how to copy data from the @var{src_interface} interface on the
															
 
																-@var{src_node} node (on a CUDA device) to the @var{dst_interface} interface on
															
 
																-the @var{dst_node} node (on another CUDA device), using the given @var{stream}.
															
 
																-Return 0 on success.
															
 
																+@var{src_node} node to the @var{dst_interface} interface on the @var{dst_node}
															
 
																+node. This is meant to be implemented through the @var{starpu_interface_copy}
															
 
																+helper, to which @var{async_data} should be passed as such, and will be used to
															
 
																+manage asynchronicity. This must return -EAGAIN if any of the
															
 
																+@var{starpu_interface_copy} calls has returned -EAGAIN (i.e. at least some
															
 
																+transfer is still ongoing), and return 0 otherwise.
															
 
																-@item @code{int (*ram_to_opencl_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, /* cl_event * */ void *event)}
															
 
																-Define how to copy data from the @var{src_interface} interface on the
															
 
																-@var{src_node} node (in RAM) to the @var{dst_interface} interface on the
															
 
																-@var{dst_node} node (on an OpenCL device), using @var{event}, a pointer to a
															
 
																-cl_event. Return 0 on success.
															
 
																-
															
 
																-@item @code{int (*opencl_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, /* cl_event * */ void *event)}
															
 
																-Define how to copy data from the @var{src_interface} interface on the
															
 
																-@var{src_node} node (on an OpenCL device) to the @var{dst_interface} interface
															
 
																-on the @var{dst_node} node (in RAM), using the given @var{event}, a pointer to
															
 
																-a cl_event. Return 0 on success.
															
 
																-
															
 
																-@item @code{int (*opencl_to_opencl_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, /* cl_event * */ void *event)}
															
 
																-Define how to copy data from the @var{src_interface} interface on the
															
 
																-@var{src_node} node (on an OpenCL device) to the @var{dst_interface} interface
															
 
																-on the @var{dst_node} node (on another OpenCL device), using the given
															
 
																-@var{event}, a pointer to a cl_event. Return 0 on success.
															
 
																 @end table
															
 
																 @end deftp
															
 
																+@deftypefun int starpu_interface_copy (uintptr_t @var{src}, size_t @var{src_offset}, unsigned @var{src_node}, uintptr_t @var{dst}, size_t @var{dst_offset}, unsigned @var{dst_node}, size_t @var{size}, {void *}@var{async_data})
															
 
																+Copy @var{size} bytes from byte offset @var{src_offset} of @var{src} on
															
 
																+@var{src_node} to byte offset @var{dst_offset} of @var{dst} on @var{dst_node}.
															
 
																+This is to be used in the @var{any_to_any} copy method, which is provided with
															
 
																+the @var{async_data} to be pased to @var{starpu_interface_copy}. this returns
															
 
																+-EAGAIN if the transfer is still ongoing, or 0 if the transfer is already
															
 
																+completed.
															
 
																+@end deftypefun
															
 
																+
															
 
																+
															
 
																 @deftypefun uint32_t starpu_crc32_be_n ({void *}@var{input}, size_t @var{n}, uint32_t @var{inputcrc})
															
 
																 Compute the CRC of a byte buffer seeded by the inputcrc "current
															
 
																 state". The return value should be considered as the new "current
															
@@ -457,7 +450,7 @@ void starpu_complex_data_register(starpu_data_handle_t *handle,
 
																                 .nx = nx
															
 
																         @};
															
 
																-        if (interface_complex_ops.interfaceid == -1)
															
 
																+        if (interface_complex_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)
															
 
																         @{
															
 
																                 interface_complex_ops.interfaceid = starpu_data_interface_get_next_id();
															
 
																         @}
															
@@ -483,7 +476,7 @@ static struct starpu_data_interface_ops interface_complex_ops =
 
																         .copy_methods = &complex_copy_methods,
															
 
																         .get_size = complex_get_size,
															
 
																         .footprint = complex_footprint,
															
 
																-        .interfaceid = -1,
															
 
																+        .interfaceid = STARPU_UNKNOWN_INTERFACE_ID,
															
 
																         .interface_size = sizeof(struct starpu_complex_interface),
															
 
																 @};
															
 
																 @end smallexample
															
@@ -837,7 +830,7 @@ The number of workerids
 
																 @item @code{pthread_key_t cursor_key} (optional)
															
 
																 The cursor needed to iterate the collection (depending on the data structure)
															
 
																 @item @code{int type}
															
 
																-The type of structure (currently STARPU_WORKER_LIST is the only one available)
															
 
																+The type of structure (currently STARPU_SCHED_CTX_WORKER_LIST is the only one available)
															
 
																 @item @code{unsigned (*has_next)(struct starpu_sched_ctx_worker_collection *workers)}
															
 
																 Checks if there is a next worker
															
 
																 @item @code{int (*get_next)(struct starpu_sched_ctx_worker_collection *workers)}
															
@@ -870,15 +863,15 @@ Delete the worker collection of the specified scheduling context
 
																 Return the worker collection managed by the indicated context
															
 
																 @end deftypefun
															
 
																-@deftypefun pthread_mutex_t* starpu_get_changing_ctx_mutex (unsigned @var{sched_ctx_id})
															
 
																+@deftypefun pthread_mutex_t* starpu_sched_ctx_get_changing_ctx_mutex (unsigned @var{sched_ctx_id})
															
 
																 TODO
															
 
																 @end deftypefun
															
 
																-@deftypefun void starpu_task_set_context (unsigned *@var{sched_ctx_id})
															
 
																+@deftypefun void starpu_sched_ctx_set_context (unsigned *@var{sched_ctx_id})
															
 
																 Set the scheduling context the subsequent tasks will be submitted to
															
 
																 @end deftypefun
															
 
																-@deftypefun unsigned starpu_task_get_context (void)
															
 
																+@deftypefun unsigned starpu_sched_ctx_get_context (void)
															
 
																 Return the scheduling context the tasks are currently submitted to
															
 
																 @end deftypefun
															
--- a/doc/chapters/basic-api.texi
+++ b/doc/chapters/basic-api.texi
@@ -1849,6 +1849,11 @@ A pointer to the next task. This should only be used by StarPU.
 
																 This is only used for tasks that use multiformat handle. This should only be
															
 
																 used by StarPU.
															
 
																+@item @code{double flops}
															
 
																+This can be set to the number of floating points operations that the task
															
 
																+will have to achieve. This is useful for easily getting GFlops curves from
															
 
																+@code{starpu_perfmodel_plot}, and for the hypervisor load balancing.
															
 
																+
															
 
																 @item @code{void *starpu_private}
															
 
																 This is private to StarPU, do not modify. If the task is allocated by hand
															
 
																 (without starpu_task_create), this field should be set to NULL.
															
@@ -1857,6 +1862,7 @@ This is private to StarPU, do not modify. If the task is allocated by hand
 
																 This field is set when initializing a task. It prevents a task from being
															
 
																 submitted if it has not been properly initialized.
															
 
																 @end table
															
 
																+
															
 
																 @end deftp
															
 
																 @deftypefun void starpu_task_init ({struct starpu_task} *@var{task})
															
@@ -1939,6 +1945,18 @@ This function blocks until all the tasks that were submitted are terminated. It
 
																 does not destroy these tasks.
															
 
																 @end deftypefun
															
 
																+@deftypefun int starpu_task_nready (void)
															
 
																+@end deftypefun
															
 
																+
															
 
																+@deftypefun int starpu_task_nsubmitted (void)
															
 
																+Return the number of submitted tasks which have not completed yet.
															
 
																+@end deftypefun
															
 
																+
															
 
																+@deftypefun int starpu_task_nready (void)
															
 
																+Return the number of submitted tasks which are ready for execution are already
															
 
																+executing. It thus does not include tasks waiting for dependencies.
															
 
																+@end deftypefun
															
 
																+
															
 
																 @deftypefun {struct starpu_task *} starpu_task_get_current (void)
															
 
																 This function returns the task currently executed by the worker, or
															
 
																 NULL if it is called either from a thread that is not a task or simply
															
@@ -2489,10 +2507,6 @@ This function returns a pointer to device properties for worker @var{workerid}
 
																 (assumed to be a CUDA worker).
															
 
																 @end deftypefun
															
 
																-@deftypefun size_t starpu_cuda_get_global_mem_size (unsigned @var{devid})
															
 
																-Return the size of the global memory of CUDA device @var{devid}.
															
 
																-@end deftypefun
															
 
																-
															
 
																 @deftypefun void starpu_cuda_report_error ({const char *}@var{func}, {const char *}@var{file}, int @var{line}, cudaError_t @var{status})
															
 
																 Report a CUDA error.
															
 
																 @end deftypefun
															
@@ -2560,10 +2574,6 @@ OpenCL as shown in @ref{Full source code for the 'Scaling a Vector' example}.
 
																 @node Writing OpenCL kernels
															
 
																 @subsection Writing OpenCL kernels
															
 
																-@deftypefun size_t starpu_opencl_get_global_mem_size (int @var{devid})
															
 
																-Return the size of global device memory in bytes.
															
 
																-@end deftypefun
															
 
																-
															
 
																 @deftypefun void starpu_opencl_get_context (int @var{devid}, {cl_context *}@var{context})
															
 
																 Places the OpenCL context of the device designated by @var{devid} into @var{context}.
															
 
																 @end deftypefun
															
@@ -2780,7 +2790,7 @@ otherwise. The integer pointed to by @var{ret} is set to -EAGAIN if the asynchro
 
																 was successful, or to 0 if event was NULL.
															
 
																 @end deftypefun
															
 
																-@deftypefun cl_int starpu_opencl_copy_async_sync (cl_mem @var{src}, unsigned @var{src_node}, size_t @var{src_offset}, cl_mem @var{dst}, unsigned @var{dst_node}, size_t @var{dst_offset}, size_t @var{size}, {cl_event *}@var{event})
															
 
																+@deftypefun cl_int starpu_opencl_copy_async_sync (uintptr_t @var{src}, size_t @var{src_offset}, unsigned @var{src_node}, uintptr_t @var{dst}, size_t @var{dst_offset}, unsigned @var{dst_node}, size_t @var{size}, {cl_event *}@var{event})
															
 
																 Copy @var{size} bytes from byte offset @var{src_offset} of
															
 
																 @var{src} on @var{src_node} to byte offset @var{dst_offset} of @var{dst} on
															
 
																 @var{dst_node}. if @var{event} is NULL, the copy is synchronous, i.e the queue is
															
--- a/doc/chapters/configuration.texi
+++ b/doc/chapters/configuration.texi
@@ -417,8 +417,20 @@ the @code{starpu_conf} structure passed to @code{starpu_init} is set.
 
																 @defvr {Environment variable} @code{STARPU_SINGLE_COMBINED_WORKER}
															
 
																 If set, StarPU will create several workers which won't be able to work
															
 
																-concurrently. It will create combined workers which size goes from 1 to the
															
 
																-total number of CPU workers in the system.
															
 
																+concurrently. It will by default create combined workers which size goes from 1
															
 
																+to the total number of CPU workers in the system. @code{STARPU_MIN_WORKERSIZE}
															
 
																+and @code{STARPU_MAX_WORKERSIZE} can be used to change this default.
															
 
																+@end defvr
															
 
																+
															
 
																+@defvr {Environment variable} @code{STARPU_MIN_WORKERSIZE}
															
 
																+When @code{STARPU_SINGLE_COMBINED_WORKER} is set, @code{STARPU_MIN_WORKERSIZE}
															
 
																+permits to specify the minimum size of the combined workers (instead of the default 1)
															
 
																+@end defvr
															
 
																+
															
 
																+@defvr {Environment variable} @code{STARPU_MAX_WORKERSIZE}
															
 
																+When @code{STARPU_SINGLE_COMBINED_WORKER} is set, @code{STARPU_MAX_WORKERSIZE}
															
 
																+permits to specify the minimum size of the combined workers (instead of the
															
 
																+number of CPU workers in the system)
															
 
																 @end defvr
															
 
																 @defvr {Environment variable} STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER
															
--- a/doc/chapters/mpi-support.texi
+++ b/doc/chapters/mpi-support.texi
@@ -21,6 +21,7 @@ according to the task graph and an application-provided distribution.
 
																 @menu
															
 
																 * Simple Example::
															
 
																+* Point to point communication::
															
 
																 * Exchanging User Defined Data Interface::
															
 
																 * MPI Insert Task Utility::
															
 
																 * MPI Collective Operations::
															
@@ -120,7 +121,49 @@ int main(int argc, char **argv)
 
																 @end smallexample
															
 
																 @end cartouche
															
 
																-@page
															
 
																+@node Point to point communication
															
 
																+@section Point to point communication
															
 
																+
															
 
																+The standard point to point communications of MPI have been
															
 
																+implemented. The semantic is similar to the MPI one, but adapted to
															
 
																+the DSM provided by StarPU. A MPI request will only be submitted when
															
 
																+the data is available in the main memory of the node submitting the
															
 
																+request.
															
 
																+
															
 
																+There is two types of asynchronous communications: the classic
															
 
																+asynchronous communications and the detached communications. The
															
 
																+classic asynchronous communications (@code{starpu_mpi_isend} and
															
 
																+@code{starpu_mpi_irecv}) need to be followed by a call to
															
 
																+@code{starpu_mpi_wait} or to @code{starpu_mpi_test} to wait for or to
															
 
																+test the completion of the communication. Waiting for or testing the
															
 
																+completion of detached communications is not possible, this is done
															
 
																+internally by StarPU-MPI, on completion, the resources are
															
 
																+automatically released. This mechanism is similar to the pthread
															
 
																+detach state attribute which determines whether a thread will be
															
 
																+created in a joinable or a detached state.
															
 
																+
															
 
																+For any communication, the call of the function will result in the
															
 
																+creation of a StarPU-MPI request, the function
															
 
																+@code{starpu_data_acquire_cb} is then called to asynchronously request
															
 
																+StarPU to fetch the data in main memory; when the data is available in
															
 
																+main memory, a StarPU-MPI function is called to put the new request in
															
 
																+the list of the ready requests.
															
 
																+
															
 
																+The StarPU-MPI progression thread regularly polls this list of ready
															
 
																+requests. For each new ready request, the appropriate function is
															
 
																+called to post the corresponding MPI call. For example, calling
															
 
																+@code{starpu_mpi_isend} will result in posting @code{MPI_Isend}. If
															
 
																+the request is marked as detached, the request will be put in the list
															
 
																+of detached requests.
															
 
																+
															
 
																+The StarPU-MPI progression thread also polls the list of detached
															
 
																+requests. For each detached request, it regularly tests the completion
															
 
																+of the MPI request by calling @code{MPI_Test}. On completion, the data
															
 
																+handle is released, and if a callback was defined, it is called.
															
 
																+
															
 
																+@ref{Communication} gives the list of all the point to point
															
 
																+communications defined in StarPU-MPI.
															
 
																+
															
 
																 @node Exchanging User Defined Data Interface
															
 
																 @section Exchanging User Defined Data Interface
															
--- a/doc/chapters/perf-feedback.texi
+++ b/doc/chapters/perf-feedback.texi
@@ -411,7 +411,7 @@ display the regression formula, and in the case of non-linear regression, the
 
																 same performance log as for history-based performance models:
															
 
																 @example
															
 
																-$ starpu_perfmodel_display -s non_linear_memset_regression_based.type
															
 
																+$ starpu_perfmodel_display -s non_linear_memset_regression_based
															
 
																 performance model for cpu_impl_0
															
 
																 	Regression : #sample = 1400
															
 
																 	Linear: y = alpha size ^ beta
															
@@ -429,15 +429,25 @@ a3d3725e	4096           	4.763200e+00   	7.650928e-01   	100
 
																 ...
															
 
																 @end example
															
 
																-The @code{starpu_perfmodel_plot} tool can be used to draw performance models.
															
 
																-It writes a @code{.gp} file in the current directory, to be run in the
															
 
																-@code{gnuplot} tool, which shows the corresponding curve.
															
 
																-
															
 
																 The same can also be achieved by using StarPU's library API, see
															
 
																 @ref{Performance Model API} and notably the @code{starpu_perfmodel_load_symbol}
															
 
																 function. The source code of the @code{starpu_perfmodel_display} tool can be a
															
 
																 useful example.
															
 
																+The @code{starpu_perfmodel_plot} tool can be used to draw performance models.
															
 
																+It writes a @code{.gp} file in the current directory, to be run in the
															
 
																+@code{gnuplot} tool, which shows the corresponding curve.
															
 
																+
															
 
																+When the @code{flops} field of tasks is set, @code{starpu_perfmodel_plot} can
															
 
																+directly draw a GFlops curve, by simply adding the @code{-f} option:
															
 
																+
															
 
																+@example
															
 
																+$ starpu_perfmodel_display -f -s chol_model_11
															
 
																+@end example
															
 
																+
															
 
																+This will however disable displaying the regression model, for which we can not
															
 
																+compute GFlops.
															
 
																+
															
 
																 When the FxT trace file @code{filename} has been generated, it is possible to
															
 
																 get a profiling of each codelet by calling:
															
 
																 @example
															
@@ -453,10 +463,10 @@ This is also available in the @code{starpu_perfmodel_plot} tool, by passing it
 
																 the fxt trace:
															
 
																 @example
															
 
																-$ starpu_perfmodel_display -s non_linear_memset_regression_based.type -i /tmp/prof_file_foo_0
															
 
																+$ starpu_perfmodel_plot -s non_linear_memset_regression_based -i /tmp/prof_file_foo_0
															
 
																 @end example
															
 
																-It willd produce a @code{.gp} file which contains both the performance model
															
 
																+It will produce a @code{.gp} file which contains both the performance model
															
 
																 curves, and the profiling measurements.
															
 
																 If you have the R statistical tool installed, you can additionally use
															
--- a/doc/chapters/perf-optimization.texi
+++ b/doc/chapters/perf-optimization.texi
@@ -228,7 +228,7 @@ int workerids[3] = @{1, 3, 10@};
 
																 int id_ctx = starpu_sched_ctx_create("heft", workerids, 3, "my_ctx");
															
 
																 /* @b{let StarPU know that the folowing tasks will be submitted to this context} */
															
 
																-starpu_task_set_context(id);
															
 
																+starpu_sched_ctx_set_task_context(id);
															
 
																 /* @b{submit the task to StarPU} */
															
 
																 starpu_task_submit(task);
															
@@ -548,6 +548,11 @@ The number of devices can be chosen as usual with @code{STARPU_NCPU},
 
																 cpus can be arbitrarily chosen. The number of CUDA and OpenCL devices have to be
															
 
																 lower than the real number on the current machine.
															
 
																+The amount of simulated GPU memory is for now unbound by default, but
															
 
																+it can be chosen by hand through the @code{STARPU_LIMIT_CUDA_MEM},
															
 
																+@code{STARPU_LIMIT_CUDA_devid_MEM}, @code{STARPU_LIMIT_OPENCL_MEM}, and
															
 
																+@code{STARPU_LIMIT_OPENCL_devid_MEM} environment variables.
															
 
																+
															
 
																 The Simgrid default stack size is small; to increase it use the
															
 
																 parameter @code{--cfg=contexts/stack_size}, for example:
															
--- a/doc/chapters/sched_ctx_hypervisor.texi
+++ b/doc/chapters/sched_ctx_hypervisor.texi
@@ -27,7 +27,7 @@ Basic strategies of resizing scheduling contexts already exist but a platform fo
 
																 @section Managing the hypervisor
															
 
																 There is a single hypervisor that is in charge of resizing contexts and the resizing strategy is chosen at the initialization of the hypervisor. A single resize can be done at a time.
															
 
																-@deftypefun {struct starpu_performance_counters *} sched_ctx_hypervisor_init ({struct sched_ctx_hypervisor_policy *} @var{policy})
															
 
																+@deftypefun {struct starpu_sched_ctx_performance_counters *} sched_ctx_hypervisor_init ({struct sched_ctx_hypervisor_policy *} @var{policy})
															
 
																 Initializes the hypervisor to use the strategy provided as parameter and creates the performance counters (see @pxref{Performance Counters}).
															
 
																 These performance counters represent actually some callbacks that will be used by the contexts to notify the information needed by the hypervisor.
															
 
																 @end deftypefun
															
@@ -200,7 +200,7 @@ or
 
																 @smallexample
															
 
																 starpu_insert_task(&codelet,
															
 
																                     ...,
															
 
																-                    STARPU_FLOPS, 100,
															
 
																+                    STARPU_FLOPS, (double) 100,
															
 
																                     0);
															
 
																 @end smallexample
															
 
																 @end cartouche
															
@@ -210,8 +210,8 @@ starpu_insert_task(&codelet,
 
																 The Scheduling Context Hypervisor Plugin provides a series of performance counters to StarPU. By incrementing them, StarPU can help the hypervisor in the resizing decision making process.
															
 
																-@deftp {Data Type} {struct starpu_performance_counters}
															
 
																-@anchor{struct starpu_performance_counters}
															
 
																+@deftp {Data Type} {struct starpu_sched_ctx_performance_counters}
															
 
																+@anchor{struct starpu_sched_ctx_performance_counters}
															
 
																 @table @asis
															
 
																 @item @code{void (*notify_idle_cycle)(unsigned sched_ctx_id, int worker, double idle_time)}
															
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -16,7 +16,7 @@
 
																 #
															
 
																 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																-AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) -Werror=implicit
															
 
																+AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
															
 
																 AM_CXXFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
															
 
																 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(MAGMA_LIBS) $(HWLOC_LIBS) @LIBS@
															
 
																 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
															
--- a/examples/basic_examples/block_cpu.c
+++ b/examples/basic_examples/block_cpu.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -26,7 +26,7 @@ void cpu_codelet(void *descr[], void *_args)
 
																         unsigned ldy = STARPU_BLOCK_GET_LDY(descr[0]);
															
 
																         unsigned ldz = STARPU_BLOCK_GET_LDZ(descr[0]);
															
 
																         float *multiplier = (float *)_args;
															
 
																-        unsigned i, j, k;
															
 
																+        int i, j, k;
															
 
																         for(k=0; k<nz ; k++)
															
 
																 	{
															
--- a/examples/cholesky/cholesky.h
+++ b/examples/cholesky/cholesky.h
@@ -55,6 +55,64 @@
 
																 #define BLAS3_FLOP(n1,n2,n3)    \
															
 
																         (2*((uint64_t)n1)*((uint64_t)n2)*((uint64_t)n3))
															
 
																+/* This is from magma
															
 
																+
															
 
																+  -- Innovative Computing Laboratory
															
 
																+  -- Electrical Engineering and Computer Science Department
															
 
																+  -- University of Tennessee
															
 
																+  -- (C) Copyright 2009
															
 
																+
															
 
																+  Redistribution  and  use  in  source and binary forms, with or without
															
 
																+  modification,  are  permitted  provided  that the following conditions
															
 
																+  are met:
															
 
																+
															
 
																+  * Redistributions  of  source  code  must  retain  the above copyright
															
 
																+    notice,  this  list  of  conditions  and  the  following  disclaimer.
															
 
																+  * Redistributions  in  binary  form must reproduce the above copyright
															
 
																+    notice,  this list of conditions and the following disclaimer in the
															
 
																+    documentation  and/or other materials provided with the distribution.
															
 
																+  * Neither  the  name of the University of Tennessee, Knoxville nor the
															
 
																+    names of its contributors may be used to endorse or promote products
															
 
																+    derived from this software without specific prior written permission.
															
 
																+
															
 
																+  THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
															
 
																+  ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
															
 
																+  LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
															
 
																+  A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
															
 
																+  HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
															
 
																+  SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
															
 
																+  LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
															
 
																+  DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
															
 
																+  THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
															
 
																+  (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
															
 
																+  OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
															
 
																+
															
 
																+  */
															
 
																+
															
 
																+#define FMULS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n) + 0.5) * (double)(__n) + (1. / 3.)))
															
 
																+#define FADDS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n)      ) * (double)(__n) - (1. / 6.)))
															
 
																+
															
 
																+#define FLOPS_SPOTRF(__n) (     FMULS_POTRF((__n)) +       FADDS_POTRF((__n)) )
															
 
																+
															
 
																+#define FMULS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)+1.))
															
 
																+#define FADDS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)-1.))
															
 
																+
															
 
																+#define FMULS_TRMM(__m, __n) ( /*( (__side) == PlasmaLeft ) ? FMULS_TRMM_2((__m), (__n)) :*/ FMULS_TRMM_2((__n), (__m)) )
															
 
																+#define FADDS_TRMM(__m, __n) ( /*( (__side) == PlasmaLeft ) ? FADDS_TRMM_2((__m), (__n)) :*/ FADDS_TRMM_2((__n), (__m)) )
															
 
																+
															
 
																+#define FMULS_TRSM FMULS_TRMM
															
 
																+#define FADDS_TRSM FMULS_TRMM
															
 
																+
															
 
																+#define FLOPS_STRSM(__m, __n) (     FMULS_TRSM((__m), (__n)) +       FADDS_TRSM((__m), (__n)) )
															
 
																+
															
 
																+
															
 
																+#define FMULS_GEMM(__m, __n, __k) ((double)(__m) * (double)(__n) * (double)(__k))
															
 
																+#define FADDS_GEMM(__m, __n, __k) ((double)(__m) * (double)(__n) * (double)(__k))
															
 
																+
															
 
																+#define FLOPS_SGEMM(__m, __n, __k) (     FMULS_GEMM((__m), (__n), (__k)) +       FADDS_GEMM((__m), (__n), (__k)) )
															
 
																+
															
 
																+/* End of magma code */
															
 
																+
															
 
																 static unsigned size = 4*1024;
															
 
																 static unsigned nblocks = 16;
															
 
																 static unsigned nbigblocks = 8;
															
--- a/examples/cholesky/cholesky_grain_tag.c
+++ b/examples/cholesky/cholesky_grain_tag.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009-2012  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2009-2013  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
															
 
																  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
															
 
																  *
															
@@ -68,6 +68,9 @@ static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned
 
																 		starpu_tag_declare_deps(TAG11_AUX(k, reclevel), 1, TAG22_AUX(k-1, k, k, reclevel));
															
 
																 	}
															
 
																+	int n = starpu_matrix_get_nx(task->handles[0]);
															
 
																+	task->flops = FLOPS_SPOTRF(n);
															
 
																+
															
 
																 	return task;
															
 
																 }
															
@@ -110,6 +113,9 @@ static int create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j, un
 
																 		starpu_tag_declare_deps(TAG21_AUX(k, j, reclevel), 1, TAG11_AUX(k, reclevel));
															
 
																 	}
															
 
																+	int n = starpu_matrix_get_nx(task->handles[0]);
															
 
																+	task->flops = FLOPS_STRSM(n, n);
															
 
																+
															
 
																 	ret = starpu_task_submit(task);
															
 
																 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
															
 
																 	return ret;
															
@@ -157,6 +163,9 @@ static int create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, un
 
																 		starpu_tag_declare_deps(TAG22_AUX(k, i, j, reclevel), 2, TAG21_AUX(k, i, reclevel), TAG21_AUX(k, j, reclevel));
															
 
																 	}
															
 
																+	int n = starpu_matrix_get_nx(task->handles[0]);
															
 
																+	task->flops = FLOPS_SGEMM(n, n, n);
															
 
																+
															
 
																 	ret = starpu_task_submit(task);
															
 
																 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
															
 
																 	return ret;
															
--- a/examples/cholesky/cholesky_implicit.c
+++ b/examples/cholesky/cholesky_implicit.c
@@ -85,6 +85,8 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
																 	double end;
															
 
																 	unsigned i,j,k;
															
 
																+	unsigned long n = starpu_matrix_get_nx(dataA);
															
 
																+	unsigned long nn = n/nblocks;
															
 
																 	int prio_level = noprio?STARPU_DEFAULT_PRIO:STARPU_MAX_PRIO;
															
@@ -101,6 +103,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
																 					 STARPU_PRIORITY, prio_level,
															
 
																 					 STARPU_RW, sdatakk,
															
 
																 					 STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
															
 
																+					 STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
															
 
																 					 0);
															
 
																 		if (ret == -ENODEV) return 77;
															
 
																 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
															
@@ -113,6 +116,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
																 						 STARPU_PRIORITY, (j == k+1)?prio_level:STARPU_DEFAULT_PRIO,
															
 
																 						 STARPU_R, sdatakk,
															
 
																 						 STARPU_RW, sdatakj,
															
 
																+						 STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
															
 
																 						 0);
															
 
																 			if (ret == -ENODEV) return 77;
															
 
																 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
															
@@ -129,6 +133,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
																 								 STARPU_R, sdataki,
															
 
																 								 STARPU_R, sdatakj,
															
 
																 								 STARPU_RW, sdataij,
															
 
																+								 STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
															
 
																 								 0);
															
 
																 					if (ret == -ENODEV) return 77;
															
 
																 					STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
															
@@ -144,9 +149,8 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
																 	end = starpu_timing_now();
															
 
																 	double timing = end - start;
															
 
																-	unsigned long n = starpu_matrix_get_nx(dataA);
															
 
																-	double flop = (1.0f*n*n*n)/3.0f;
															
 
																+	double flop = FLOPS_SPOTRF(n);
															
 
																 	if(with_ctxs || with_noctxs || chole1 || chole2)
															
 
																 		update_sched_ctx_timing_results((flop/timing/1000.0f), (timing/1000000.0f));
															
--- a/examples/cholesky/cholesky_tag.c
+++ b/examples/cholesky/cholesky_tag.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009-2012  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2009-2013  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
															
 
																  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
															
 
																  *
															
@@ -69,6 +69,9 @@ static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned
 
																 		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
															
 
																 	}
															
 
																+	int n = starpu_matrix_get_nx(task->handles[0]);
															
 
																+	task->flops = FLOPS_SPOTRF(n);
															
 
																+
															
 
																 	return task;
															
 
																 }
															
@@ -109,6 +112,9 @@ static void create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j)
 
																 		starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k));
															
 
																 	}
															
 
																+	int n = starpu_matrix_get_nx(task->handles[0]);
															
 
																+	task->flops = FLOPS_STRSM(n, n);
															
 
																+
															
 
																 	int ret = starpu_task_submit(task);
															
 
																         if (STARPU_UNLIKELY(ret == -ENODEV))
															
 
																 	{
															
@@ -158,6 +164,9 @@ static void create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, u
 
																 		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
															
 
																 	}
															
 
																+	int n = starpu_matrix_get_nx(task->handles[0]);
															
 
																+	task->flops = FLOPS_SGEMM(n, n, n);
															
 
																+
															
 
																 	int ret = starpu_task_submit(task);
															
 
																         if (STARPU_UNLIKELY(ret == -ENODEV))
															
 
																 	{
															
--- a/examples/cholesky/cholesky_tile_tag.c
+++ b/examples/cholesky/cholesky_tile_tag.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009-2012  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2009-2013  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -71,6 +71,9 @@ static struct starpu_task * create_task_11(unsigned k, unsigned nblocks)
 
																 		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
															
 
																 	}
															
 
																+	int n = starpu_matrix_get_nx(task->handles[0]);
															
 
																+	task->flops = FLOPS_SPOTRF(n);
															
 
																+
															
 
																 	return task;
															
 
																 }
															
@@ -113,6 +116,9 @@ static int create_task_21(unsigned k, unsigned j)
 
																 		starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k));
															
 
																 	}
															
 
																+	int n = starpu_matrix_get_nx(task->handles[0]);
															
 
																+	task->flops = FLOPS_STRSM(n, n);
															
 
																+
															
 
																 	ret = starpu_task_submit(task);
															
 
																 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
															
 
																 	return ret;
															
@@ -160,6 +166,9 @@ static int create_task_22(unsigned k, unsigned i, unsigned j)
 
																 		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
															
 
																 	}
															
 
																+	int n = starpu_matrix_get_nx(task->handles[0]);
															
 
																+	task->flops = FLOPS_SGEMM(n, n, n);
															
 
																+
															
 
																 	ret = starpu_task_submit(task);
															
 
																 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
															
 
																 	return ret;
															
--- a/examples/filters/custom_mf/custom_conversion_codelets.c
+++ b/examples/filters/custom_mf/custom_conversion_codelets.c
@@ -21,7 +21,7 @@
 
																 #ifdef STARPU_USE_CUDA
															
 
																 void cuda_to_cpu(void *buffers[], void *arg)
															
 
																 {
															
 
																-	unsigned int n = CUSTOM_GET_NX(buffers[0]);
															
 
																+	int n = CUSTOM_GET_NX(buffers[0]);
															
 
																 	float *x = (float*) CUSTOM_GET_X_PTR(buffers[0]);
															
 
																 	float *y = (float*) CUSTOM_GET_Y_PTR(buffers[0]);
															
 
																 	struct point *aop;
															
@@ -60,7 +60,7 @@ struct starpu_codelet cuda_to_cpu_cl =
 
																 #ifdef STARPU_USE_OPENCL
															
 
																 void opencl_to_cpu_cpu_func(void *buffers[], void *arg)
															
 
																 {
															
 
																-	unsigned int n = CUSTOM_GET_NX(buffers[0]);
															
 
																+	int n = CUSTOM_GET_NX(buffers[0]);
															
 
																 	float *x = (float *) CUSTOM_GET_OPENCL_X_PTR(buffers[0]);
															
 
																 	struct point *aop;
															
 
																 	aop = (struct point *) CUSTOM_GET_CPU_PTR(buffers[0]);
															
--- a/examples/filters/custom_mf/custom_interface.c
+++ b/examples/filters/custom_mf/custom_interface.c
@@ -46,7 +46,7 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node,
 
																 				    cl_event *event);
															
 
																 #endif /* !STARPU_USE_OPENCL */
															
 
																-static struct starpu_data_copy_methods custom_copy_data_methods_s =
															
 
																+static const struct starpu_data_copy_methods custom_copy_data_methods_s =
															
 
																 {
															
 
																 	.ram_to_ram = NULL,
															
 
																 #ifdef STARPU_USE_CUDA
															
@@ -98,7 +98,7 @@ static struct starpu_data_interface_ops interface_custom_ops =
 
																 	.get_size              = custom_interface_get_size,
															
 
																 	.footprint             = footprint_custom_interface_crc32,
															
 
																 	.compare               = NULL,
															
 
																-	.interfaceid           = -1,
															
 
																+	.interfaceid           = STARPU_UNKNOWN_INTERFACE_ID,
															
 
																 	.interface_size        = sizeof(struct custom_data_interface),
															
 
																 	.display               = display_custom_interface,
															
 
																 	.is_multiformat        = 1,
															
@@ -276,7 +276,8 @@ void custom_data_register(starpu_data_handle_t *handle,
 
																 		.ops = format_ops
															
 
																 	};
															
 
																-	if (interface_custom_ops.interfaceid == -1) {
															
 
																+	if (interface_custom_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)
															
 
																+	{
															
 
																 		interface_custom_ops.interfaceid = starpu_data_interface_get_next_id();
															
 
																 	}
															
 
																 	starpu_data_register(handle, home_node, &custom, &interface_custom_ops);
															
--- a/examples/filters/fblock_cpu.c
+++ b/examples/filters/fblock_cpu.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -18,7 +18,7 @@
 
																 void cpu_func(void *buffers[], void *cl_arg)
															
 
																 {
															
 
																-        unsigned i, j, k;
															
 
																+        int i, j, k;
															
 
																         int *factor = (int *) cl_arg;
															
 
																 	int *block = (int *)STARPU_BLOCK_GET_PTR(buffers[0]);
															
 
																 	int nx = (int)STARPU_BLOCK_GET_NX(buffers[0]);
															
--- a/examples/filters/fblock_opencl.c
+++ b/examples/filters/fblock_opencl.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
															
 
																  * Copyright (C) 2011  Université de Bordeaux 1
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -17,13 +17,13 @@
 
																 #include <starpu.h>
															
 
																-#define CHECK_CL_SET_KERNEL_ARG(kernel, n, size, ptr)       \
															
 
																-do                                                          \
															
 
																-{							    \
															
 
																-	int err;                                            \
															
 
																-	err = clSetKernelArg(kernel, n, size, ptr);         \
															
 
																-	if (err != CL_SUCCESS)                              \
															
 
																-       		STARPU_OPENCL_REPORT_ERROR(err);            \
															
 
																+#define CHECK_CL_SET_KERNEL_ARG(kernel, n, size, ptr)       	\
															
 
																+do                                                          	\
															
 
																+{								\
															
 
																+	int check_err;                           	        \
															
 
																+	check_err = clSetKernelArg(kernel, n, size, ptr);       \
															
 
																+	if (check_err != CL_SUCCESS)                            \
															
 
																+       		STARPU_OPENCL_REPORT_ERROR(check_err);          \
															
 
																 } while (0)
															
 
																 extern struct starpu_opencl_program opencl_program;
															
--- a/examples/filters/fmatrix.c
+++ b/examples/filters/fmatrix.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -43,9 +43,9 @@ void cpu_func(void *buffers[], void *cl_arg)
 
																 int main(int argc, char **argv)
															
 
																 {
															
 
																-	unsigned i, j, n=1;
															
 
																+	unsigned j, n=1;
															
 
																         int matrix[NX*NY];
															
 
																-	int ret;
															
 
																+	int ret, i;
															
 
																         FPRINTF(stderr,"IN  Matrix: \n");
															
 
																         for(j=0 ; j<NY ; j++)
															
--- a/examples/filters/fvector.c
+++ b/examples/filters/fvector.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -37,7 +37,7 @@ void cpu_func(void *buffers[], void *cl_arg)
 
																 int main(int argc, char **argv)
															
 
																 {
															
 
																-	unsigned i;
															
 
																+	int i;
															
 
																         int vector[NX];
															
 
																         starpu_data_handle_t handle;
															
 
																         int factor=1;
															
--- a/examples/filters/shadow.c
+++ b/examples/filters/shadow.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2012  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -91,11 +91,11 @@ void cuda_func(void *buffers[], void *cl_arg)
 
																 int main(int argc, char **argv)
															
 
																 {
															
 
																-	unsigned i, j;
															
 
																+	unsigned j;
															
 
																         int vector[NX + 2*SHADOW];
															
 
																         int vector2[NX + PARTS*2*SHADOW];
															
 
																 	starpu_data_handle_t handle, handle2;
															
 
																-	int ret;
															
 
																+	int ret, i;
															
 
																         struct starpu_codelet cl =
															
 
																 	{
															
--- a/examples/interface/complex_interface.c
+++ b/examples/interface/complex_interface.c
@@ -146,139 +146,30 @@ static int complex_unpack_data(starpu_data_handle_t handle, unsigned node, void
 
																 	return 0;
															
 
																 }
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-static int copy_cuda_async_sync(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, enum cudaMemcpyKind kind, cudaStream_t stream)
															
 
																+static int copy_any_to_any(void *src_interface, unsigned src_node,
															
 
																+			   void *dst_interface, unsigned dst_node,
															
 
																+			   void *async_data)
															
 
																 {
															
 
																 	struct starpu_complex_interface *src_complex = src_interface;
															
 
																 	struct starpu_complex_interface *dst_complex = dst_interface;
															
 
																-
															
 
																-	cudaStream_t sstream = stream;
															
 
																-	int ret;
															
 
																-
															
 
																-	ret = starpu_cuda_copy_async_sync((void *)src_complex->real, src_node, (void *)dst_complex->real, dst_node,
															
 
																-					  src_complex->nx*sizeof(src_complex->real[0]), sstream, kind);
															
 
																-	if (ret == 0) sstream = NULL;
															
 
																-
															
 
																-	ret = starpu_cuda_copy_async_sync((char *)src_complex->imaginary, src_node, (char *)dst_complex->imaginary, dst_node,
															
 
																-					  src_complex->nx*sizeof(src_complex->imaginary[0]), sstream, kind);
															
 
																+	int ret = 0;
															
 
																+
															
 
																+	if (starpu_interface_copy((uintptr_t) src_complex->real, 0, src_node,
															
 
																+				    (uintptr_t) dst_complex->real, 0, dst_node,
															
 
																+				     src_complex->nx*sizeof(src_complex->real[0]),
															
 
																+				     async_data))
															
 
																+		ret = -EAGAIN;
															
 
																+	if (starpu_interface_copy((uintptr_t) src_complex->imaginary, 0, src_node,
															
 
																+				    (uintptr_t) dst_complex->imaginary, 0, dst_node,
															
 
																+				     src_complex->nx*sizeof(src_complex->imaginary[0]),
															
 
																+				     async_data))
															
 
																+		ret = -EAGAIN;
															
 
																 	return ret;
															
 
																 }
															
 
																-static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
															
 
																-{
															
 
																-     return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, NULL);
															
 
																-}
															
 
																-
															
 
																-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
															
 
																-{
															
 
																-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, stream);
															
 
																-}
															
 
																-
															
 
																-static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
															
 
																-{
															
 
																-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, NULL);
															
 
																-}
															
 
																-
															
 
																-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
															
 
																-{
															
 
																-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, stream);
															
 
																-}
															
 
																-#endif
															
 
																-
															
 
																-#ifdef STARPU_USE_OPENCL
															
 
																-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
															
 
																-{
															
 
																-	struct starpu_complex_interface *src_complex = src_interface;
															
 
																-	struct starpu_complex_interface *dst_complex = dst_interface;
															
 
																-	cl_int err;
															
 
																-	int ret;
															
 
																-
															
 
																-	err = starpu_opencl_copy_ram_to_opencl(src_complex->real,
															
 
																-					       src_node,
															
 
																-					       (cl_mem) dst_complex->real,
															
 
																-					       dst_node,
															
 
																-					       src_complex->nx * sizeof(src_complex->real[0]),
															
 
																-					       0,
															
 
																-					       event,
															
 
																-					       &ret);
															
 
																-	if (STARPU_UNLIKELY(err != CL_SUCCESS))
															
 
																-		STARPU_OPENCL_REPORT_ERROR(err);
															
 
																-	if (ret == 0)
															
 
																-		event = NULL;
															
 
																-
															
 
																-	err = starpu_opencl_copy_ram_to_opencl(src_complex->imaginary,
															
 
																-					       src_node,
															
 
																-					       (cl_mem) dst_complex->imaginary,
															
 
																-					       dst_node,
															
 
																-					       src_complex->nx * sizeof(src_complex->imaginary[0]),
															
 
																-					       0,
															
 
																-					       event,
															
 
																-					       &ret);
															
 
																-	if (STARPU_UNLIKELY(err != CL_SUCCESS))
															
 
																-		STARPU_OPENCL_REPORT_ERROR(err);
															
 
																-
															
 
																-	return ret;
															
 
																-}
															
 
																-
															
 
																-static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
															
 
																-{
															
 
																-        return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
															
 
																-}
															
 
																-
															
 
																-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
															
 
																-{
															
 
																-	struct starpu_complex_interface *src_complex = src_interface;
															
 
																-	struct starpu_complex_interface *dst_complex = dst_interface;
															
 
																-	cl_int err;
															
 
																-	int ret;
															
 
																-
															
 
																-	err = starpu_opencl_copy_opencl_to_ram((cl_mem) src_complex->real,
															
 
																-					       src_node,
															
 
																-					       dst_complex->real,
															
 
																-					       dst_node,
															
 
																-					       src_complex->nx * sizeof(src_complex->real[0]),
															
 
																-					       0,
															
 
																-					       event,
															
 
																-					       &ret);
															
 
																-	if (STARPU_UNLIKELY(err != CL_SUCCESS))
															
 
																-		STARPU_OPENCL_REPORT_ERROR(err);
															
 
																-	if (ret == 0)
															
 
																-		event = NULL;
															
 
																-
															
 
																-	err = starpu_opencl_copy_opencl_to_ram((cl_mem) src_complex->imaginary,
															
 
																-					       src_node,
															
 
																-					       dst_complex->imaginary,
															
 
																-					       dst_node,
															
 
																-					       src_complex->nx * sizeof(src_complex->imaginary[0]),
															
 
																-					       0,
															
 
																-					       event,
															
 
																-					       &ret);
															
 
																-	if (STARPU_UNLIKELY(err != CL_SUCCESS))
															
 
																-		STARPU_OPENCL_REPORT_ERROR(err);
															
 
																-
															
 
																-	return ret;
															
 
																-}
															
 
																-
															
 
																-static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
															
 
																-{
															
 
																-        return copy_opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, NULL);
															
 
																-}
															
 
																-#endif
															
 
																-
															
 
																-static struct starpu_data_copy_methods complex_copy_methods =
															
 
																+static const struct starpu_data_copy_methods complex_copy_methods =
															
 
																 {
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-	.ram_to_cuda = copy_ram_to_cuda,
															
 
																-	.cuda_to_ram = copy_cuda_to_ram,
															
 
																-	.ram_to_cuda_async = copy_ram_to_cuda_async,
															
 
																-	.cuda_to_ram_async = copy_cuda_to_ram_async,
															
 
																-#endif
															
 
																-#ifdef STARPU_USE_OPENCL
															
 
																-	.ram_to_opencl = copy_ram_to_opencl,
															
 
																-	.opencl_to_ram = copy_opencl_to_ram,
															
 
																-	.ram_to_opencl_async = copy_ram_to_opencl_async,
															
 
																-	.opencl_to_ram_async = copy_opencl_to_ram_async,
															
 
																-#endif
															
 
																+	.any_to_any = copy_any_to_any
															
 
																 };
															
 
																 static struct starpu_data_interface_ops interface_complex_ops =
															
@@ -289,7 +180,7 @@ static struct starpu_data_interface_ops interface_complex_ops =
 
																 	.copy_methods = &complex_copy_methods,
															
 
																 	.get_size = complex_get_size,
															
 
																 	.footprint = complex_footprint,
															
 
																-	.interfaceid = -1,
															
 
																+	.interfaceid = STARPU_UNKNOWN_INTERFACE_ID,
															
 
																 	.interface_size = sizeof(struct starpu_complex_interface),
															
 
																 	.handle_to_pointer = complex_handle_to_pointer,
															
 
																 	.pack_data = complex_pack_data,
															
@@ -305,7 +196,7 @@ void starpu_complex_data_register(starpu_data_handle_t *handleptr, unsigned home
 
																 		.nx = nx
															
 
																 	};
															
 
																-	if (interface_complex_ops.interfaceid == -1)
															
 
																+	if (interface_complex_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)
															
 
																 	{
															
 
																 		interface_complex_ops.interfaceid = starpu_data_interface_get_next_id();
															
 
																 	}
															
--- a/examples/ppm_downscaler/ppm_downscaler.c
+++ b/examples/ppm_downscaler/ppm_downscaler.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -75,7 +75,7 @@ struct ppm_image *file_to_ppm(char *filename)
 
																 	ret = fread(ppm->data, sizeof(struct ppm_color), ppm->ncols*ppm->nlines, file);
															
 
																 	STARPU_ASSERT(ret == ppm->ncols*ppm->nlines);
															
 
																-	unsigned i;
															
 
																+	int i;
															
 
																 	for (i = 0; i < ppm->ncols*ppm->nlines; i++)
															
 
																 	{
															
 
																 /*		fprintf(stderr, "READ (index %d) -> r %d g %d b %d\n", i, ppm->data[i].r, ppm->data[i].g, ppm->data[i].b); */
															
@@ -121,7 +121,7 @@ void dummy_downscale(struct ppm_image *input_ppm, struct ppm_image *output_ppm)
 
																 	struct ppm_color *in = input_ppm->data;
															
 
																 	struct ppm_color *out = output_ppm->data;
															
 
																-	unsigned line, col;
															
 
																+	int line, col;
															
 
																 	for (line = 0; line < output_ppm->nlines; line++)
															
 
																 	{
															
 
																 		for (col = 0; col < output_ppm->ncols; col++)
															
--- a/examples/profiling/profiling.c
+++ b/examples/profiling/profiling.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010-2012  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -107,7 +107,7 @@ int main(int argc, char **argv)
 
																 	FPRINTF(stderr, "Avg. length : %2.2lf us\n", (length_sum)/niter);
															
 
																 	/* Display the occupancy of all workers during the test */
															
 
																-	int worker;
															
 
																+	unsigned worker;
															
 
																 	for (worker = 0; worker < starpu_worker_get_count(); worker++)
															
 
																 	{
															
 
																 		struct starpu_worker_profiling_info worker_info;
															
--- a/examples/sched_ctx/sched_ctx.c
+++ b/examples/sched_ctx/sched_ctx.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010-2012  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010-2012  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010-2013  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -79,7 +79,7 @@ int main(int argc, char **argv)
 
																 	/*indicate what to do with the resources when context 2 finishes (it depends on your application)*/
															
 
																 	starpu_sched_ctx_set_inheritor(sched_ctx2, sched_ctx1);
															
 
																-	unsigned i;
															
 
																+	int i;
															
 
																 	for (i = 0; i < ntasks/2; i++)
															
 
																 	{
															
 
																 		struct starpu_task *task = starpu_task_create();
															
--- a/examples/sched_ctx_utils/sched_ctx_utils.c
+++ b/examples/sched_ctx_utils/sched_ctx_utils.c
@@ -93,7 +93,7 @@ void* start_bench(void *val)
 
																 	pthread_setspecific(key, &p->id);
															
 
																 	if(p->ctx != 0)
															
 
																-		starpu_task_set_context(&p->ctx);
															
 
																+		starpu_sched_ctx_set_context(&p->ctx);
															
 
																 	for(i = 0; i < NSAMPLES; i++)
															
 
																 		p->bench(p->size, p->nblocks);
															
--- a/examples/scheduler/dummy_sched.c
+++ b/examples/scheduler/dummy_sched.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010-2012  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010-2012  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010-2013  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -28,7 +28,7 @@ typedef struct dummy_sched_data {
 
																 static void init_dummy_sched(unsigned sched_ctx_id)
															
 
																 {
															
 
																-	starpu_sched_ctx_create_worker_collection(sched_ctx_id, STARPU_WORKER_LIST);
															
 
																+	starpu_sched_ctx_create_worker_collection(sched_ctx_id, STARPU_SCHED_CTX_WORKER_LIST);
															
 
																 	struct dummy_sched_data *data = (struct dummy_sched_data*)malloc(sizeof(struct dummy_sched_data));
															
@@ -70,7 +70,7 @@ static int push_task_dummy(struct starpu_task *task)
 
																 	   of them would pop for tasks */
															
 
																 	unsigned worker = 0;
															
 
																 	struct starpu_sched_ctx_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
															
 
																-	struct starpu_iterator it;
															
 
																+	struct starpu_sched_ctx_iterator it;
															
 
																 	if(workers->init_iterator)
															
 
																 		workers->init_iterator(workers, &it);
															
@@ -159,7 +159,7 @@ int main(int argc, char **argv)
 
																 	ntasks /= 100;
															
 
																 #endif
															
 
																-	unsigned i;
															
 
																+	int i;
															
 
																 	for (i = 0; i < ntasks; i++)
															
 
																 	{
															
 
																 		struct starpu_task *task = starpu_task_create();
															
--- a/examples/stencil/Makefile.am
+++ b/examples/stencil/Makefile.am
@@ -13,10 +13,10 @@
 
																 #
															
 
																 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																-AM_CFLAGS = $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
															
 
																+AM_CFLAGS = $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
															
 
																 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(HWLOC_LIBS) @LIBS@
															
 
																 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
															
 
																-AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) 
															
 
																+AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS)
															
 
																 if USE_MPI
															
 
																 LIBS += $(top_builddir)/mpi/src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
															
--- a/examples/stencil/life.c
+++ b/examples/stencil/life.c
@@ -20,7 +20,7 @@
 
																 void life_update(int bz, const TYPE *old, TYPE *newp, int nx, int ny, int nz, int ldy, int ldz, int iter)
															
 
																 {
															
 
																-	unsigned x, y, z, num, alive;
															
 
																+	int x, y, z, num, alive;
															
 
																 	for (z = iter; z < nz - iter; z++)
															
 
																 	{
															
--- a/examples/stencil/stencil-blocks.c
+++ b/examples/stencil/stencil-blocks.c
@@ -121,7 +121,7 @@ struct block_description *get_block_description(int z)
 
																 	return &blocks[z];
															
 
																 }
															
 
																-unsigned get_block_mpi_node(int z)
															
 
																+int get_block_mpi_node(int z)
															
 
																 {
															
 
																 	z = (z + nbz)%nbz;
															
 
																 	return blocks[z].mpi_node;
															
@@ -277,7 +277,7 @@ void allocate_memory_on_node(int rank)
 
																 	{
															
 
																 		struct block_description *block = get_block_description(bz);
															
 
																-		unsigned node = block->mpi_node;
															
 
																+		int node = block->mpi_node;
															
 
																 		unsigned size_bz = block_sizes_z[bz];
															
@@ -301,7 +301,7 @@ void allocate_memory_on_node(int rank)
 
																 		}
															
 
																 		/* Boundary blocks : Top */
															
 
																-		unsigned top_node = block->boundary_blocks[T]->mpi_node;
															
 
																+		int top_node = block->boundary_blocks[T]->mpi_node;
															
 
																 		if ((node == rank) || (top_node == rank))
															
 
																 		{
															
 
																 			allocate_block_on_node(&block->boundaries_handle[T][0], &block->boundaries[T][0],
															
@@ -311,7 +311,7 @@ void allocate_memory_on_node(int rank)
 
																 		} 
															
 
																 		/* Boundary blocks : Bottom */
															
 
																-		unsigned bottom_node = block->boundary_blocks[B]->mpi_node;
															
 
																+		int bottom_node = block->boundary_blocks[B]->mpi_node;
															
 
																 		if ((node == rank) || (bottom_node == rank))
															
 
																 		{
															
 
																 			allocate_block_on_node(&block->boundaries_handle[B][0], &block->boundaries[B][0],
															
@@ -330,7 +330,7 @@ void check(int rank)
 
																 	{
															
 
																 		struct block_description *block = get_block_description(bz);
															
 
																-		unsigned node = block->mpi_node;
															
 
																+		int node = block->mpi_node;
															
 
																 		/* Main blocks */
															
 
																 		if (node == rank)
															
--- a/examples/stencil/stencil-tasks.c
+++ b/examples/stencil/stencil-tasks.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010  Université de Bordeaux 1
															
 
																- * Copyright (C) 2012  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -40,7 +40,7 @@
 
																  */
															
 
																 /* R(z) = R(z+d) = local, just call the save kernel */
															
 
																-static void create_task_save_local(unsigned iter, unsigned z, int dir, unsigned local_rank)
															
 
																+static void create_task_save_local(unsigned iter, unsigned z, int dir, int local_rank)
															
 
																 {
															
 
																 	struct starpu_task *save_task = starpu_task_create();
															
 
																 	struct block_description *descr = get_block_description(z);
															
@@ -81,7 +81,7 @@ static void send_done(void *arg)
 
																 #ifdef STARPU_USE_MPI
															
 
																 /* Post MPI send */
															
 
																-static void create_task_save_mpi_send(unsigned iter, unsigned z, int dir, unsigned local_rank)
															
 
																+static void create_task_save_mpi_send(unsigned iter, unsigned z, int dir, int local_rank)
															
 
																 {
															
 
																 	struct block_description *descr = get_block_description(z);
															
 
																 	STARPU_ASSERT(descr->mpi_node == local_rank);
															
@@ -108,7 +108,7 @@ static void recv_done(void *arg)
 
																 }
															
 
																 /* Post MPI recv */
															
 
																-static void create_task_save_mpi_recv(unsigned iter, unsigned z, int dir, unsigned local_rank)
															
 
																+static void create_task_save_mpi_recv(unsigned iter, unsigned z, int dir, int local_rank)
															
 
																 {
															
 
																 	struct block_description *descr = get_block_description(z);
															
 
																 	STARPU_ASSERT(descr->mpi_node != local_rank);
															
@@ -129,10 +129,10 @@ static void create_task_save_mpi_recv(unsigned iter, unsigned z, int dir, unsign
 
																 /*
															
 
																  * Schedule saving boundaries of blocks to communication buffers
															
 
																  */
															
 
																-void create_task_save(unsigned iter, unsigned z, int dir, unsigned local_rank)
															
 
																+void create_task_save(unsigned iter, unsigned z, int dir, int local_rank)
															
 
																 {
															
 
																-	unsigned node_z = get_block_mpi_node(z);
															
 
																-	unsigned node_z_and_d = get_block_mpi_node(z+dir);
															
 
																+	int node_z = get_block_mpi_node(z);
															
 
																+	int node_z_and_d = get_block_mpi_node(z+dir);
															
 
																 #ifdef STARPU_USE_MPI
															
 
																 	if (node_z == local_rank)
															
@@ -168,7 +168,7 @@ void create_task_save(unsigned iter, unsigned z, int dir, unsigned local_rank)
 
																  * Schedule update computation in computation buffer
															
 
																  */
															
 
																-void create_task_update(unsigned iter, unsigned z, unsigned local_rank)
															
 
																+void create_task_update(unsigned iter, unsigned z, int local_rank)
															
 
																 {
															
 
																 	STARPU_ASSERT(iter != 0);
															
@@ -253,8 +253,8 @@ void create_start_task(int z, int dir)
 
																  */
															
 
																 void create_tasks(int rank)
															
 
																 {
															
 
																-	unsigned iter;
															
 
																-	unsigned bz;
															
 
																+	int iter;
															
 
																+	int bz;
															
 
																 	int niter = get_niter();
															
 
																 	int nbz = get_nbz();
															
@@ -288,7 +288,7 @@ void create_tasks(int rank)
 
																  */
															
 
																 void wait_end_tasks(int rank)
															
 
																 {
															
 
																-	unsigned bz;
															
 
																+	int bz;
															
 
																 	int nbz = get_nbz();
															
 
																 	for (bz = 0; bz < nbz; bz++)
															
--- a/examples/stencil/stencil.c
+++ b/examples/stencil/stencil.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
															
 
																  * Copyright (C) 2010-2012  Université de Bordeaux 1
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -221,7 +221,7 @@ int main(int argc, char **argv)
 
																 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
															
 
																 #ifdef STARPU_USE_MPI
															
 
																-	starpu_mpi_initialize();
															
 
																+	starpu_mpi_init(NULL, NULL, 0);
															
 
																 #endif
															
 
																 #ifdef STARPU_USE_OPENCL
															
@@ -312,8 +312,8 @@ int main(int argc, char **argv)
 
																 #if 1
															
 
																 		unsigned nzblocks_per_process = (nbz + world_size - 1) / world_size;
															
 
																-		unsigned bz, iter;
															
 
																-		unsigned last;
															
 
																+		int iter;
															
 
																+		unsigned last, bz;
															
 
																 		for (iter = 0; iter < who_runs_what_len; iter++)
															
 
																 		{
															
 
																 			last = 1;
															
--- a/examples/stencil/stencil.h
+++ b/examples/stencil/stencil.h
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
															
 
																  * Copyright (C) 2010-2011  Université de Bordeaux 1
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -56,8 +56,8 @@ typedef enum
 
																 struct block_description
															
 
																 {
															
 
																 	/* Which MPI node should process that block ? */
															
 
																-	unsigned mpi_node;
															
 
																-	
															
 
																+	int mpi_node;
															
 
																+
															
 
																 	unsigned preferred_worker;
															
 
																 	unsigned bz;
															
@@ -101,7 +101,7 @@ void check(int rank);
 
																 void display_memory_consumption(int rank);
															
 
																-unsigned get_block_mpi_node(int z);
															
 
																+int get_block_mpi_node(int z);
															
 
																 unsigned get_block_size(int z);
															
 
																 unsigned get_bind_tasks(void);
															
@@ -111,8 +111,8 @@ unsigned get_ticks(void);
 
																 unsigned global_workerid(unsigned local_workerid);
															
 
																-void create_task_update(unsigned iter, unsigned z, unsigned local_rank);
															
 
																-void create_task_save(unsigned iter, unsigned z, int dir, unsigned local_rank);
															
 
																+void create_task_update(unsigned iter, unsigned z, int local_rank);
															
 
																+void create_task_save(unsigned iter, unsigned z, int dir, int local_rank);
															
 
																 extern int starpu_mpi_initialize(void);
															
 
																 extern int starpu_mpi_shutdown(void);
															
--- a/examples/tag_example/tag_example2.c
+++ b/examples/tag_example/tag_example2.c
@@ -76,7 +76,7 @@ static void tag_cleanup_grid(unsigned ni, unsigned iter)
 
																 static int create_task_grid(unsigned iter)
															
 
																 {
															
 
																-	int i;
															
 
																+	unsigned i;
															
 
																 	int ret;
															
 
																 /*	FPRINTF(stderr, "start iter %d ni %d...\n", iter, ni); */
															
--- a/include/starpu_cuda.h
+++ b/include/starpu_cuda.h
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010-2012  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -39,7 +39,6 @@ void starpu_cuda_report_error(const char *func, const char *file, int line, cuda
 
																 #define STARPU_CUDA_REPORT_ERROR(status) \
															
 
																 	starpu_cuda_report_error(__starpu_func__, __FILE__, __LINE__, status)
															
 
																-size_t starpu_cuda_get_global_mem_size(unsigned devid);
															
 
																 cudaStream_t starpu_cuda_get_local_stream(void);
															
 
																 const struct cudaDeviceProp *starpu_cuda_get_device_properties(unsigned workerid);
															
--- a/include/starpu_data.h
+++ b/include/starpu_data.h
@@ -85,7 +85,7 @@ int starpu_data_acquire_on_node_cb(starpu_data_handle_t handle, unsigned node, e
 
																 void starpu_data_release(starpu_data_handle_t handle);
															
 
																 void starpu_data_release_on_node(starpu_data_handle_t handle, unsigned node);
															
 
																-void starpu_malloc_set_align(size_t);
															
 
																+void starpu_malloc_set_align(size_t align);
															
 
																 int starpu_malloc(void **A, size_t dim);
															
 
																 int starpu_free(void *A);
															
 
																 void starpu_memory_display_stats();
															
--- a/include/starpu_data_interfaces.h
+++ b/include/starpu_data_interfaces.h
@@ -73,10 +73,15 @@ struct starpu_data_copy_methods
 
																 	int (*opencl_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event);
															
 
																 	int (*opencl_to_opencl_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event);
															
 
																 #endif
															
 
																+
															
 
																+	int (*any_to_any)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
															
 
																 };
															
 
																+int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, void *async_data);
															
 
																+
															
 
																 enum starpu_data_interface_id
															
 
																 {
															
 
																+	STARPU_UNKNOWN_INTERFACE_ID = -1,
															
 
																 	STARPU_MATRIX_INTERFACE_ID=0,
															
 
																 	STARPU_BLOCK_INTERFACE_ID=1,
															
 
																 	STARPU_VECTOR_INTERFACE_ID=2,
															
@@ -99,7 +104,7 @@ struct starpu_data_interface_ops
 
																 	/* Free data of the interface on a given node. */
															
 
																 	void (*free_data_on_node)(void *data_interface, unsigned node);
															
 
																 	/* ram/cuda/opencl synchronous and asynchronous transfer methods */
															
 
																-	struct starpu_data_copy_methods *copy_methods;
															
 
																+	const struct starpu_data_copy_methods *copy_methods;
															
 
																 	/* Return the current pointer (if any) for the handle on the given node. */
															
 
																 	void * (*handle_to_pointer)(starpu_data_handle_t handle, unsigned node);
															
 
																 	/* Return an estimation of the size of data, for performance models */
															
--- a/include/starpu_opencl.h
+++ b/include/starpu_opencl.h
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010-2013  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -53,7 +53,6 @@ struct starpu_opencl_program
 
																 	cl_program programs[STARPU_MAXOPENCLDEVS];
															
 
																 };
															
 
																-size_t starpu_opencl_get_global_mem_size(int devid);
															
 
																 void starpu_opencl_get_context(int devid, cl_context *context);
															
 
																 void starpu_opencl_get_device(int devid, cl_device_id *device);
															
 
																 void starpu_opencl_get_queue(int devid, cl_command_queue *queue);
															
@@ -108,7 +107,7 @@ cl_int starpu_opencl_copy_opencl_to_ram(cl_mem buffer, unsigned src_node, void *
 
																 cl_int starpu_opencl_copy_opencl_to_opencl(cl_mem src, unsigned src_node, size_t src_offset, cl_mem dst, unsigned dst_node, size_t dst_offset, size_t size, cl_event *event, int *ret);
															
 
																-cl_int starpu_opencl_copy_async_sync(uintptr_t src, unsigned src_node, size_t src_offset, uintptr_t dst, unsigned dst_node, size_t dst_offset, size_t size, cl_event *event);
															
 
																+cl_int starpu_opencl_copy_async_sync(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, cl_event *event);
															
 
																 #ifdef __cplusplus
															
 
																 }
															
--- a/include/starpu_perfmodel.h
+++ b/include/starpu_perfmodel.h
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010-2012  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010-2013  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
															
 
																  * Copyright (C) 2011  Télécom-SudParis
															
 
																  *
															
@@ -102,6 +102,8 @@ struct starpu_perfmodel_history_entry
 
																 #else
															
 
																 	size_t size; /* in bytes */
															
 
																 #endif
															
 
																+
															
 
																+	double flops; /* Provided by the application */
															
 
																 };
															
 
																 struct starpu_perfmodel_history_list
															
@@ -212,6 +214,10 @@ void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct star
 
																 void starpu_bus_print_bandwidth(FILE *f);
															
 
																 void starpu_bus_print_affinity(FILE *f);
															
 
																+/* use bw & latency to compute the velocity of resources*/
															
 
																+double starpu_get_bandwidth_RAM_CUDA(unsigned cudadev);
															
 
																+double starpu_get_latency_RAM_CUDA(unsigned cudadev);
															
 
																+
															
 
																 #ifdef __cplusplus
															
 
																 }
															
 
																 #endif
															
--- a/include/starpu_sched_ctx.h
+++ b/include/starpu_sched_ctx.h
@@ -24,12 +24,8 @@ extern "C"
 
																 {
															
 
																 #endif
															
 
																-#ifdef STARPU_DEVEL
															
 
																-#  warning rename all objects to start with starpu_sched_ctx
															
 
																-#endif
															
 
																-
															
 
																-//struct starpu_iterator;
															
 
																-struct starpu_iterator
															
 
																+//struct starpu_sched_ctx_iterator;
															
 
																+struct starpu_sched_ctx_iterator
															
 
																 {
															
 
																 	int cursor;
															
 
																 };
															
@@ -42,12 +38,12 @@ struct starpu_sched_ctx_worker_collection
 
																 	void *workerids;
															
 
																 	/* the number of workers in the collection */
															
 
																 	unsigned nworkers;
															
 
																-	/* the type of structure (STARPU_WORKER_LIST,...) */
															
 
																+	/* the type of structure (STARPU_SCHED_CTX_WORKER_LIST,...) */
															
 
																 	int type;
															
 
																 	/* checks if there is another element in collection */
															
 
																-	unsigned (*has_next)(struct starpu_sched_ctx_worker_collection *workers, struct starpu_iterator *it);
															
 
																+	unsigned (*has_next)(struct starpu_sched_ctx_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
															
 
																 	/* return the next element in the collection */
															
 
																-	int (*get_next)(struct starpu_sched_ctx_worker_collection *workers, struct starpu_iterator *it);
															
 
																+	int (*get_next)(struct starpu_sched_ctx_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
															
 
																 	/* add a new element in the collection */
															
 
																 	int (*add)(struct starpu_sched_ctx_worker_collection *workers, int worker);
															
 
																 	/* remove an element from the collection */
															
@@ -57,26 +53,26 @@ struct starpu_sched_ctx_worker_collection
 
																 	/* free the structure */
															
 
																 	void (*deinit)(struct starpu_sched_ctx_worker_collection *workers);
															
 
																 	/* initialize the cursor if there is one */
															
 
																-	void (*init_iterator)(struct starpu_sched_ctx_worker_collection *workers, struct starpu_iterator *it);
															
 
																+	void (*init_iterator)(struct starpu_sched_ctx_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
															
 
																 };
															
 
																 /* types of structures the worker collection can implement */
															
 
																-#define STARPU_WORKER_LIST 0
															
 
																+#define STARPU_SCHED_CTX_WORKER_LIST 0
															
 
																-struct starpu_performance_counters
															
 
																+struct starpu_sched_ctx_performance_counters
															
 
																 {
															
 
																 	void (*notify_idle_cycle)(unsigned sched_ctx_id, int worker, double idle_time);
															
 
																 	void (*notify_idle_end)(unsigned sched_ctx_id, int worker);
															
 
																 	void (*notify_pushed_task)(unsigned sched_ctx_id, int worker);
															
 
																-	void (*notify_poped_task)(unsigned sched_ctx_id, int worker, double flops);
															
 
																+	void (*notify_poped_task)(unsigned sched_ctx_id, int worker, double flops, size_t data_size);
															
 
																 	void (*notify_post_exec_hook)(unsigned sched_ctx_id, int taskid);
															
 
																 	void (*notify_submitted_job)(struct starpu_task *task, uint32_t footprint);
															
 
																 };
															
 
																 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
															
 
																-void starpu_set_perf_counters(unsigned sched_ctx_id, struct starpu_performance_counters *perf_counters);
															
 
																-void starpu_call_poped_task_cb(int workerid, unsigned sched_ctx_id, double flops);
															
 
																-void starpu_call_pushed_task_cb(int workerid, unsigned sched_ctx_id);
															
 
																+void starpu_sched_ctx_set_perf_counters(unsigned sched_ctx_id, struct starpu_sched_ctx_performance_counters *perf_counters);
															
 
																+void starpu_sched_ctx_call_poped_task_cb(int workerid, unsigned sched_ctx_id, double flops, size_t data_size);
															
 
																+void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id);
															
 
																 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
															
 
																 unsigned starpu_sched_ctx_create(const char *policy_name, int *workerids_ctx, int nworkers_ctx, const char *sched_ctx_name);
															
@@ -102,16 +98,16 @@ void starpu_sched_ctx_delete_worker_collection(unsigned sched_ctx_id);
 
																 struct starpu_sched_ctx_worker_collection* starpu_sched_ctx_get_worker_collection(unsigned sched_ctx_id);
															
 
																 #if !defined(_MSC_VER) && !defined(STARPU_SIMGRID)
															
 
																-pthread_mutex_t* starpu_get_changing_ctx_mutex(unsigned sched_ctx_id);
															
 
																+pthread_mutex_t* starpu_sched_ctx_get_changing_ctx_mutex(unsigned sched_ctx_id);
															
 
																 #endif
															
 
																-void starpu_task_set_context(unsigned *sched_ctx_id);
															
 
																+void starpu_sched_ctx_set_context(unsigned *sched_ctx_id);
															
 
																-unsigned starpu_task_get_context(void);
															
 
																+unsigned starpu_sched_ctx_get_context(void);
															
 
																-void starpu_notify_hypervisor_exists(void);
															
 
																+void starpu_sched_ctx_notify_hypervisor_exists(void);
															
 
																-unsigned starpu_check_if_hypervisor_exists(void);
															
 
																+unsigned starpu_sched_ctx_check_if_hypervisor_exists(void);
															
 
																 unsigned starpu_sched_ctx_get_nworkers(unsigned sched_ctx_id);
															
@@ -121,13 +117,13 @@ unsigned starpu_sched_ctx_contains_worker(int workerid, unsigned sched_ctx_id);
 
																 unsigned starpu_sched_ctx_overlapping_ctxs_on_worker(int workerid);
															
 
																-unsigned starpu_is_ctxs_turn(int workerid, unsigned sched_ctx_id);
															
 
																+unsigned starpu_sched_ctx_is_ctxs_turn(int workerid, unsigned sched_ctx_id);
															
 
																-void starpu_set_turn_to_other_ctx(int workerid, unsigned sched_ctx_id);
															
 
																+void starpu_sched_ctx_set_turn_to_other_ctx(int workerid, unsigned sched_ctx_id);
															
 
																-double starpu_get_max_time_worker_on_ctx(void);
															
 
																+double starpu_sched_ctx_get_max_time_worker_on_ctx(void);
															
 
																-void starpu_stop_task_submission(void);
															
 
																+void starpu_sched_ctx_stop_task_submission(void);
															
 
																 void starpu_sched_ctx_set_inheritor(unsigned sched_ctx_id, unsigned inheritor);
															
--- a/include/starpu_task.h
+++ b/include/starpu_task.h
@@ -321,6 +321,9 @@ int starpu_task_wait_for_all_in_ctx(unsigned sched_ctx_id);
 
																 /* This function waits until there is no more ready task. */
															
 
																 int starpu_task_wait_for_no_ready(void);
															
 
																+int starpu_task_nready(void);
															
 
																+int starpu_task_nsubmitted(void);
															
 
																+
															
 
																 void starpu_codelet_init(struct starpu_codelet *cl);
															
 
																 void starpu_display_codelet_stats(struct starpu_codelet *cl);
															
--- a/include/starpu_task_util.h
+++ b/include/starpu_task_util.h
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010-2012  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010-2013  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -45,7 +45,7 @@ void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t
 
																 #define STARPU_DATA_ARRAY       (1<<11) /* Array of data handles */
															
 
																 #define STARPU_TAG       (1<<12) /* Tag */
															
 
																 #define STARPU_HYPERVISOR_TAG	(1<<13)	/* Used to tag a task after whose execution we'll execute  a code */
															
 
																-#define STARPU_HYPERVISOR_FLOPS	(1<<14)	/* Used to specify the number of flops needed to be executed by a task */
															
 
																+#define STARPU_FLOPS	(1<<14)	/* Used to specify the number of flops needed to be executed by a task */
															
 
																 /* Wrapper to create a task. */
															
 
																 int starpu_insert_task(struct starpu_codelet *cl, ...);
															
--- a/include/starpu_util.h
+++ b/include/starpu_util.h
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010-2012  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -207,17 +207,17 @@ static __inline int starpu_get_env_number(const char *str)
 
																 	if (strval)
															
 
																 	{
															
 
																 		/* the env variable was actually set */
															
 
																-		unsigned val;
															
 
																+		long int val;
															
 
																 		char *check;
															
 
																-		val = (int)strtol(strval, &check, 10);
															
 
																+		val = strtol(strval, &check, 10);
															
 
																 		if (*check) {
															
 
																 			fprintf(stderr,"The %s environment variable must contain an integer\n", str);
															
 
																 			STARPU_ABORT();
															
 
																 		}
															
 
																 		/* fprintf(stderr, "ENV %s WAS %d\n", str, val); */
															
 
																-		return val;
															
 
																+		return (int)val;
															
 
																 	}
															
 
																 	else
															
 
																 	{
															
--- a/mpi/examples/Makefile.am
+++ b/mpi/examples/Makefile.am
@@ -75,7 +75,7 @@ NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -
 
																 	$(NVCC) $< -c -o $@ $(NVCCFLAGS)
															
 
																 endif
															
 
																-AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Werror=implicit
															
 
																+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
															
 
																 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
															
 
																 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include -I$(top_srcdir)/mpi/include
															
 
																 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS)
															
--- a/mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
+++ b/mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
@@ -72,10 +72,9 @@ void dw_cholesky(float ***matA, unsigned size, unsigned ld, unsigned nblocks, in
 
																 	struct timeval start;
															
 
																 	struct timeval end;
															
 
																 	starpu_data_handle_t **data_handles;
															
 
																-	int x, y;
															
 
																+	unsigned x,y,i,j,k;
															
 
																 	/* create all the DAG nodes */
															
 
																-	unsigned i,j,k;
															
 
																 	data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
															
 
																 	for(x=0 ; x<nblocks ; x++) data_handles[x] = malloc(nblocks*sizeof(starpu_data_handle_t));
															
--- a/mpi/examples/matrix_decomposition/mpi_decomposition_params.c
+++ b/mpi/examples/matrix_decomposition/mpi_decomposition_params.c
@@ -25,8 +25,8 @@ unsigned nblocks = 16;
 
																 unsigned nbigblocks = 2;
															
 
																 unsigned noprio = 0;
															
 
																 unsigned display = 0;
															
 
																-unsigned dblockx = -1;
															
 
																-unsigned dblocky = -1;
															
 
																+int dblockx = -1;
															
 
																+int dblocky = -1;
															
 
																 void parse_args(int argc, char **argv, int nodes)
															
 
																 {
															
--- a/mpi/examples/mpi_lu/plu_example.c
+++ b/mpi/examples/mpi_lu/plu_example.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -32,8 +32,8 @@
 
																 static unsigned long size = 4096;
															
 
																 static unsigned nblocks = 16;
															
 
																 static unsigned check = 0;
															
 
																-static unsigned p = 1;
															
 
																-static unsigned q = 1;
															
 
																+static int p = 1;
															
 
																+static int q = 1;
															
 
																 static unsigned display = 0;
															
 
																 #ifdef STARPU_HAVE_LIBNUMA
															
--- a/mpi/examples/mpi_lu/pxlu.c
+++ b/mpi/examples/mpi_lu/pxlu.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010, 2011  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2012, 2013  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -101,7 +101,7 @@ static void send_data_to_mask(starpu_data_handle_t handle, int *rank_mask, int m
 
																 	int mpi_tag_array[world_size];
															
 
																 	starpu_data_handle_t handle_array[world_size];
															
 
																-	unsigned r;
															
 
																+	int r;
															
 
																 	for (r = 0; r < world_size; r++)
															
 
																 	{
															
 
																 		if (rank_mask[r]) {
															
--- a/mpi/src/Makefile.am
+++ b/mpi/src/Makefile.am
@@ -21,7 +21,7 @@ BUILT_SOURCES =
 
																 CLEANFILES = *.gcno *.gcda *.linkinfo
															
 
																-AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Werror=implicit
															
 
																+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS)
															
 
																 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
															
 
																 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/src/ -I$(top_builddir)/src -I$(top_builddir)/include -I$(top_srcdir)/mpi/include -I$(top_srcdir)/mpi/src
															
 
																 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS)
															
--- a/mpi/src/starpu_mpi.c
+++ b/mpi/src/starpu_mpi.c
@@ -443,12 +443,12 @@ static void _starpu_mpi_test_func(struct _starpu_mpi_req *testing_req)
 
																 	_STARPU_MPI_DEBUG("Test request %p - mpitag %d - TYPE %s %d\n", &req->request, req->mpi_tag, _starpu_mpi_request_type(req->request_type), req->srcdst);
															
 
																 	TRACE_MPI_UTESTING_BEGIN(req->srcdst, req->mpi_tag);
															
 
																-	
															
 
																+
															
 
																 	req->ret = MPI_Test(&req->request, testing_req->flag, testing_req->status);
															
 
																 	STARPU_ASSERT(req->ret == MPI_SUCCESS);
															
 
																 	TRACE_MPI_UTESTING_END(req->srcdst, req->mpi_tag);
															
 
																-	
															
 
																+
															
 
																 	if (*testing_req->flag)
															
 
																 	{
															
 
																 		testing_req->ret = req->ret;
															
@@ -841,6 +841,17 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
																 		_starpu_mpi_print_thread_level_support(provided, " has been initialized with");
															
 
																 	}
															
 
																+	{
															
 
																+	     int rank, worldsize;
															
 
																+	     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
															
 
																+	     MPI_Comm_size(MPI_COMM_WORLD, &worldsize);
															
 
																+	     TRACE_MPI_START(rank, worldsize);
															
 
																+#ifdef STARPU_USE_FXT
															
 
																+	     starpu_set_profiling_id(rank);
															
 
																+#endif //STARPU_USE_FXT
															
 
																+	}
															
 
																+
															
 
																+
															
 
																 	/* notify the main thread that the progression thread is ready */
															
 
																 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
															
 
																 	running = 1;
															
@@ -862,7 +873,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
																 			_STARPU_MPI_DEBUG("NO MORE REQUESTS TO HANDLE\n");
															
 
																 			TRACE_MPI_SLEEP_BEGIN();
															
 
																-			
															
 
																+
															
 
																 			if (barrier_running)
															
 
																 				/* Tell mpi_barrier */
															
 
																 				_STARPU_PTHREAD_COND_SIGNAL(&cond_finished);
															
@@ -967,13 +978,6 @@ int _starpu_mpi_initialize(int *argc, char ***argv, int initialize_mpi)
 
																 	argc_argv->argc = argc;
															
 
																 	argc_argv->argv = argv;
															
 
																-	int rank, worldsize;
															
 
																-
															
 
																-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
															
 
																-	MPI_Comm_size(MPI_COMM_WORLD, &worldsize);
															
 
																-
															
 
																-	TRACE_MPI_START(rank,worldsize);
															
 
																-
															
 
																 	_STARPU_PTHREAD_CREATE("MPI progress", &progress_thread, NULL, _starpu_mpi_progress_thread_func, argc_argv);
															
 
																 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
															
@@ -981,10 +985,6 @@ int _starpu_mpi_initialize(int *argc, char ***argv, int initialize_mpi)
 
																 		_STARPU_PTHREAD_COND_WAIT(&cond_progression, &mutex);
															
 
																 	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
															
 
																-#ifdef STARPU_USE_FXT
															
 
																-	starpu_set_profiling_id(rank);
															
 
																-#endif //STARPU_USE_FXT
															
 
																-
															
 
																 #ifdef USE_STARPU_ACTIVITY
															
 
																 	hookid = starpu_progression_hook_register(progression_hook_func, NULL);
															
 
																 	STARPU_ASSERT(hookid >= 0);
															
@@ -1053,4 +1053,3 @@ int starpu_mpi_shutdown(void)
 
																 	return 0;
															
 
																 }
															
 
																-
															
--- a/mpi/src/starpu_mpi_stats.c
+++ b/mpi/src/starpu_mpi_stats.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2012  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -70,7 +70,7 @@ void starpu_mpi_comm_amounts_retrieve(size_t *comm_amounts)
 
																 void _starpu_mpi_comm_amounts_display(int node)
															
 
																 {
															
 
																-	unsigned dst;
															
 
																+	int dst;
															
 
																 	size_t sum = 0;
															
 
																 	if (stats_enabled == 0) return;
															
--- a/mpi/tests/Makefile.am
+++ b/mpi/tests/Makefile.am
@@ -64,7 +64,7 @@ NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -
 
																 	$(NVCC) $< -c -o $@ $(NVCCFLAGS)
															
 
																 endif
															
 
																-AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Werror=implicit
															
 
																+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
															
 
																 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
															
 
																 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include -I$(top_srcdir)/mpi/include -I$(top_srcdir)/src -I$(top_builddir)/src -I$(top_srcdir)/examples/
															
 
																 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS)
															
--- a/mpi/tests/mpi_detached_tag.c
+++ b/mpi/tests/mpi_detached_tag.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -54,8 +54,8 @@ int main(int argc, char **argv)
 
																 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
															
 
																-	unsigned nloops = NITER;
															
 
																-	unsigned loop;
															
 
																+	int nloops = NITER;
															
 
																+	int loop;
															
 
																 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
															
 
																 	for (loop = 0; loop < nloops; loop++)
															
--- a/mpi/tests/mpi_irecv.c
+++ b/mpi/tests/mpi_irecv.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -54,8 +54,8 @@ int main(int argc, char **argv)
 
																 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
															
 
																-	unsigned nloops = NITER;
															
 
																-	unsigned loop;
															
 
																+	int nloops = NITER;
															
 
																+	int loop;
															
 
																 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
															
 
																 	for (loop = 0; loop < nloops; loop++)
															
--- a/mpi/tests/mpi_irecv_detached.c
+++ b/mpi/tests/mpi_irecv_detached.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010, 2012  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -69,8 +69,8 @@ int main(int argc, char **argv)
 
																 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
															
 
																-	unsigned nloops = NITER;
															
 
																-	unsigned loop;
															
 
																+	int nloops = NITER;
															
 
																+	int loop;
															
 
																 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
															
 
																 	for (loop = 0; loop < nloops; loop++)
															
--- a/mpi/tests/mpi_isend.c
+++ b/mpi/tests/mpi_isend.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -54,8 +54,8 @@ int main(int argc, char **argv)
 
																 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
															
 
																-	unsigned nloops = NITER;
															
 
																-	unsigned loop;
															
 
																+	int nloops = NITER;
															
 
																+	int loop;
															
 
																 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
															
 
																 	for (loop = 0; loop < nloops; loop++)
															
--- a/mpi/tests/mpi_isend_detached.c
+++ b/mpi/tests/mpi_isend_detached.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010, 2012  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -68,8 +68,8 @@ int main(int argc, char **argv)
 
																 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
															
 
																-	unsigned nloops = NITER;
															
 
																-	unsigned loop;
															
 
																+	int nloops = NITER;
															
 
																+	int loop;
															
 
																 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
															
 
																 	for (loop = 0; loop < nloops; loop++)
															
--- a/mpi/tests/mpi_probe.c
+++ b/mpi/tests/mpi_probe.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010, 2012  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -69,8 +69,8 @@ int main(int argc, char **argv)
 
																 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
															
 
																-	unsigned nloops = NITER;
															
 
																-	unsigned loop;
															
 
																+	int nloops = NITER;
															
 
																+	int loop;
															
 
																 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
															
 
																 	for (loop = 0; loop < nloops; loop++)
															
--- a/mpi/tests/mpi_test.c
+++ b/mpi/tests/mpi_test.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -55,8 +55,8 @@ int main(int argc, char **argv)
 
																 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
															
 
																-	unsigned nloops = NITER;
															
 
																-	unsigned loop;
															
 
																+	int nloops = NITER;
															
 
																+	int loop;
															
 
																 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
															
 
																 	for (loop = 0; loop < nloops; loop++)
															
--- a/mpi/tests/pingpong.c
+++ b/mpi/tests/pingpong.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -55,8 +55,8 @@ int main(int argc, char **argv)
 
																 	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
															
 
																-	unsigned nloops = NITER;
															
 
																-	unsigned loop;
															
 
																+	int nloops = NITER;
															
 
																+	int loop;
															
 
																 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
															
 
																 	for (loop = 0; loop < nloops; loop++)
															
--- a/mpi/tests/ring.c
+++ b/mpi/tests/ring.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -24,7 +24,7 @@
 
																 #  define NITER	2048
															
 
																 #endif
															
 
																-unsigned token = 42;
															
 
																+int token = 42;
															
 
																 starpu_data_handle_t token_handle;
															
 
																 #ifdef STARPU_USE_CUDA
															
@@ -33,7 +33,7 @@ extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args);
 
																 void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
															
 
																 {
															
 
																-	unsigned *tokenptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
															
 
																+	int *tokenptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
															
 
																 	(*tokenptr)++;
															
 
																 }
															
@@ -82,13 +82,13 @@ int main(int argc, char **argv)
 
																 	ret = starpu_mpi_init(NULL, NULL, 0);
															
 
																 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
															
 
																-	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
															
 
																+	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(token));
															
 
																-	unsigned nloops = NITER;
															
 
																-	unsigned loop;
															
 
																+	int nloops = NITER;
															
 
																+	int loop;
															
 
																-	unsigned last_loop = nloops - 1;
															
 
																-	unsigned last_rank = size - 1;
															
 
																+	int last_loop = nloops - 1;
															
 
																+	int last_rank = size - 1;
															
 
																 	for (loop = 0; loop < nloops; loop++)
															
 
																 	{
															
--- a/mpi/tests/ring_async.c
+++ b/mpi/tests/ring_async.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -24,7 +24,7 @@
 
																 #  define NITER	2048
															
 
																 #endif
															
 
																-unsigned token = 42;
															
 
																+int token = 42;
															
 
																 starpu_data_handle_t token_handle;
															
 
																 #ifdef STARPU_USE_CUDA
															
@@ -33,7 +33,7 @@ extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args);
 
																 void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
															
 
																 {
															
 
																-	unsigned *tokenptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
															
 
																+	int *tokenptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
															
 
																 	(*tokenptr)++;
															
 
																 }
															
@@ -82,13 +82,13 @@ int main(int argc, char **argv)
 
																 	ret = starpu_mpi_init(NULL, NULL, 0);
															
 
																 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
															
 
																-	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
															
 
																+	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(token));
															
 
																-	unsigned nloops = NITER;
															
 
																-	unsigned loop;
															
 
																+	int nloops = NITER;
															
 
																+	int loop;
															
 
																-	unsigned last_loop = nloops - 1;
															
 
																-	unsigned last_rank = size - 1;
															
 
																+	int last_loop = nloops - 1;
															
 
																+	int last_rank = size - 1;
															
 
																 	for (loop = 0; loop < nloops; loop++)
															
 
																 	{
															
--- a/mpi/tests/ring_async_implicit.c
+++ b/mpi/tests/ring_async_implicit.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -24,7 +24,7 @@
 
																 #  define NITER	2048
															
 
																 #endif
															
 
																-unsigned token = 42;
															
 
																+int token = 42;
															
 
																 starpu_data_handle_t token_handle;
															
 
																 #ifdef STARPU_USE_CUDA
															
@@ -33,7 +33,7 @@ extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args);
 
																 void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
															
 
																 {
															
 
																-	unsigned *tokenptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
															
 
																+	int *tokenptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
															
 
																 	(*tokenptr)++;
															
 
																 }
															
@@ -80,13 +80,13 @@ int main(int argc, char **argv)
 
																 	}
															
 
																-	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
															
 
																+	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(token));
															
 
																-	unsigned nloops = NITER;
															
 
																-	unsigned loop;
															
 
																+	int nloops = NITER;
															
 
																+	int loop;
															
 
																-	unsigned last_loop = nloops - 1;
															
 
																-	unsigned last_rank = size - 1;
															
 
																+	int last_loop = nloops - 1;
															
 
																+	int last_rank = size - 1;
															
 
																 	for (loop = 0; loop < nloops; loop++)
															
 
																 	{
															
--- a/sched_ctx_hypervisor/examples/Makefile.am
+++ b/sched_ctx_hypervisor/examples/Makefile.am
@@ -13,9 +13,9 @@
 
																 #
															
 
																 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																-AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS)
															
 
																+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS)
															
 
																 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(top_builddir)/sched_ctx_hypervisor/src/libsched_ctx_hypervisor.la
															
 
																-AM_CPPFLAGS = -I$(top_srcdir)/include -I$(top_srcdir)/examples -I$(top_builddir)/include -I$(top_srcdir)/sched_ctx_hypervisor/include
															
 
																+AM_CPPFLAGS = -I$(top_srcdir)/include -I$(top_srcdir)/examples -I$(top_builddir)/include -I$(top_srcdir)/sched_ctx_hypervisor/include -I$(top_srcdir)/sched_ctx_hypervisor/examples
															
 
																 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_GLPK_LDFLAGS)
															
 
																 if !NO_BLAS_LIB
															
--- a/sched_ctx_hypervisor/examples/app_driven_test/app_driven_test.c
+++ b/sched_ctx_hypervisor/examples/app_driven_test/app_driven_test.c
@@ -48,7 +48,7 @@ int tag = 1;
 
																 void* start_thread(void *arg)
															
 
																 {
															
 
																 	unsigned sched_ctx = *((unsigned*)arg);
															
 
																-	starpu_task_set_context(&sched_ctx);
															
 
																+	starpu_sched_ctx_set_context(&sched_ctx);
															
 
																 	struct starpu_task *task[10];
															
 
																 	struct params params[10];
															
@@ -115,8 +115,8 @@ int main()
 
																 	policy.name = "app_driven";
															
 
																 	void *perf_counters = sched_ctx_hypervisor_init(&policy);
															
 
																-	starpu_set_perf_counters(sched_ctx1, (struct starpu_performance_counters*)perf_counters);
															
 
																-	starpu_set_perf_counters(sched_ctx2, (struct starpu_performance_counters*)perf_counters);
															
 
																+	starpu_sched_ctx_set_perf_counters(sched_ctx1, (struct starpu_sched_ctx_performance_counters*)perf_counters);
															
 
																+	starpu_sched_ctx_set_perf_counters(sched_ctx2, (struct starpu_sched_ctx_performance_counters*)perf_counters);
															
 
																 	sched_ctx_hypervisor_register_ctx(sched_ctx1, 0.0);
															
 
																 	sched_ctx_hypervisor_register_ctx(sched_ctx2, 0.0);
															
--- a/sched_ctx_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.c
+++ b/sched_ctx_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.c
@@ -97,7 +97,7 @@ void* start_bench(void *val)
 
																 	pthread_setspecific(key, &p->id);
															
 
																 	if(p->ctx != 0)
															
 
																-		starpu_task_set_context(&p->ctx);
															
 
																+		starpu_sched_ctx_set_context(&p->ctx);
															
 
																 	for(i = 0; i < NSAMPLES; i++)
															
 
																 		p->bench(p->mat[i], p->size, p->nblocks);
															
@@ -241,7 +241,7 @@ void construct_contexts(void (*bench)(float*, unsigned, unsigned))
 
																 	struct sched_ctx_hypervisor_policy policy;
															
 
																 	policy.custom = 0;
															
 
																 	policy.name = "idle";
															
 
																-	struct starpu_performance_counters *perf_counters = sched_ctx_hypervisor_init(&policy);
															
 
																+	struct starpu_sched_ctx_performance_counters *perf_counters = sched_ctx_hypervisor_init(&policy);
															
 
																 	int nworkers1 = cpu1 + gpu + gpu1;
															
 
																 	int nworkers2 = cpu2 + gpu + gpu2;
															
 
																 	unsigned n_all_gpus = gpu + gpu1 + gpu2;
															
@@ -267,7 +267,7 @@ void construct_contexts(void (*bench)(float*, unsigned, unsigned))
 
																 		p1.workers[i] = i;
															
 
																 	p1.ctx = starpu_sched_ctx_create("heft", p1.workers, nworkers1, "sched_ctx1");
															
 
																-	starpu_set_perf_counters(p1.ctx, perf_counters);
															
 
																+	starpu_sched_ctx_set_perf_counters(p1.ctx, perf_counters);
															
 
																 	p2.the_other_ctx = (int)p1.ctx;
															
 
																 	p1.nworkers = nworkers1;
															
 
																 	sched_ctx_hypervisor_register_ctx(p1.ctx, 0.0);
															
@@ -303,7 +303,7 @@ void construct_contexts(void (*bench)(float*, unsigned, unsigned))
 
																 	/* 	p2.workers[k++] = i; */
															
 
																 	p2.ctx = starpu_sched_ctx_create("heft", p2.workers, 0, "sched_ctx2");
															
 
																-	starpu_set_perf_counters(p2.ctx, perf_counters);
															
 
																+	starpu_sched_ctx_set_perf_counters(p2.ctx, perf_counters);
															
 
																 	p1.the_other_ctx = (int)p2.ctx;
															
 
																 	p2.nworkers = 0;
															
 
																 	sched_ctx_hypervisor_register_ctx(p2.ctx, 0.0);
															
--- a/sched_ctx_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.h
+++ b/sched_ctx_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.h
@@ -30,3 +30,4 @@ void end_contexts(void);
 
																 void start_2benchs(void (*bench)(float *mat, unsigned size, unsigned nblocks));
															
 
																 void start_1stbench(void (*bench)(float *mat, unsigned size, unsigned nblocks));
															
 
																 void start_2ndbench(void (*bench)(float *mat, unsigned size, unsigned nblocks));
															
 
																+void set_hypervisor_conf(int event, int task_tag);
															
--- a/sched_ctx_hypervisor/include/sched_ctx_hypervisor.h
+++ b/sched_ctx_hypervisor/include/sched_ctx_hypervisor.h
@@ -127,6 +127,12 @@ struct sched_ctx_hypervisor_wrapper
 
																 	/* number of flops executed since last resizing */
															
 
																 	double elapsed_flops[STARPU_NMAXWORKERS];
															
 
																+	/* data quantity executed on each worker in this ctx */
															
 
																+	size_t elapsed_data[STARPU_NMAXWORKERS];
															
 
																+
															
 
																+	/* nr of tasks executed on each worker in this ctx */
															
 
																+	int elapsed_tasks[STARPU_NMAXWORKERS];
															
 
																+
															
 
																 	/* the average speed of workers when they belonged to this context */
															
 
																 	double ref_velocity[STARPU_NMAXWORKERS];
															
@@ -168,7 +174,7 @@ struct sched_ctx_hypervisor_policy
 
																 	void (*end_ctx)(unsigned sched_ctx);
															
 
																 };
															
 
																-struct starpu_performance_counters *sched_ctx_hypervisor_init(struct sched_ctx_hypervisor_policy *policy);
															
 
																+struct starpu_sched_ctx_performance_counters *sched_ctx_hypervisor_init(struct sched_ctx_hypervisor_policy *policy);
															
 
																 void sched_ctx_hypervisor_shutdown(void);
															
--- a/sched_ctx_hypervisor/src/Makefile.am
+++ b/sched_ctx_hypervisor/src/Makefile.am
@@ -12,11 +12,9 @@
 
																 #
															
 
																 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																-AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
															
 
																+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS)
															
 
																 LIBS = $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la
															
 
																-
															
 
																-AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include/starpu/$(STARPU_EFFECTIVE_VERSION)/ -I$(top_builddir)/src/ -I$(top_srcdir)/src/ -I$(top_srcdir)/sched_ctx_hypervisor/include/
															
 
																-
															
 
																+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/src/ -I$(top_builddir)/src/ -I$(top_builddir)/include -I$(top_srcdir)/sched_ctx_hypervisor/include/ -I$(top_srcdir)/sched_ctx_hypervisor/src
															
 
																 AM_LDFLAGS = $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS)
															
 
																 lib_LTLIBRARIES = libsched_ctx_hypervisor.la
															
--- a/sched_ctx_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c
+++ b/sched_ctx_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c
@@ -32,10 +32,12 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 
																 	int *sched_ctxs = in_sched_ctxs == NULL ? sched_ctx_hypervisor_get_sched_ctxs() : in_sched_ctxs;
															
 
																 	int w,s;
															
 
																-	struct sched_ctx_hypervisor_wrapper* sc_w = NULL;
															
 
																+	struct sched_ctx_hypervisor_wrapper* sc_w = NULL;
															
 
																+	double total_flops = 0.0;
															
 
																 	for(s = 0; s < ns; s++)
															
 
																 	{
															
 
																+		sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[s]);
															
 
																 		for(w = 0; w < nw; w++)
															
 
																 		{
															
 
																 			w_in_s[s][w] = 0.0;
															
@@ -44,7 +46,6 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 
																 			draft_flops_on_w[s][w] = 0.0;
															
 
																 			int worker = workers == NULL ? w : workers[w];
															
 
																-			sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[s]);
															
 
																 			velocity[s][w] = _get_velocity_per_worker(sc_w, worker);
															
 
																 			if(velocity[s][w] == -1.0)
															
 
																 			{
															
@@ -53,21 +54,20 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 
																 				if(velocity[s][w] == -1.0)
															
 
																 					velocity[s][w] = sc_w->ref_velocity[worker];
															
 
																 				if(velocity[s][w] == -1.0)
															
 
																-					velocity[s][w] = arch == STARPU_CPU_WORKER ? 5.0 : 150.0;
															
 
																+					velocity[s][w] = arch == STARPU_CPU_WORKER ? 5.0 : 100.0;
															
 
																 			}
															
 
																-			printf("v[w%d][s%d] = %lf\n",w, s, velocity[s][w]);
															
 
																+//			printf("v[w%d][s%d] = %lf\n",w, s, velocity[s][w]);
															
 
																 		}
															
 
																 		struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sched_ctxs[s]);
															
 
																 		flops[s] = config->ispeed_ctx_sample/1000000000; /* in gflops */
															
 
																 	}
															
 
																-
															
 
																-
															
 
																+	
															
 
																 	/* take the exec time of the slowest ctx 
															
 
																 	   as starting point and then try to minimize it
															
 
																 	   as increasing it a little for the faster ctxs */
															
 
																 	double tmax = _get_slowest_ctx_exec_time();
															
 
																-	double smallest_tmax = _get_fastest_ctx_exec_time(); //tmax - 0.5*tmax;
															
 
																+ 	double smallest_tmax = _get_fastest_ctx_exec_time(); //tmax - 0.5*tmax; 
															
 
																 //	printf("tmax %lf smallest %lf\n", tmax, smallest_tmax);
															
 
																 	double res = 1.0;
															
@@ -413,8 +413,8 @@ static void ispeed_lp_end_ctx(unsigned sched_ctx)
 
																 {
															
 
																 	struct sched_ctx_hypervisor_wrapper* sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctx);
															
 
																 	int worker;
															
 
																-	for(worker = 0; worker < 12; worker++)
															
 
																-		printf("%d/%d: speed %lf\n", worker, sched_ctx, sc_w->ref_velocity[worker]);
															
 
																+/* 	for(worker = 0; worker < 12; worker++) */
															
 
																+/* 		printf("%d/%d: speed %lf\n", worker, sched_ctx, sc_w->ref_velocity[worker]); */
															
 
																 	return;
															
 
																 }
															
--- a/sched_ctx_hypervisor/src/hypervisor_policies/ispeed_policy.c
+++ b/sched_ctx_hypervisor/src/hypervisor_policies/ispeed_policy.c
@@ -77,7 +77,7 @@ static int* _get_slowest_workers(unsigned sched_ctx, int *nworkers, enum starpu_
 
																 	int worker;
															
 
																 	int considered = 0;
															
 
																-	struct starpu_iterator it;
															
 
																+	struct starpu_sched_ctx_iterator it;
															
 
																 	if(workers->init_iterator)
															
 
																 		workers->init_iterator(workers, &it);
															
--- a/sched_ctx_hypervisor/src/hypervisor_policies/lp_tools.c
+++ b/sched_ctx_hypervisor/src/hypervisor_policies/lp_tools.c
@@ -283,6 +283,12 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 
																 		{
															
 
																 			tmp_nw_move[w] = 0;
															
 
																 			tmp_nw_add[w] = 0;
															
 
																+			int i;
															
 
																+			for(i = 0; i < STARPU_NMAXWORKERS; i++)
															
 
																+			{
															
 
																+				tmp_workers_move[w][i] = -1;
															
 
																+				tmp_workers_add[w][i] = -1;
															
 
																+			}
															
 
																 		}
															
 
																 		/* find workers that ctx s has to give away */
															
@@ -363,6 +369,7 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 
																 				int nw_add = 0;
															
 
																 				int w;
															
 
																+				int j = 0, k = 0;
															
 
																 				for(w = 0; w < nw; w++)
															
 
																 				{
															
 
																 					enum starpu_archtype arch = STARPU_ANY_WORKER;
															
@@ -375,7 +382,7 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 
																 					if( nw_needed > 0 && tmp_nw_move[w] > 0)
															
 
																 					{
															
 
																 						nw_move += nw_needed >= tmp_nw_move[w] ? tmp_nw_move[w] : nw_needed;
															
 
																-						int i = 0, j = 0;
															
 
																+						int i = 0;
															
 
																 						for(i = 0; i < STARPU_NMAXWORKERS; i++)
															
 
																 						{
															
 
																 							if(tmp_workers_move[w][i] != -1)
															
@@ -395,14 +402,14 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 
																 					if(diff > 0.3 && tmp_nw_add[w] != 0)
															
 
																 					{
															
 
																 						nw_add = tmp_nw_add[w];
															
 
																-						int i = 0, j = 0;
															
 
																+						int i = 0;
															
 
																 						for(i = 0; i < STARPU_NMAXWORKERS; i++)
															
 
																 						{
															
 
																 							if(tmp_workers_add[w][i] != -1)
															
 
																 							{
															
 
																-								workers_add[j++] = tmp_workers_add[w][i];
															
 
																+								workers_add[k++] = tmp_workers_add[w][i];
															
 
																 								tmp_workers_add[w][i] = -1;
															
 
																-								if(j == nw_add)
															
 
																+								if(k == nw_add)
															
 
																 									break;
															
 
																 							}
															
 
																 						}
															
@@ -413,7 +420,7 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 
																 				if(nw_move > 0)
															
 
																 				{
															
 
																-					sched_ctx_hypervisor_move_workers(sched_ctxs[s], sched_ctxs[s2], workers_move, nw_move, 1);
															
 
																+					sched_ctx_hypervisor_move_workers(sched_ctxs[s], sched_ctxs[s2], workers_move, nw_move, 0);
															
 
																 					nw_move = 0;
															
 
																 				}
															
@@ -452,7 +459,7 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 
																 		}
															
 
																 		if(nw_move > 0)
															
 
																-			sched_ctx_hypervisor_remove_workers_from_sched_ctx(workers_move, nw_move, sched_ctxs[s], 1);
															
 
																+			sched_ctx_hypervisor_remove_workers_from_sched_ctx(workers_move, nw_move, sched_ctxs[s], 0);
															
 
																 	}
															
 
																 }
															
--- a/sched_ctx_hypervisor/src/hypervisor_policies/policy_tools.c
+++ b/sched_ctx_hypervisor/src/hypervisor_policies/policy_tools.c
@@ -28,7 +28,7 @@ static int _compute_priority(unsigned sched_ctx)
 
																 	struct starpu_sched_ctx_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx);
															
 
																 	int worker;
															
 
																-	struct starpu_iterator it;
															
 
																+	struct starpu_sched_ctx_iterator it;
															
 
																 	if(workers->init_iterator)
															
 
																 		workers->init_iterator(workers, &it);
															
@@ -113,7 +113,7 @@ int* _get_first_workers(unsigned sched_ctx, int *nworkers, enum starpu_archtype
 
																 	int worker;
															
 
																 	int considered = 0;
															
 
																-	struct starpu_iterator it;
															
 
																+	struct starpu_sched_ctx_iterator it;
															
 
																 	if(workers->init_iterator)
															
 
																 		workers->init_iterator(workers, &it);
															
@@ -169,7 +169,6 @@ int* _get_first_workers(unsigned sched_ctx, int *nworkers, enum starpu_archtype
 
																 		}
															
 
																 	}
															
 
																-
															
 
																 	return curr_workers;
															
 
																 }
															
@@ -181,7 +180,7 @@ unsigned _get_potential_nworkers(struct sched_ctx_hypervisor_policy_config *conf
 
																 	unsigned potential_workers = 0;
															
 
																 	int worker;
															
 
																-	struct starpu_iterator it;
															
 
																+	struct starpu_sched_ctx_iterator it;
															
 
																 	if(workers->init_iterator)
															
 
																 		workers->init_iterator(workers, &it);
															
 
																 	while(workers->has_next(workers, &it))
															
@@ -304,7 +303,7 @@ static double _get_best_elapsed_flops(struct sched_ctx_hypervisor_wrapper* sc_w,
 
																 	struct starpu_sched_ctx_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sc_w->sched_ctx);
															
 
																         int worker;
															
 
																-	struct starpu_iterator it;
															
 
																+	struct starpu_sched_ctx_iterator it;
															
 
																 	if(workers->init_iterator)
															
 
																                 workers->init_iterator(workers, &it);
															
@@ -330,7 +329,7 @@ static double _get_ispeed_sample_for_type_of_worker(struct sched_ctx_hypervisor_
 
																 	double avg = 0.0;
															
 
																 	int n = 0;
															
 
																-	struct starpu_iterator it;
															
 
																+	struct starpu_sched_ctx_iterator it;
															
 
																 	if(workers->init_iterator)
															
 
																                 workers->init_iterator(workers, &it);
															
@@ -356,7 +355,7 @@ static double _get_ispeed_sample_for_sched_ctx(unsigned sched_ctx)
 
																 	int worker;
															
 
																 	double ispeed_sample = 0.0;
															
 
																-	struct starpu_iterator it;
															
 
																+	struct starpu_sched_ctx_iterator it;
															
 
																 	if(workers->init_iterator)
															
 
																                 workers->init_iterator(workers, &it);
															
@@ -401,7 +400,7 @@ double _get_slowest_ctx_exec_time(void)
 
																 	int *sched_ctxs = sched_ctx_hypervisor_get_sched_ctxs();
															
 
																 	int nsched_ctxs = sched_ctx_hypervisor_get_nsched_ctxs();
															
 
																-	double curr_time = starpu_timing_now();
															
 
																+/* 	double curr_time = starpu_timing_now(); */
															
 
																 	double slowest_time = 0.0;
															
 
																 	int s;
															
@@ -410,18 +409,13 @@ double _get_slowest_ctx_exec_time(void)
 
																 	{
															
 
																 		sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[s]);
															
 
																-/*                 double elapsed_time = curr_time - sc_w->start_time; */
															
 
																-/* 		if(elapsed_time > slowest_time) */
															
 
																-/* 			slowest_time = elapsed_time; */
															
 
																-
															
 
																-//		double elapsed_flops = sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(sc_w);
															
 
																+//		double elapsed_time  = (curr_time - sc_w->start_time)/1000000;
															
 
																 		struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sc_w->sched_ctx);
															
 
																-                double elapsed_time = (config->ispeed_ctx_sample/1000000000.0)/_get_ctx_velocity(sc_w);
															
 
																+		double elapsed_time = (config->ispeed_ctx_sample/1000000000.0)/_get_ctx_velocity(sc_w);
															
 
																 		if(elapsed_time > slowest_time)
															
 
																 			slowest_time = elapsed_time;
															
 
																         }
															
 
																-//	return slowest_time / 1000000.0;
															
 
																 	return slowest_time;
															
 
																 }
															
@@ -431,7 +425,7 @@ double _get_fastest_ctx_exec_time(void)
 
																 	int nsched_ctxs = sched_ctx_hypervisor_get_nsched_ctxs();
															
 
																 	double curr_time = starpu_timing_now();
															
 
																-	double fastest_time = curr_time;
															
 
																+ 	double fastest_time = curr_time;
															
 
																 	int s;
															
 
																 	struct sched_ctx_hypervisor_wrapper* sc_w;		
															
@@ -440,13 +434,13 @@ double _get_fastest_ctx_exec_time(void)
 
																 		sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[s]);
															
 
																 		struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sc_w->sched_ctx);
															
 
																-                double elapsed_time = (config->ispeed_ctx_sample/1000000000.0)/_get_ctx_velocity(sc_w);
															
 
																-
															
 
																+		double elapsed_time = (config->ispeed_ctx_sample/1000000000.0)/_get_ctx_velocity(sc_w);
															
 
																+		
															
 
																 		if(elapsed_time < fastest_time)
															
 
																 			fastest_time = elapsed_time;
															
 
																         }
															
 
																-//	return fastest_time / 1000000.0;
															
 
																+
															
 
																 	return fastest_time;
															
 
																 }
															
@@ -457,6 +451,8 @@ double _get_velocity_per_worker(struct sched_ctx_hypervisor_wrapper *sc_w, unsig
 
																 		return -1.0;
															
 
																         double elapsed_flops = sc_w->elapsed_flops[worker] / 1000000000.0; /*in gflops */
															
 
																+	size_t elapsed_data_used = sc_w->elapsed_data[worker];
															
 
																+	int elapsed_tasks = sc_w->elapsed_tasks[worker];
															
 
																 	struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sc_w->sched_ctx);
															
 
																 	double sample = config->ispeed_w_sample[worker] / 1000000000.0; /*in gflops */
															
@@ -479,6 +475,17 @@ double _get_velocity_per_worker(struct sched_ctx_hypervisor_wrapper *sc_w, unsig
 
																         {
															
 
																                 double curr_time = starpu_timing_now();
															
 
																                 double elapsed_time = (curr_time - sc_w->start_time) / 1000000.0; /* in seconds */
															
 
																+ 		enum starpu_archtype arch = starpu_worker_get_type(worker);
															
 
																+		if(arch == STARPU_CUDA_WORKER)
															
 
																+		{
															
 
																+			double transfer_velocity = starpu_get_bandwidth_RAM_CUDA(worker);
															
 
																+			elapsed_time +=  (elapsed_data_used / transfer_velocity) / 1000000 ;
															
 
																+			double latency = starpu_get_latency_RAM_CUDA(worker);
															
 
																+//			printf("%d/%d: latency %lf elapsed_time before %lf ntasks %d\n", worker, sc_w->sched_ctx, latency, elapsed_time, elapsed_tasks);
															
 
																+			elapsed_time += (elapsed_tasks * latency)/1000000;
															
 
																+//			printf("elapsed time after %lf \n", elapsed_time);
															
 
																+		}
															
 
																+			
															
 
																                 double vel  = (elapsed_flops/elapsed_time);/* in Gflops/s */
															
 
																 		sc_w->ref_velocity[worker] = sc_w->ref_velocity[worker] > 0.0 ? (sc_w->ref_velocity[worker] + vel) / 2 : vel; 
															
 
																                 return vel;
															
--- a/sched_ctx_hypervisor/src/sched_ctx_hypervisor.c
+++ b/sched_ctx_hypervisor/src/sched_ctx_hypervisor.c
@@ -19,11 +19,11 @@
 
																 #include <starpu_config.h>
															
 
																 unsigned imposed_resize = 0;
															
 
																-struct starpu_performance_counters* perf_counters = NULL;
															
 
																+struct starpu_sched_ctx_performance_counters* perf_counters = NULL;
															
 
																 static void notify_idle_cycle(unsigned sched_ctx, int worker, double idle_time);
															
 
																 static void notify_pushed_task(unsigned sched_ctx, int worker);
															
 
																-static void notify_poped_task(unsigned sched_ctx, int worker, double flops);
															
 
																+static void notify_poped_task(unsigned sched_ctx, int worker, double flops, size_t data_size);
															
 
																 static void notify_post_exec_hook(unsigned sched_ctx, int taskid);
															
 
																 static void notify_idle_end(unsigned sched_ctx, int  worker);
															
 
																 static void notify_submitted_job(struct starpu_task *task, unsigned footprint);
															
@@ -125,7 +125,7 @@ static struct sched_ctx_hypervisor_policy *_select_hypervisor_policy(struct sche
 
																 /* initializez the performance counters that starpu will use to retrive hints for resizing */
															
 
																-struct starpu_performance_counters* sched_ctx_hypervisor_init(struct sched_ctx_hypervisor_policy *hypervisor_policy)
															
 
																+struct starpu_sched_ctx_performance_counters* sched_ctx_hypervisor_init(struct sched_ctx_hypervisor_policy *hypervisor_policy)
															
 
																 {
															
 
																 	hypervisor.min_tasks = 0;
															
 
																 	hypervisor.nsched_ctxs = 0;
															
@@ -158,6 +158,8 @@ struct starpu_performance_counters* sched_ctx_hypervisor_init(struct sched_ctx_h
 
																 			hypervisor.sched_ctx_w[i].pushed_tasks[j] = 0;
															
 
																 			hypervisor.sched_ctx_w[i].poped_tasks[j] = 0;
															
 
																 			hypervisor.sched_ctx_w[i].elapsed_flops[j] = 0.0;
															
 
																+			hypervisor.sched_ctx_w[i].elapsed_data[j] = 0;
															
 
																+			hypervisor.sched_ctx_w[i].elapsed_tasks[j] = 0;
															
 
																 			hypervisor.sched_ctx_w[i].total_elapsed_flops[j] = 0.0;
															
 
																 			hypervisor.sched_ctx_w[i].worker_to_be_removed[j] = 0;
															
 
																 			hypervisor.sched_ctx_w[i].ref_velocity[j] = -1.0;
															
@@ -167,7 +169,7 @@ struct starpu_performance_counters* sched_ctx_hypervisor_init(struct sched_ctx_h
 
																 	struct sched_ctx_hypervisor_policy *selected_hypervisor_policy = _select_hypervisor_policy(hypervisor_policy);
															
 
																 	_load_hypervisor_policy(selected_hypervisor_policy);
															
 
																-	perf_counters = (struct starpu_performance_counters*)malloc(sizeof(struct starpu_performance_counters));
															
 
																+	perf_counters = (struct starpu_sched_ctx_performance_counters*)malloc(sizeof(struct starpu_sched_ctx_performance_counters));
															
 
																 	perf_counters->notify_idle_cycle = notify_idle_cycle;
															
 
																 	perf_counters->notify_pushed_task = notify_pushed_task;
															
 
																 	perf_counters->notify_poped_task = notify_poped_task;
															
@@ -175,7 +177,7 @@ struct starpu_performance_counters* sched_ctx_hypervisor_init(struct sched_ctx_h
 
																 	perf_counters->notify_idle_end = notify_idle_end;
															
 
																 	perf_counters->notify_submitted_job = notify_submitted_job;
															
 
																-	starpu_notify_hypervisor_exists();
															
 
																+	starpu_sched_ctx_notify_hypervisor_exists();
															
 
																 	return perf_counters;
															
 
																 }
															
@@ -346,7 +348,7 @@ int sched_ctx_hypervisor_get_nworkers_ctx(unsigned sched_ctx, enum starpu_archty
 
																 	struct starpu_sched_ctx_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx);
															
 
																 	int worker;
															
 
																-	struct starpu_iterator it;
															
 
																+	struct starpu_sched_ctx_iterator it;
															
 
																 	if(workers->init_iterator)
															
 
																 		workers->init_iterator(workers, &it);
															
@@ -364,7 +366,14 @@ static void _set_elapsed_flops_per_sched_ctx(unsigned sched_ctx, double val)
 
																 {
															
 
																 	int i;
															
 
																 	for(i = 0; i < STARPU_NMAXWORKERS; i++)
															
 
																+	{
															
 
																 		hypervisor.sched_ctx_w[sched_ctx].elapsed_flops[i] = val;
															
 
																+		if(val == 0)
															
 
																+		{
															
 
																+			hypervisor.sched_ctx_w[sched_ctx].elapsed_data[i] = 0;
															
 
																+			hypervisor.sched_ctx_w[sched_ctx].elapsed_tasks[i] = 0;
															
 
																+		}
															
 
																+	}
															
 
																 }
															
 
																 double sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(struct sched_ctx_hypervisor_wrapper* sc_w)
															
@@ -396,7 +405,7 @@ void _reset_resize_sample_info(unsigned sender_sched_ctx, unsigned receiver_sche
 
																 	sender_sc_w->start_time = start_time;
															
 
																 	sender_sc_w->remaining_flops = sender_sc_w->remaining_flops - sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(sender_sc_w);
															
 
																 	_set_elapsed_flops_per_sched_ctx(sender_sched_ctx, 0.0);
															
 
																-	
															
 
																+
															
 
																 	receiver_sc_w->start_time = start_time;
															
 
																 	receiver_sc_w->remaining_flops = receiver_sc_w->remaining_flops - sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(receiver_sc_w);
															
 
																 	_set_elapsed_flops_per_sched_ctx(receiver_sched_ctx, 0.0);
															
@@ -410,19 +419,11 @@ void sched_ctx_hypervisor_move_workers(unsigned sender_sched_ctx, unsigned recei
 
																 	{
															
 
																 		_print_current_time();
															
 
																 		int j;
															
 
																-		printf("resize ctx %d with", sender_sched_ctx);
															
 
																+		printf("resize ctx %d with %d workers", sender_sched_ctx, nworkers_to_move);
															
 
																 		for(j = 0; j < nworkers_to_move; j++)
															
 
																 			printf(" %d", workers_to_move[j]);
															
 
																 		printf("\n");
															
 
																-/* 		int *cpus = (int*) malloc(nworkers_to_move * sizeof(int)); */
															
 
																-/* 		int ncpus; */
															
 
																-
															
 
																-/* 		_get_cpus(workers_to_move, nworkers_to_move, cpus, &ncpus); */
															
 
																-
															
 
																-/* //		if(ncpus != 0) */
															
 
																-/* //			starpu_sched_ctx_remove_workers(cpus, ncpus, sender_sched_ctx); */
															
 
																-
															
 
																 		starpu_sched_ctx_add_workers(workers_to_move, nworkers_to_move, receiver_sched_ctx);
															
 
																 		if(now)
															
@@ -622,11 +623,11 @@ static unsigned _ack_resize_completed(unsigned sched_ctx, int worker)
 
																 			   whatever the application says */
															
 
																 			if(!((hypervisor.resize[sender_sched_ctx] == 0 || hypervisor.resize[receiver_sched_ctx] == 0) && imposed_resize))
															
 
																 			{
															
 
																-				int j;
															
 
																-				printf("remove after ack from ctx %d:", sender_sched_ctx);
															
 
																-				for(j = 0; j < nmoved_workers; j++)
															
 
																-					printf(" %d", moved_workers[j]);
															
 
																-				printf("\n");
															
 
																+/* 				int j; */
															
 
																+/* 				printf("remove after ack from ctx %d:", sender_sched_ctx); */
															
 
																+/* 				for(j = 0; j < nmoved_workers; j++) */
															
 
																+/* 					printf(" %d", moved_workers[j]); */
															
 
																+/* 				printf("\n"); */
															
 
																 				starpu_sched_ctx_remove_workers(moved_workers, nmoved_workers, sender_sched_ctx);
															
@@ -715,10 +716,12 @@ static void notify_pushed_task(unsigned sched_ctx, int worker)
 
																 }
															
 
																 /* notifies the hypervisor that a task was poped from the queue of the worker */
															
 
																-static void notify_poped_task(unsigned sched_ctx, int worker, double elapsed_flops)
															
 
																+static void notify_poped_task(unsigned sched_ctx, int worker, double elapsed_flops, size_t data_size)
															
 
																 {
															
 
																 	hypervisor.sched_ctx_w[sched_ctx].poped_tasks[worker]++;
															
 
																 	hypervisor.sched_ctx_w[sched_ctx].elapsed_flops[worker] += elapsed_flops;
															
 
																+	hypervisor.sched_ctx_w[sched_ctx].elapsed_data[worker] += data_size ;
															
 
																+	hypervisor.sched_ctx_w[sched_ctx].elapsed_tasks[worker]++ ;
															
 
																 	hypervisor.sched_ctx_w[sched_ctx].total_elapsed_flops[worker] += elapsed_flops;
															
 
																 	hypervisor.sched_ctx_w[sched_ctx].remaining_flops -= elapsed_flops; //sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(&hypervisor.sched_ctx_w[sched_ctx]);
															
--- a/socl/examples/Makefile.am
+++ b/socl/examples/Makefile.am
@@ -14,7 +14,7 @@
 
																 #
															
 
																 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																-AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
															
 
																+AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(GLOBAL_AM_CFLAGS)
															
 
																 LIBS = $(top_builddir)/socl/src/libsocl-@STARPU_EFFECTIVE_VERSION@.la $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la
															
 
																 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS)
															
--- a/socl/src/cl_enqueuendrangekernel.c
+++ b/socl/src/cl_enqueuendrangekernel.c
@@ -164,9 +164,6 @@ soclEnqueueNDRangeKernel(cl_command_queue cq,
 
																       cl_event beforeEvent, afterEvent, totalEvent;
															
 
																       totalEvent = event_create();
															
 
																-      totalEvent->prof_start = _socl_nanotime();
															
 
																-      totalEvent->prof_submit = totalEvent->prof_start;
															
 
																-      totalEvent->prof_queued = totalEvent->prof_start;
															
 
																       gc_entity_store(&totalEvent->cq, cq);
															
 
																       command_marker cmd = command_marker_create();
															
@@ -197,7 +194,7 @@ soclEnqueueNDRangeKernel(cl_command_queue cq,
 
																          /* Store perf */
															
 
																          cl_ulong start,end;
															
 
																          soclGetEventProfilingInfo(beforeEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &start, NULL);
															
 
																-         soclGetEventProfilingInfo(afterEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &end, NULL);
															
 
																+         soclGetEventProfilingInfo(afterEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
															
 
																          soclReleaseEvent(afterEvent);
															
 
																          kernel->split_perfs[iter] = end-start;
															
@@ -205,6 +202,12 @@ soclEnqueueNDRangeKernel(cl_command_queue cq,
 
																          pthread_mutex_unlock(&kernel->split_lock);
															
 
																          event_complete(totalEvent);
															
 
																+
															
 
																+         totalEvent->prof_start = start;
															
 
																+         totalEvent->prof_submit = start;
															
 
																+         totalEvent->prof_queued = start;
															
 
																+         totalEvent->prof_end = end;
															
 
																+
															
 
																          RETURN_EVENT(totalEvent,event);
															
 
																       } else {
															
 
																          soclReleaseEvent(totalEvent);
															
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -49,7 +49,7 @@ endif STARPU_HAVE_WINDOWS
 
																 lib_LTLIBRARIES = libstarpu-@STARPU_EFFECTIVE_VERSION@.la
															
 
																-libstarpu_@STARPU_EFFECTIVE_VERSION@_la_CPPFLAGS = -I$(top_srcdir)/include/ -DBUILDING_STARPU -Werror=implicit
															
 
																+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_CPPFLAGS = -I$(top_srcdir)/include/ -DBUILDING_STARPU
															
 
																 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_CFLAGS = $(GLOBAL_AM_CFLAGS) $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS)
															
 
																 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_LIBADD = -lm $(HWLOC_LIBS) $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LIBS) $(STARPU_GLPK_LDFLAGS)
															
--- a/src/core/jobs.c
+++ b/src/core/jobs.c
@@ -144,6 +144,15 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 
																 	_STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
															
 
																+#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
															
 
																+	int workerid = starpu_worker_get_id();
															
 
																+	int i;
															
 
																+	size_t data_size = 0;
															
 
																+	for(i = 0; i < STARPU_NMAXBUFS; i++)
															
 
																+		if(task->handles[i] != NULL)
															
 
																+			data_size += _starpu_data_get_size(task->handles[i]);
															
 
																+#endif //STARPU_USE_SCHED_CTX_HYPERVISOR
															
 
																+
															
 
																 	/* We release handle reference count */
															
 
																 	if (task->cl)
															
 
																 	{
															
@@ -210,8 +219,7 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 
																 	{
															
 
																 		_starpu_sched_post_exec_hook(task);
															
 
																 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
															
 
																-		int workerid = starpu_worker_get_id();
															
 
																-		starpu_call_poped_task_cb(workerid, task->sched_ctx, task->flops);
															
 
																+		starpu_sched_ctx_call_poped_task_cb(workerid, task->sched_ctx, task->flops, data_size);
															
 
																 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
															
 
																 	}
															
--- a/src/core/perfmodel/perfmodel_bus.c
+++ b/src/core/perfmodel/perfmodel_bus.c
@@ -1344,6 +1344,16 @@ static void write_bus_bandwidth_file_content(void)
 
																 }
															
 
																 #endif /* STARPU_SIMGRID */
															
 
																+double starpu_get_bandwidth_RAM_CUDA(unsigned cudadev)
															
 
																+{
															
 
																+	return bandwidth_matrix[0][cudadev+1];
															
 
																+}
															
 
																+
															
 
																+double starpu_get_latency_RAM_CUDA(unsigned cudadev)
															
 
																+{
															
 
																+	return latency_matrix[0][cudadev+1];
															
 
																+}
															
 
																+
															
 
																 void starpu_bus_print_bandwidth(FILE *f)
															
 
																 {
															
 
																 	unsigned src, dst, maxnode;
															
@@ -1397,14 +1407,14 @@ void starpu_bus_print_bandwidth(FILE *f)
 
																 	{
															
 
																 		struct dev_timing *timing;
															
 
																 		struct _starpu_machine_config *config = _starpu_get_machine_config();
															
 
																-		int ncpus = _starpu_topology_get_nhwcpu(config);
															
 
																-		int cpu;
															
 
																+		unsigned config_ncpus = _starpu_topology_get_nhwcpu(config);
															
 
																+		unsigned cpu;
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 		if (src <= ncuda)
															
 
																 		{
															
 
																 			fprintf(f, "CUDA %d\t", src-1);
															
 
																-			for (cpu = 0; cpu < ncpus; cpu++)
															
 
																+			for (cpu = 0; cpu < config_ncpus; cpu++)
															
 
																 			{
															
 
																 				timing = &cudadev_timing_per_cpu[src*STARPU_MAXCPUS+cpu];
															
 
																 				if (timing->timing_htod)
															
@@ -1420,7 +1430,7 @@ void starpu_bus_print_bandwidth(FILE *f)
 
																 #ifdef STARPU_USE_OPENCL
															
 
																 		{
															
 
																 			fprintf(f, "OpenCL%d\t", src-ncuda-1);
															
 
																-			for (cpu = 0; cpu < ncpus; cpu++)
															
 
																+			for (cpu = 0; cpu < config_ncpus; cpu++)
															
 
																 			{
															
 
																 				timing = &opencldev_timing_per_cpu[(src-ncuda)*STARPU_MAXCPUS+cpu];
															
 
																 				if (timing->timing_htod)
															
--- a/src/core/perfmodel/perfmodel_history.c
+++ b/src/core/perfmodel/perfmodel_history.c
@@ -180,7 +180,7 @@ static void scan_reg_model(FILE *f, struct starpu_perfmodel_regression_model *re
 
																 static void dump_history_entry(FILE *f, struct starpu_perfmodel_history_entry *entry)
															
 
																 {
															
 
																-	fprintf(f, "%08x\t%-15lu\t%-15le\t%-15le\t%-15le\t%-15le\t%u\n", entry->footprint, (unsigned long) entry->size, entry->mean, entry->deviation, entry->sum, entry->sum2, entry->nsample);
															
 
																+	fprintf(f, "%08x\t%-15lu\t%-15le\t%-15le\t%-15le\t%-15le\t%-15le\t%u\n", entry->footprint, (unsigned long) entry->size, entry->flops, entry->mean, entry->deviation, entry->sum, entry->sum2, entry->nsample);
															
 
																 }
															
 
																 static void scan_history_entry(FILE *f, struct starpu_perfmodel_history_entry *entry)
															
@@ -192,28 +192,36 @@ static void scan_history_entry(FILE *f, struct starpu_perfmodel_history_entry *e
 
																 	/* In case entry is NULL, we just drop these values */
															
 
																 	unsigned nsample;
															
 
																 	uint32_t footprint;
															
 
																-#ifdef STARPU_HAVE_WINDOWS
															
 
																-	unsigned size; /* in bytes */
															
 
																-#else
															
 
																-	size_t size; /* in bytes */
															
 
																-#endif
															
 
																+	unsigned long size; /* in bytes */
															
 
																+	double flops;
															
 
																 	double mean;
															
 
																 	double deviation;
															
 
																 	double sum;
															
 
																 	double sum2;
															
 
																+	char line[256];
															
 
																+	char *ret;
															
 
																+
															
 
																+	ret = fgets(line, sizeof(line), f);
															
 
																+	STARPU_ASSERT(ret);
															
 
																+	STARPU_ASSERT(strchr(line, '\n'));
															
 
																+
															
 
																 	/* Read the values from the file */
															
 
																-	res = fscanf(f, "%x\t%"
															
 
																-#ifndef STARPU_HAVE_WINDOWS
															
 
																-	"z"
															
 
																-#endif
															
 
																-	"u\t%le\t%le\t%le\t%le\t%u\n", &footprint, &size, &mean, &deviation, &sum, &sum2, &nsample);
															
 
																-	STARPU_ASSERT_MSG(res == 7, "Incorrect performance model file");
															
 
																+	res = sscanf(line, "%x\t%lu\t%le\t%le\t%le\t%le\t%le\t%u", &footprint, &size, &flops, &mean, &deviation, &sum, &sum2, &nsample);
															
 
																+
															
 
																+	if (res != 8)
															
 
																+	{
															
 
																+		flops = 0.;
															
 
																+		/* Read the values from the file */
															
 
																+		res = sscanf(line, "%x\t%lu\t%le\t%le\t%le\t%le\t%u", &footprint, &size, &mean, &deviation, &sum, &sum2, &nsample);
															
 
																+		STARPU_ASSERT_MSG(res == 7, "Incorrect performance model file");
															
 
																+	}
															
 
																 	if (entry)
															
 
																 	{
															
 
																 		entry->footprint = footprint;
															
 
																 		entry->size = size;
															
 
																+		entry->flops = flops;
															
 
																 		entry->mean = mean;
															
 
																 		entry->deviation = deviation;
															
 
																 		entry->sum = sum;
															
@@ -393,7 +401,7 @@ static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel *model, un
 
																 	/* Dump the history into the model file in case it is necessary */
															
 
																 	if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
															
 
																 	{
															
 
																-		fprintf(f, "# hash\t\tsize\t\tmean\t\tdev\t\tsum\t\tsum2\t\tn\n");
															
 
																+		fprintf(f, "# hash\t\tsize\t\tflops\t\tmean\t\tdev\t\tsum\t\tsum2\t\tn\n");
															
 
																 		ptr = per_arch_model->list;
															
 
																 		while (ptr)
															
 
																 		{
															
@@ -956,7 +964,7 @@ int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *mo
 
																 			char *symbol2 = strdup(symbol);
															
 
																 			symbol2[dot-symbol] = '\0';
															
 
																 			int ret;
															
 
																-			fprintf(stderr,"note: loading history from %s instead of %s\n", symbol2, symbol);
															
 
																+			_STARPU_DISP("note: loading history from %s instead of %s\n", symbol2, symbol);
															
 
																 			ret = starpu_perfmodel_load_symbol(symbol2,model);
															
 
																 			free(symbol2);
															
 
																 			return ret;
															
@@ -1152,6 +1160,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 
																 				entry->sum2 = measured*measured;
															
 
																 				entry->size = _starpu_job_get_data_size(model, arch, nimpl, j);
															
 
																+				entry->flops = j->task->flops;
															
 
																 				entry->footprint = key;
															
 
																 				entry->nsample = 1;
															
@@ -1168,6 +1177,14 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 
																 				unsigned n = entry->nsample;
															
 
																 				entry->mean = entry->sum / n;
															
 
																 				entry->deviation = sqrt((entry->sum2 - (entry->sum*entry->sum)/n)/n);
															
 
																+				if (j->task->flops != 0.)
															
 
																+				{
															
 
																+					if (entry->flops == 0.)
															
 
																+						entry->flops = j->task->flops;
															
 
																+					else if (entry->flops != j->task->flops)
															
 
																+						/* Incoherent flops! forget about trying to record flops */
															
 
																+						entry->flops = NAN;
															
 
																+				}
															
 
																 			}
															
 
																 			STARPU_ASSERT(entry);
															
--- a/src/core/perfmodel/perfmodel_print.c
+++ b/src/core/perfmodel/perfmodel_print.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2011  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2011, 2013  Université de Bordeaux 1
															
 
																  * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
															
 
																  * Copyright (C) 2011  Télécom-SudParis
															
 
																  *
															
@@ -38,8 +38,8 @@ void _starpu_perfmodel_print_history_based(struct starpu_perfmodel_per_arch *per
 
																 			if (!parameter)
															
 
																 			{
															
 
																 				/* There isn't a parameter that is explicitely requested, so we display all parameters */
															
 
																-				printf("%08x\t%-15lu\t%-15le\t%-15le\t%u\n", entry->footprint,
															
 
																-					(unsigned long) entry->size, entry->mean, entry->deviation, entry->nsample);
															
 
																+				printf("%08x\t%-15lu\t%-15le\t%-15le\t%-15le\t%u\n", entry->footprint,
															
 
																+					(unsigned long) entry->size, entry->flops, entry->mean, entry->deviation, entry->nsample);
															
 
																 			}
															
 
																 			else
															
 
																 			{
															
@@ -230,7 +230,7 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 
																 		nmatched = sscanf(arch, "cuda_%d", &gpuid);
															
 
																 		if (nmatched == 1)
															
 
																 		{
															
 
																-			unsigned archid = STARPU_CUDA_DEFAULT+ gpuid;
															
 
																+			int archid = STARPU_CUDA_DEFAULT+ gpuid;
															
 
																 			unsigned implid;
															
 
																 			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
															
 
																 				starpu_perfmodel_print(model, (enum starpu_perf_archtype) archid, implid, parameter, footprint, output);
															
--- a/src/core/sched_ctx.c
+++ b/src/core/sched_ctx.c
@@ -40,9 +40,9 @@ static void _starpu_worker_gets_into_ctx(unsigned sched_ctx_id, struct _starpu_w
 
																 	unsigned worker_sched_ctx_id = _starpu_worker_get_sched_ctx_id(worker, sched_ctx_id);
															
 
																 	/* the worker was planning to go away in another ctx but finally he changed his mind & 
															
 
																 	   he's staying */
															
 
																-	if(worker_sched_ctx_id  == STARPU_NMAX_SCHED_CTXS)
															
 
																+	if (worker_sched_ctx_id  == STARPU_NMAX_SCHED_CTXS)
															
 
																 	{
															
 
																-		unsigned worker_sched_ctx_id = _starpu_worker_get_first_free_sched_ctx(worker);
															
 
																+		worker_sched_ctx_id = _starpu_worker_get_first_free_sched_ctx(worker);
															
 
																 		struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
															
 
																 		/* add context to worker */
															
 
																 		worker->sched_ctx[worker_sched_ctx_id] = sched_ctx;
															
@@ -124,7 +124,7 @@ static void _starpu_update_workers_without_ctx(int *workerids, int nworkers, int
 
																 	return;
															
 
																 }
															
 
																-void starpu_stop_task_submission()
															
 
																+void starpu_sched_ctx_stop_task_submission()
															
 
																 {
															
 
																 	_starpu_exclude_task_from_dag(&stop_submission_task);
															
 
																 	_starpu_task_submit_internally(&stop_submission_task);
															
@@ -442,7 +442,7 @@ unsigned starpu_sched_ctx_create(const char *policy_name, int *workerids,
 
																 }
															
 
																 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
															
 
																-void starpu_set_perf_counters(unsigned sched_ctx_id, struct starpu_performance_counters *perf_counters)
															
 
																+void starpu_sched_ctx_set_perf_counters(unsigned sched_ctx_id, struct starpu_sched_ctx_performance_counters *perf_counters)
															
 
																 {
															
 
																 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
															
 
																 	sched_ctx->perf_counters = perf_counters;
															
@@ -716,6 +716,8 @@ void _starpu_decrement_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
 
																 {
															
 
																 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
															
 
																 	int finished = _starpu_barrier_counter_decrement_until_empty_counter(&sched_ctx->tasks_barrier);
															
 
																+/*when finished decrementing the tasks if the user signaled he will not submit tasks anymore
															
 
																+  we can move all its workers to the inheritor context */
															
 
																 	if(finished && sched_ctx->inheritor != STARPU_NMAX_SCHED_CTXS)
															
 
																 	{
															
 
																 		_STARPU_PTHREAD_MUTEX_LOCK(&finished_submit_mutex);
															
@@ -723,17 +725,20 @@ void _starpu_decrement_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
 
																 		{
															
 
																 			_STARPU_PTHREAD_MUTEX_UNLOCK(&finished_submit_mutex);
															
 
																-			_STARPU_PTHREAD_MUTEX_LOCK(&changing_ctx_mutex[sched_ctx->id]);
															
 
																-			int *workerids = NULL;
															
 
																-			unsigned nworkers = _get_workers_list(sched_ctx, &workerids);
															
 
																-
															
 
																-			if(nworkers > 0)
															
 
																+			/* take care the context is not deleted or changed at the same time */
															
 
																+			_STARPU_PTHREAD_MUTEX_LOCK(&changing_ctx_mutex[sched_ctx_id]);
															
 
																+			if(sched_ctx->id != STARPU_NMAX_SCHED_CTXS)
															
 
																 			{
															
 
																-				starpu_sched_ctx_add_workers(workerids, nworkers, sched_ctx->inheritor);
															
 
																-				free(workerids);
															
 
																+				int *workerids = NULL;
															
 
																+				unsigned nworkers = _get_workers_list(sched_ctx, &workerids);
															
 
																+				
															
 
																+				if(nworkers > 0)
															
 
																+				{
															
 
																+					starpu_sched_ctx_add_workers(workerids, nworkers, sched_ctx->inheritor);
															
 
																+					free(workerids);
															
 
																+				}
															
 
																 			}
															
 
																-
															
 
																-			_STARPU_PTHREAD_MUTEX_UNLOCK(&changing_ctx_mutex[sched_ctx->id]);
															
 
																+			_STARPU_PTHREAD_MUTEX_UNLOCK(&changing_ctx_mutex[sched_ctx_id]);
															
 
																 			return;
															
 
																 		}
															
@@ -748,12 +753,12 @@ void _starpu_increment_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
 
																 	_starpu_barrier_counter_increment(&sched_ctx->tasks_barrier);
															
 
																 }
															
 
																-void starpu_task_set_context(unsigned *sched_ctx)
															
 
																+void starpu_sched_ctx_set_context(unsigned *sched_ctx)
															
 
																 {
															
 
																 	pthread_setspecific(sched_ctx_key, (void*)sched_ctx);
															
 
																 }
															
 
																-unsigned starpu_task_get_context()
															
 
																+unsigned starpu_sched_ctx_get_context()
															
 
																 {
															
 
																 	unsigned *sched_ctx = (unsigned*)pthread_getspecific(sched_ctx_key);
															
 
																 	if(sched_ctx == NULL)
															
@@ -762,12 +767,12 @@ unsigned starpu_task_get_context()
 
																 	return *sched_ctx;
															
 
																 }
															
 
																-void starpu_notify_hypervisor_exists()
															
 
																+void starpu_sched_ctx_notify_hypervisor_exists()
															
 
																 {
															
 
																 	with_hypervisor = 1;
															
 
																 }
															
 
																-unsigned starpu_check_if_hypervisor_exists()
															
 
																+unsigned starpu_sched_ctx_check_if_hypervisor_exists()
															
 
																 {
															
 
																 	return with_hypervisor;
															
 
																 }
															
@@ -797,7 +802,7 @@ struct starpu_sched_ctx_worker_collection* starpu_sched_ctx_create_worker_collec
 
																 	switch(worker_collection_type)
															
 
																 	{
															
 
																-	case STARPU_WORKER_LIST:
															
 
																+	case STARPU_SCHED_CTX_WORKER_LIST:
															
 
																 		sched_ctx->workers->has_next = worker_list.has_next;
															
 
																 		sched_ctx->workers->get_next = worker_list.get_next;
															
 
																 		sched_ctx->workers->add = worker_list.add;
															
@@ -805,7 +810,7 @@ struct starpu_sched_ctx_worker_collection* starpu_sched_ctx_create_worker_collec
 
																 		sched_ctx->workers->init = worker_list.init;
															
 
																 		sched_ctx->workers->deinit = worker_list.deinit;
															
 
																 		sched_ctx->workers->init_iterator = worker_list.init_iterator;
															
 
																-		sched_ctx->workers->type = STARPU_WORKER_LIST;
															
 
																+		sched_ctx->workers->type = STARPU_SCHED_CTX_WORKER_LIST;
															
 
																 		break;
															
 
																 	}
															
@@ -818,7 +823,7 @@ static unsigned _get_workers_list(struct _starpu_sched_ctx *sched_ctx, int **wor
 
																 	*workerids = (int*)malloc(workers->nworkers*sizeof(int));
															
 
																 	int worker;
															
 
																 	unsigned nworkers = 0;
															
 
																-	struct starpu_iterator it;
															
 
																+	struct starpu_sched_ctx_iterator it;
															
 
																 	if(workers->init_iterator)
															
 
																 		workers->init_iterator(workers, &it);
															
@@ -851,7 +856,7 @@ int starpu_get_workers_of_sched_ctx(unsigned sched_ctx_id, int *pus, enum starpu
 
																 	int worker;
															
 
																 	int npus = 0;
															
 
																-	struct starpu_iterator it;
															
 
																+	struct starpu_sched_ctx_iterator it;
															
 
																 	if(workers->init_iterator)
															
 
																 		workers->init_iterator(workers, &it);
															
@@ -866,7 +871,7 @@ int starpu_get_workers_of_sched_ctx(unsigned sched_ctx_id, int *pus, enum starpu
 
																 	return npus;
															
 
																 }
															
 
																-_starpu_pthread_mutex_t* starpu_get_changing_ctx_mutex(unsigned sched_ctx_id)
															
 
																+_starpu_pthread_mutex_t* starpu_sched_ctx_get_changing_ctx_mutex(unsigned sched_ctx_id)
															
 
																 {
															
 
																 	return &changing_ctx_mutex[sched_ctx_id];
															
 
																 }
															
@@ -891,7 +896,7 @@ unsigned starpu_sched_ctx_get_nshared_workers(unsigned sched_ctx_id, unsigned sc
 
																         int worker, worker2;
															
 
																         int shared_workers = 0;
															
 
																-	struct starpu_iterator it1, it2;
															
 
																+	struct starpu_sched_ctx_iterator it1, it2;
															
 
																         if(workers->init_iterator)
															
 
																                 workers->init_iterator(workers, &it1);
															
@@ -926,7 +931,7 @@ unsigned starpu_sched_ctx_contains_worker(int workerid, unsigned sched_ctx_id)
 
																         struct starpu_sched_ctx_worker_collection *workers = sched_ctx->workers;
															
 
																         int worker;
															
 
																-	struct starpu_iterator it;
															
 
																+	struct starpu_sched_ctx_iterator it;
															
 
																         if(workers->init_iterator)
															
 
																                 workers->init_iterator(workers, &it);
															
@@ -963,7 +968,7 @@ unsigned starpu_sched_ctx_overlapping_ctxs_on_worker(int workerid)
 
																 	return worker->nsched_ctxs > 1;
															
 
																 }
															
 
																-unsigned starpu_is_ctxs_turn(int workerid, unsigned sched_ctx_id)
															
 
																+unsigned starpu_sched_ctx_is_ctxs_turn(int workerid, unsigned sched_ctx_id)
															
 
																 {
															
 
																 	if(max_time_worker_on_ctx == -1.0) return 1;
															
@@ -971,7 +976,7 @@ unsigned starpu_is_ctxs_turn(int workerid, unsigned sched_ctx_id)
 
																 	return worker->active_ctx == sched_ctx_id;
															
 
																 }
															
 
																-void starpu_set_turn_to_other_ctx(int workerid, unsigned sched_ctx_id)
															
 
																+void starpu_sched_ctx_set_turn_to_other_ctx(int workerid, unsigned sched_ctx_id)
															
 
																 {
															
 
																 	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
															
@@ -996,7 +1001,7 @@ void starpu_set_turn_to_other_ctx(int workerid, unsigned sched_ctx_id)
 
																 	}
															
 
																 }
															
 
																-double starpu_get_max_time_worker_on_ctx(void)
															
 
																+double starpu_sched_ctx_get_max_time_worker_on_ctx(void)
															
 
																 {
															
 
																 	return max_time_worker_on_ctx;
															
 
																 }
															
@@ -1020,15 +1025,15 @@ void starpu_sched_ctx_finished_submit(unsigned sched_ctx_id)
 
																 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
															
 
																-void starpu_call_poped_task_cb(int workerid, unsigned sched_ctx_id, double flops)
															
 
																+void starpu_sched_ctx_call_poped_task_cb(int workerid, unsigned sched_ctx_id, double flops, size_t data_size)
															
 
																 {
															
 
																 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
															
 
																 	if(sched_ctx != NULL && sched_ctx_id != 0 && sched_ctx_id != STARPU_NMAX_SCHED_CTXS
															
 
																 	   && sched_ctx->perf_counters != NULL)
															
 
																-		sched_ctx->perf_counters->notify_poped_task(sched_ctx_id, workerid, flops);
															
 
																+		sched_ctx->perf_counters->notify_poped_task(sched_ctx_id, workerid, flops, data_size);
															
 
																 }
															
 
																-void starpu_call_pushed_task_cb(int workerid, unsigned sched_ctx_id)
															
 
																+void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id)
															
 
																 {
															
 
																 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
															
--- a/src/core/sched_ctx.h
+++ b/src/core/sched_ctx.h
@@ -91,7 +91,7 @@ struct _starpu_sched_ctx
 
																 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
															
 
																 	/* a structure containing a series of performance counters determining the resize procedure */
															
 
																-	struct starpu_performance_counters *perf_counters;
															
 
																+	struct starpu_sched_ctx_performance_counters *perf_counters;
															
 
																 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
															
 
																 };
															
@@ -139,7 +139,7 @@ void _starpu_worker_gets_out_of_ctx(unsigned sched_ctx_id, struct _starpu_worker
 
																 unsigned _starpu_worker_belongs_to_a_sched_ctx(int workerid, unsigned sched_ctx_id);
															
 
																 #if defined(_MSC_VER) || defined(STARPU_SIMGRID)
															
 
																-_starpu_pthread_mutex_t* starpu_get_changing_ctx_mutex(unsigned sched_ctx_id);
															
 
																+_starpu_pthread_mutex_t* starpu_sched_ctx_get_changing_ctx_mutex(unsigned sched_ctx_id);
															
 
																 #endif
															
 
																 #endif // __SCHED_CONTEXT_H__
															
--- a/src/core/sched_policy.c
+++ b/src/core/sched_policy.c
@@ -225,7 +225,7 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 
																 	}
															
 
																 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
															
 
																-	starpu_call_pushed_task_cb(workerid, task->sched_ctx);
															
 
																+	starpu_sched_ctx_call_pushed_task_cb(workerid, task->sched_ctx);
															
 
																 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
															
 
																 	if (is_basic_worker)
															
@@ -233,7 +233,6 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 
																 		unsigned node = starpu_worker_get_memory_node(workerid);
															
 
																 		if (_starpu_task_uses_multiformat_handles(task))
															
 
																 		{
															
 
																-			unsigned i;
															
 
																 			for (i = 0; i < task->cl->nbuffers; i++)
															
 
																 			{
															
 
																 				struct starpu_task *conversion_task;
															
@@ -269,24 +268,24 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 
																 		int ret = 0;
															
 
																-		struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
															
 
																-		j->task_size = worker_size;
															
 
																-		j->combined_workerid = workerid;
															
 
																-		j->active_task_alias_count = 0;
															
 
																+		struct _starpu_job *job = _starpu_get_job_associated_to_task(task);
															
 
																+		job->task_size = worker_size;
															
 
																+		job->combined_workerid = workerid;
															
 
																+		job->active_task_alias_count = 0;
															
 
																-		_STARPU_PTHREAD_BARRIER_INIT(&j->before_work_barrier, NULL, worker_size);
															
 
																-		_STARPU_PTHREAD_BARRIER_INIT(&j->after_work_barrier, NULL, worker_size);
															
 
																+		_STARPU_PTHREAD_BARRIER_INIT(&job->before_work_barrier, NULL, worker_size);
															
 
																+		_STARPU_PTHREAD_BARRIER_INIT(&job->after_work_barrier, NULL, worker_size);
															
 
																 		/* Note: we have to call that early, or else the task may have
															
 
																 		 * disappeared already */
															
 
																 		_starpu_push_task_end(task);
															
 
																-		int i;
															
 
																-		for (i = 0; i < worker_size; i++)
															
 
																+		int j;
															
 
																+		for (j = 0; j < worker_size; j++)
															
 
																 		{
															
 
																 			struct starpu_task *alias = _starpu_create_task_alias(task);
															
 
																-			worker = _starpu_get_worker_struct(combined_workerid[i]);
															
 
																+			worker = _starpu_get_worker_struct(combined_workerid[j]);
															
 
																 			ret |= _starpu_push_local_task(worker, alias, 0);
															
 
																 		}
															
@@ -299,14 +298,14 @@ static int _starpu_nworkers_able_to_execute_task(struct starpu_task *task, struc
 
																 	int worker = -1, nworkers = 0;
															
 
																 	struct starpu_sched_ctx_worker_collection *workers = sched_ctx->workers;
															
 
																-	struct starpu_iterator it;
															
 
																+	struct starpu_sched_ctx_iterator it;
															
 
																 	if(workers->init_iterator)
															
 
																 		workers->init_iterator(workers, &it);
															
 
																 	while(workers->has_next(workers, &it))
															
 
																 	{
															
 
																 		worker = workers->get_next(workers, &it);
															
 
																-		if (starpu_worker_can_execute_task(worker, task, 0) && starpu_is_ctxs_turn(worker, sched_ctx->id))
															
 
																+		if (starpu_worker_can_execute_task(worker, task, 0) && starpu_sched_ctx_is_ctxs_turn(worker, sched_ctx->id))
															
 
																 			nworkers++;
															
 
																 	}
															
@@ -563,7 +562,7 @@ pick:
 
																 	{
															
 
																 		struct _starpu_sched_ctx *sched_ctx;
															
 
																-		unsigned lucky_ctx = STARPU_NMAX_SCHED_CTXS;
															
 
																+		//unsigned lucky_ctx = STARPU_NMAX_SCHED_CTXS;
															
 
																 		int been_here[STARPU_NMAX_SCHED_CTXS];
															
 
																 		int i;
															
@@ -582,7 +581,7 @@ pick:
 
																 				if (sched_ctx->sched_policy && sched_ctx->sched_policy->pop_task)
															
 
																 				{
															
 
																 					task = sched_ctx->sched_policy->pop_task(sched_ctx->id);
															
 
																-					lucky_ctx = sched_ctx->id;
															
 
																+					//lucky_ctx = sched_ctx->id;
															
 
																 				}
															
 
																 			}
															
@@ -605,7 +604,7 @@ pick:
 
																 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
															
 
																 	struct _starpu_sched_ctx *sched_ctx = NULL;
															
 
																-	struct starpu_performance_counters *perf_counters = NULL;
															
 
																+	struct starpu_sched_ctx_performance_counters *perf_counters = NULL;
															
 
																 	int j;
															
 
																 	for(j = 0; j < STARPU_NMAX_SCHED_CTXS; j++)
															
 
																 	{
															
--- a/src/core/task.c
+++ b/src/core/task.c
@@ -365,7 +365,7 @@ int starpu_task_submit(struct starpu_task *task)
 
																 	if (task->sched_ctx == 0 && nsched_ctxs != 1 && !j->exclude_from_dag)
															
 
																 	{
															
 
																-		set_sched_ctx = starpu_task_get_context();
															
 
																+		set_sched_ctx = starpu_sched_ctx_get_context();
															
 
																 		if (set_sched_ctx != STARPU_NMAX_SCHED_CTXS)
															
 
																 			task->sched_ctx = set_sched_ctx;
															
 
																 	}
															
@@ -633,7 +633,7 @@ void starpu_display_codelet_stats(struct starpu_codelet *cl)
 
																 int starpu_task_wait_for_all(void)
															
 
																 {
															
 
																 	unsigned nsched_ctxs = _starpu_get_nsched_ctxs();
															
 
																-	unsigned sched_ctx_id = nsched_ctxs == 1 ? 0 : starpu_task_get_context();
															
 
																+	unsigned sched_ctx_id = nsched_ctxs == 1 ? 0 : starpu_sched_ctx_get_context();
															
 
																 	/* if there is no indication about which context to wait,
															
 
																 	   we wait for all tasks submitted to starpu */
															
@@ -745,6 +745,11 @@ static void _starpu_increment_nsubmitted_tasks(void)
 
																 	_STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
															
 
																 }
															
 
																+int starpu_task_nsubmitted(void)
															
 
																+{
															
 
																+	return nsubmitted;
															
 
																+}
															
 
																+
															
 
																 void _starpu_increment_nready_tasks(void)
															
 
																 {
															
 
																 	_STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
															
@@ -765,6 +770,11 @@ void _starpu_decrement_nready_tasks(void)
 
																 }
															
 
																+int starpu_task_nready(void)
															
 
																+{
															
 
																+	return nready;
															
 
																+}
															
 
																+
															
 
																 void _starpu_initialize_current_task_key(void)
															
 
																 {
															
 
																 	_STARPU_PTHREAD_KEY_CREATE(&current_task_key, NULL);
															
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -84,15 +84,15 @@ static uint32_t _starpu_worker_exists_and_can_execute(struct starpu_task *task,
 
																 			switch (arch)
															
 
																 			{
															
 
																 			case STARPU_CPU_WORKER:
															
 
																-				if (task->cl->cpu_funcs[i] != NULL)
															
 
																+				if (task->cl->cpu_funcs[impl] != NULL)
															
 
																 					test_implementation = 1;
															
 
																 				break;
															
 
																 			case STARPU_CUDA_WORKER:
															
 
																-				if (task->cl->cuda_funcs[i] != NULL)
															
 
																+				if (task->cl->cuda_funcs[impl] != NULL)
															
 
																 					test_implementation = 1;
															
 
																 				break;
															
 
																 			case STARPU_OPENCL_WORKER:
															
 
																-				if (task->cl->opencl_funcs[i] != NULL)
															
 
																+				if (task->cl->opencl_funcs[impl] != NULL)
															
 
																 					test_implementation = 1;
															
 
																 				break;
															
 
																 			default:
															
@@ -340,14 +340,14 @@ void _starpu_worker_init(struct _starpu_worker *worker, unsigned fut_key)
 
																 }
															
 
																-static void _starpu_launch_drivers(struct _starpu_machine_config *config)
															
 
																+static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
															
 
																 {
															
 
																-	config->running = 1;
															
 
																-	config->submitting = 1;
															
 
																+	pconfig->running = 1;
															
 
																+	pconfig->submitting = 1;
															
 
																 	_STARPU_PTHREAD_KEY_CREATE(&worker_key, NULL);
															
 
																-	unsigned nworkers = config->topology.nworkers;
															
 
																+	unsigned nworkers = pconfig->topology.nworkers;
															
 
																 	/* Launch workers asynchronously */
															
 
																 	unsigned cpu = 0, cuda = 0;
															
@@ -368,9 +368,9 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
																 	for (worker = 0; worker < nworkers; worker++)
															
 
																 	{
															
 
																-		struct _starpu_worker *workerarg = &config->workers[worker];
															
 
																+		struct _starpu_worker *workerarg = &pconfig->workers[worker];
															
 
																-		workerarg->config = config;
															
 
																+		workerarg->config = pconfig;
															
 
																 		_starpu_barrier_counter_init(&workerarg->tasks_barrier, 0);
															
@@ -388,7 +388,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
																 		workerarg->run_by_starpu = 1;
															
 
																 		workerarg->worker_is_running = 0;
															
 
																 		workerarg->worker_is_initialized = 0;
															
 
																-		
															
 
																+
															
 
																 		int ctx;
															
 
																 		for(ctx = 0; ctx < STARPU_NMAX_SCHED_CTXS; ctx++)
															
 
																 			workerarg->removed_from_ctx[ctx] = 0;
															
@@ -419,7 +419,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
																 			case STARPU_CPU_WORKER:
															
 
																 				workerarg->set = NULL;
															
 
																 				driver.id.cpu_id = cpu;
															
 
																-				if (_starpu_may_launch_driver(config->conf, &driver))
															
 
																+				if (_starpu_may_launch_driver(pconfig->conf, &driver))
															
 
																 				{
															
 
																 					_STARPU_PTHREAD_CREATE_ON(
															
 
																 						workerarg->name,
															
@@ -446,7 +446,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
																 			case STARPU_CUDA_WORKER:
															
 
																 				workerarg->set = NULL;
															
 
																 				driver.id.cuda_id = cuda;
															
 
																-				if (_starpu_may_launch_driver(config->conf, &driver))
															
 
																+				if (_starpu_may_launch_driver(pconfig->conf, &driver))
															
 
																 				{
															
 
																 					_STARPU_PTHREAD_CREATE_ON(
															
 
																 						workerarg->name,
															
@@ -473,7 +473,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
																 			case STARPU_OPENCL_WORKER:
															
 
																 #ifndef STARPU_SIMGRID
															
 
																 				starpu_opencl_get_device(workerarg->devid, &driver.id.opencl_id);
															
 
																-				if (!_starpu_may_launch_driver(config->conf, &driver))
															
 
																+				if (!_starpu_may_launch_driver(pconfig->conf, &driver))
															
 
																 				{
															
 
																 					workerarg->run_by_starpu = 0;
															
 
																 					break;
															
@@ -504,7 +504,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
																 	cuda = 0;
															
 
																 	for (worker = 0; worker < nworkers; worker++)
															
 
																 	{
															
 
																-		struct _starpu_worker *workerarg = &config->workers[worker];
															
 
																+		struct _starpu_worker *workerarg = &pconfig->workers[worker];
															
 
																 		struct starpu_driver driver;
															
 
																 		driver.type = workerarg->arch;
															
@@ -512,7 +512,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
																 		{
															
 
																 			case STARPU_CPU_WORKER:
															
 
																 				driver.id.cpu_id = cpu;
															
 
																-				if (!_starpu_may_launch_driver(config->conf, &driver))
															
 
																+				if (!_starpu_may_launch_driver(pconfig->conf, &driver))
															
 
																 				{
															
 
																 					cpu++;
															
 
																 					break;
															
@@ -526,7 +526,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
																 				break;
															
 
																 			case STARPU_CUDA_WORKER:
															
 
																 				driver.id.cuda_id = cuda;
															
 
																-				if (!_starpu_may_launch_driver(config->conf, &driver))
															
 
																+				if (!_starpu_may_launch_driver(pconfig->conf, &driver))
															
 
																 				{
															
 
																 					cuda++;
															
 
																 					break;
															
@@ -542,7 +542,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
																 			case STARPU_OPENCL_WORKER:
															
 
																 #ifndef STARPU_SIMGRID
															
 
																 				starpu_opencl_get_device(workerarg->devid, &driver.id.opencl_id);
															
 
																-				if (!_starpu_may_launch_driver(config->conf, &driver))
															
 
																+				if (!_starpu_may_launch_driver(pconfig->conf, &driver))
															
 
																 					break;
															
 
																 #endif
															
 
																 				_STARPU_DEBUG("waiting for worker %u initialization\n", worker);
															
@@ -817,19 +817,19 @@ void starpu_profiling_init()
 
																  * Handle runtime termination
															
 
																  */
															
 
																-static void _starpu_terminate_workers(struct _starpu_machine_config *config)
															
 
																+static void _starpu_terminate_workers(struct _starpu_machine_config *pconfig)
															
 
																 {
															
 
																 	int status STARPU_ATTRIBUTE_UNUSED;
															
 
																 	unsigned workerid;
															
 
																-	for (workerid = 0; workerid < config->topology.nworkers; workerid++)
															
 
																+	for (workerid = 0; workerid < pconfig->topology.nworkers; workerid++)
															
 
																 	{
															
 
																 		starpu_wake_all_blocked_workers();
															
 
																 		_STARPU_DEBUG("wait for worker %u\n", workerid);
															
 
																-		struct _starpu_worker_set *set = config->workers[workerid].set;
															
 
																-		struct _starpu_worker *worker = &config->workers[workerid];
															
 
																+		struct _starpu_worker_set *set = pconfig->workers[workerid].set;
															
 
																+		struct _starpu_worker *worker = &pconfig->workers[workerid];
															
 
																 		/* in case StarPU termination code is called from a callback,
															
 
																  		 * we have to check if pthread_self() is the worker itself */
															
@@ -914,10 +914,10 @@ unsigned _starpu_worker_can_block(unsigned memnode STARPU_ATTRIBUTE_UNUSED)
 
																 #endif
															
 
																 }
															
 
																-static void _starpu_kill_all_workers(struct _starpu_machine_config *config)
															
 
																+static void _starpu_kill_all_workers(struct _starpu_machine_config *pconfig)
															
 
																 {
															
 
																 	/* set the flag which will tell workers to stop */
															
 
																-	config->running = 0;
															
 
																+	pconfig->running = 0;
															
 
																 	/* running is just protected by a memory barrier */
															
 
																 	STARPU_WMB();
															
 
																 	starpu_wake_all_blocked_workers();
															
@@ -1299,7 +1299,7 @@ int starpu_worker_get_nids_ctx_free_by_type(enum starpu_archtype type, int *work
 
																 				if(config.sched_ctxs[s].id != STARPU_NMAX_SCHED_CTXS)
															
 
																 				{
															
 
																 					struct starpu_sched_ctx_worker_collection *workers = config.sched_ctxs[s].workers;
															
 
																-					struct starpu_iterator it;
															
 
																+					struct starpu_sched_ctx_iterator it;
															
 
																 					if(workers->init_iterator)
															
 
																 						workers->init_iterator(workers, &it);
															
--- a/src/datawizard/coherency.c
+++ b/src/datawizard/coherency.c
@@ -458,6 +458,7 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 
																 			}
															
 
																 		}
															
 
																 		else
															
 
																+			/* The last request will perform the callback after termination */
															
 
																 			_starpu_data_request_append_callback(r, callback_func, callback_arg);
															
--- a/src/datawizard/copy_driver.c
+++ b/src/datawizard/copy_driver.c
@@ -134,8 +134,10 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 
																 	{
															
 
																 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CPU_RAM):
															
 
																 		/* STARPU_CPU_RAM -> STARPU_CPU_RAM */
															
 
																-		STARPU_ASSERT(copy_methods->ram_to_ram);
															
 
																-		copy_methods->ram_to_ram(src_interface, src_node, dst_interface, dst_node);
															
 
																+		if (copy_methods->ram_to_ram)
															
 
																+			copy_methods->ram_to_ram(src_interface, src_node, dst_interface, dst_node);
															
 
																+		else
															
 
																+			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
															
 
																 		break;
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CPU_RAM):
															
@@ -143,11 +145,15 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 
																 #if !defined(HAVE_CUDA_MEMCPY_PEER)
															
 
																 		STARPU_ASSERT(_starpu_memory_node_get_local_key() == src_node);
															
 
																 #endif
															
 
																-		STARPU_ASSERT(copy_methods->cuda_to_ram);
															
 
																-		if (!req || !copy_methods->cuda_to_ram_async)
															
 
																+		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
															
 
																+				!(copy_methods->cuda_to_ram_async || copy_methods->any_to_any))
															
 
																 		{
															
 
																 			/* this is not associated to a request so it's synchronous */
															
 
																-			copy_methods->cuda_to_ram(src_interface, src_node, dst_interface, dst_node);
															
 
																+			STARPU_ASSERT(copy_methods->cuda_to_ram || copy_methods->any_to_any);
															
 
																+			if (copy_methods->cuda_to_ram)
															
 
																+				copy_methods->cuda_to_ram(src_interface, src_node, dst_interface, dst_node);
															
 
																+			else
															
 
																+				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
															
 
																 		}
															
 
																 		else
															
 
																 		{
															
@@ -156,7 +162,13 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 
																 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
															
 
																 			stream = starpu_cuda_get_local_out_transfer_stream();
															
 
																-			ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
															
 
																+			if (copy_methods->cuda_to_ram_async)
															
 
																+				ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
															
 
																+			else
															
 
																+			{
															
 
																+				STARPU_ASSERT(copy_methods->any_to_any);
															
 
																+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
															
 
																+			}
															
 
																 			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
															
 
																 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
															
@@ -168,11 +180,15 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 
																 #if !defined(HAVE_CUDA_MEMCPY_PEER)
															
 
																 		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node);
															
 
																 #endif
															
 
																-		STARPU_ASSERT(copy_methods->ram_to_cuda);
															
 
																-		if (!req || !copy_methods->ram_to_cuda_async)
															
 
																+		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
															
 
																+				!(copy_methods->ram_to_cuda_async || copy_methods->any_to_any))
															
 
																 		{
															
 
																 			/* this is not associated to a request so it's synchronous */
															
 
																-			copy_methods->ram_to_cuda(src_interface, src_node, dst_interface, dst_node);
															
 
																+			STARPU_ASSERT(copy_methods->ram_to_cuda || copy_methods->any_to_any);
															
 
																+			if (copy_methods->ram_to_cuda)
															
 
																+				copy_methods->ram_to_cuda(src_interface, src_node, dst_interface, dst_node);
															
 
																+			else
															
 
																+				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
															
 
																 		}
															
 
																 		else
															
 
																 		{
															
@@ -182,7 +198,13 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 
																 				STARPU_CUDA_REPORT_ERROR(cures);
															
 
																 			stream = starpu_cuda_get_local_in_transfer_stream();
															
 
																-			ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
															
 
																+			if (copy_methods->ram_to_cuda_async)
															
 
																+				ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
															
 
																+			else
															
 
																+			{
															
 
																+				STARPU_ASSERT(copy_methods->any_to_any);
															
 
																+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
															
 
																+			}
															
 
																 			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
															
 
																 			if (STARPU_UNLIKELY(cures != cudaSuccess))
															
@@ -191,12 +213,15 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 
																 		break;
															
 
																 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):
															
 
																 		/* CUDA - CUDA transfer */
															
 
																-		STARPU_ASSERT(copy_methods->cuda_to_cuda || copy_methods->cuda_to_cuda_async);
															
 
																-		if (!req || !copy_methods->cuda_to_cuda_async)
															
 
																+		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
															
 
																+				!(copy_methods->cuda_to_cuda_async || copy_methods->any_to_any))
															
 
																 		{
															
 
																-			STARPU_ASSERT(copy_methods->cuda_to_cuda);
															
 
																+			STARPU_ASSERT(copy_methods->cuda_to_cuda || copy_methods->any_to_any);
															
 
																 			/* this is not associated to a request so it's synchronous */
															
 
																-			copy_methods->cuda_to_cuda(src_interface, src_node, dst_interface, dst_node);
															
 
																+			if (copy_methods->cuda_to_cuda)
															
 
																+				copy_methods->cuda_to_cuda(src_interface, src_node, dst_interface, dst_node);
															
 
																+			else
															
 
																+				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
															
 
																 		}
															
 
																 		else
															
 
																 		{
															
@@ -205,7 +230,13 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 
																 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
															
 
																 			stream = starpu_cuda_get_local_peer_transfer_stream();
															
 
																-			ret = copy_methods->cuda_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
															
 
																+			if (copy_methods->cuda_to_cuda_async)
															
 
																+				ret = copy_methods->cuda_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
															
 
																+			else
															
 
																+			{
															
 
																+				STARPU_ASSERT(copy_methods->any_to_any);
															
 
																+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
															
 
																+			}
															
 
																 			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
															
 
																 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
															
@@ -215,54 +246,77 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 
																 #ifdef STARPU_USE_OPENCL
															
 
																 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_CPU_RAM):
															
 
																 		/* OpenCL -> RAM */
															
 
																-		if (_starpu_memory_node_get_local_key() == src_node)
															
 
																+		STARPU_ASSERT(_starpu_memory_node_get_local_key() == src_node);
															
 
																+		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
															
 
																+				!(copy_methods->opencl_to_ram_async || copy_methods->any_to_any))
															
 
																 		{
															
 
																-			STARPU_ASSERT(copy_methods->opencl_to_ram);
															
 
																-			if (!req || !copy_methods->opencl_to_ram_async)
															
 
																-			{
															
 
																-				/* this is not associated to a request so it's synchronous */
															
 
																+			STARPU_ASSERT(copy_methods->opencl_to_ram || copy_methods->any_to_any);
															
 
																+			/* this is not associated to a request so it's synchronous */
															
 
																+			if (copy_methods->opencl_to_ram)
															
 
																 				copy_methods->opencl_to_ram(src_interface, src_node, dst_interface, dst_node);
															
 
																-			}
															
 
																 			else
															
 
																-			{
															
 
																-				req->async_channel.type = STARPU_OPENCL_RAM;
															
 
																-				ret = copy_methods->opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
															
 
																-			}
															
 
																+				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
															
 
																 		}
															
 
																 		else
															
 
																 		{
															
 
																-			/* we should not have a blocking call ! */
															
 
																-			STARPU_ABORT();
															
 
																+			req->async_channel.type = STARPU_OPENCL_RAM;
															
 
																+			if (copy_methods->opencl_to_ram_async)
															
 
																+				ret = copy_methods->opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
															
 
																+			else
															
 
																+			{
															
 
																+				STARPU_ASSERT(copy_methods->any_to_any);
															
 
																+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
															
 
																+			}
															
 
																 		}
															
 
																 		break;
															
 
																 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_OPENCL_RAM):
															
 
																 		/* STARPU_CPU_RAM -> STARPU_OPENCL_RAM */
															
 
																 		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node);
															
 
																-		STARPU_ASSERT(copy_methods->ram_to_opencl);
															
 
																-		if (!req || !copy_methods->ram_to_opencl_async)
															
 
																+		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
															
 
																+				!(copy_methods->ram_to_opencl_async || copy_methods->any_to_any))
															
 
																 		{
															
 
																+			STARPU_ASSERT(copy_methods->ram_to_opencl || copy_methods->any_to_any);
															
 
																 			/* this is not associated to a request so it's synchronous */
															
 
																-			copy_methods->ram_to_opencl(src_interface, src_node, dst_interface, dst_node);
															
 
																+			if (copy_methods->ram_to_opencl)
															
 
																+				copy_methods->ram_to_opencl(src_interface, src_node, dst_interface, dst_node);
															
 
																+			else
															
 
																+				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
															
 
																 		}
															
 
																 		else
															
 
																 		{
															
 
																 			req->async_channel.type = STARPU_OPENCL_RAM;
															
 
																-			ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
															
 
																+			if (copy_methods->ram_to_opencl_async)
															
 
																+				ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
															
 
																+			else
															
 
																+			{
															
 
																+				STARPU_ASSERT(copy_methods->any_to_any);
															
 
																+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
															
 
																+			}
															
 
																 		}
															
 
																 		break;
															
 
																 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_OPENCL_RAM):
															
 
																 		/* STARPU_OPENCL_RAM -> STARPU_OPENCL_RAM */
															
 
																 		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node || _starpu_memory_node_get_local_key() == src_node);
															
 
																-		STARPU_ASSERT(copy_methods->opencl_to_opencl);
															
 
																-		if (!req || !copy_methods->opencl_to_opencl_async)
															
 
																+		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
															
 
																+				!(copy_methods->opencl_to_opencl_async || copy_methods->any_to_any))
															
 
																 		{
															
 
																+			STARPU_ASSERT(copy_methods->opencl_to_opencl || copy_methods->any_to_any);
															
 
																 			/* this is not associated to a request so it's synchronous */
															
 
																-			copy_methods->opencl_to_opencl(src_interface, src_node, dst_interface, dst_node);
															
 
																+			if (copy_methods->opencl_to_opencl)
															
 
																+				copy_methods->opencl_to_opencl(src_interface, src_node, dst_interface, dst_node);
															
 
																+			else
															
 
																+				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
															
 
																 		}
															
 
																 		else
															
 
																 		{
															
 
																 			req->async_channel.type = STARPU_OPENCL_RAM;
															
 
																-			ret = copy_methods->opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
															
 
																+			if (copy_methods->opencl_to_opencl_async)
															
 
																+				ret = copy_methods->opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
															
 
																+			else
															
 
																+			{
															
 
																+				STARPU_ASSERT(copy_methods->any_to_any);
															
 
																+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
															
 
																+			}
															
 
																 		}
															
 
																 		break;
															
 
																 #endif
															
@@ -331,6 +385,64 @@ int __attribute__((warn_unused_result)) _starpu_driver_copy_data_1_to_1(starpu_d
 
																 	return 0;
															
 
																 }
															
 
																+/* This can be used by interfaces to easily transfer a piece of data without
															
 
																+ * caring about the particular CUDA/OpenCL methods.  */
															
 
																+
															
 
																+int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, void *async_data)
															
 
																+{
															
 
																+	struct _starpu_async_channel *async_channel = async_data;
															
 
																+	enum starpu_node_kind src_kind = starpu_node_get_kind(src_node);
															
 
																+	enum starpu_node_kind dst_kind = starpu_node_get_kind(dst_node);
															
 
																+
															
 
																+	switch (_STARPU_MEMORY_NODE_TUPLE(src_kind,dst_kind))
															
 
																+	{
															
 
																+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CPU_RAM):
															
 
																+		memcpy((void *) dst + dst_offset, (void *) src + src_offset, size);
															
 
																+		return 0;
															
 
																+
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CPU_RAM):
															
 
																+		return starpu_cuda_copy_async_sync(
															
 
																+				(void*) src + src_offset, src_node,
															
 
																+				(void*) dst + dst_offset, dst_node,
															
 
																+				size,
															
 
																+				async_channel?starpu_cuda_get_local_out_transfer_stream():NULL,
															
 
																+				cudaMemcpyDeviceToHost);
															
 
																+
															
 
																+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CUDA_RAM):
															
 
																+		return starpu_cuda_copy_async_sync(
															
 
																+				(void*) src + src_offset, src_node,
															
 
																+				(void*) dst + dst_offset, dst_node,
															
 
																+				size,
															
 
																+				async_channel?starpu_cuda_get_local_in_transfer_stream():NULL,
															
 
																+				cudaMemcpyHostToDevice);
															
 
																+
															
 
																+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):
															
 
																+		return starpu_cuda_copy_async_sync(
															
 
																+				(void*) src + src_offset, src_node,
															
 
																+				(void*) dst + dst_offset, dst_node,
															
 
																+				size,
															
 
																+				async_channel?starpu_cuda_get_local_peer_transfer_stream():NULL,
															
 
																+				cudaMemcpyDeviceToDevice);
															
 
																+
															
 
																+#endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_CPU_RAM):
															
 
																+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_OPENCL_RAM):
															
 
																+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_OPENCL_RAM):
															
 
																+		return starpu_opencl_copy_async_sync(
															
 
																+				src, src_offset, src_node,
															
 
																+				dst, dst_offset, dst_node,
															
 
																+				size,
															
 
																+				&async_channel->event.opencl_event);
															
 
																+#endif
															
 
																+	default:
															
 
																+		STARPU_ABORT();
															
 
																+		return -1;
															
 
																+	}
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																 void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_channel)
															
 
																 {
															
 
																 #ifdef STARPU_SIMGRID
															
--- a/src/datawizard/interfaces/bcsr_interface.c
+++ b/src/datawizard/interfaces/bcsr_interface.c
@@ -31,31 +31,11 @@
 
																  * BCSR : blocked CSR, we use blocks of size (r x c)
															
 
																  */
															
 
																-static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
															
 
																-static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
															
 
																-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
															
 
																-#endif
															
 
																-#ifdef STARPU_USE_OPENCL
															
 
																-static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
															
 
																-static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
															
 
																-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
															
 
																-#endif
															
 
																-
															
 
																-static struct starpu_data_copy_methods bcsr_copy_data_methods_s =
															
 
																+static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
															
 
																+
															
 
																+static const struct starpu_data_copy_methods bcsr_copy_data_methods_s =
															
 
																 {
															
 
																-	.ram_to_ram = copy_ram_to_ram,
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-	.ram_to_cuda = copy_ram_to_cuda,
															
 
																-	.cuda_to_ram = copy_cuda_to_ram,
															
 
																-	.cuda_to_cuda = copy_cuda_to_cuda,
															
 
																-#endif
															
 
																-#ifdef STARPU_USE_OPENCL
															
 
																-	.ram_to_opencl = copy_ram_to_opencl,
															
 
																-	.opencl_to_ram = copy_opencl_to_ram,
															
 
																-	.opencl_to_opencl = copy_opencl_to_opencl,
															
 
																-#endif
															
 
																+	.any_to_any = copy_any_to_any,
															
 
																 };
															
 
																 static void register_bcsr_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface);
															
@@ -315,105 +295,7 @@ static void free_bcsr_buffer_on_node(void *data_interface, unsigned node)
 
																 	starpu_free_buffer_on_node(node, (uintptr_t) bcsr_interface->rowptr, (nrow+1)*sizeof(uint32_t));
															
 
																 }
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
															
 
																-{
															
 
																-	struct starpu_bcsr_interface *src_bcsr = src_interface;
															
 
																-	struct starpu_bcsr_interface *dst_bcsr = dst_interface;
															
 
																-
															
 
																-	uint32_t nnz = src_bcsr->nnz;
															
 
																-	uint32_t nrow = src_bcsr->nrow;
															
 
																-	size_t elemsize = src_bcsr->elemsize;
															
 
																-
															
 
																-	uint32_t r = src_bcsr->r;
															
 
																-	uint32_t c = src_bcsr->c;
															
 
																-
															
 
																-	cudaError_t cures;
															
 
																-
															
 
																-	cures = cudaMemcpy((char *)dst_bcsr->nzval, (char *)src_bcsr->nzval, nnz*r*c*elemsize, kind);
															
 
																-	if (STARPU_UNLIKELY(cures))
															
 
																-		STARPU_CUDA_REPORT_ERROR(cures);
															
 
																-
															
 
																-	cures = cudaMemcpy((char *)dst_bcsr->colind, (char *)src_bcsr->colind, nnz*sizeof(uint32_t), kind);
															
 
																-	if (STARPU_UNLIKELY(cures))
															
 
																-		STARPU_CUDA_REPORT_ERROR(cures);
															
 
																-
															
 
																-	cures = cudaMemcpy((char *)dst_bcsr->rowptr, (char *)src_bcsr->rowptr, (nrow+1)*sizeof(uint32_t), kind);
															
 
																-	if (STARPU_UNLIKELY(cures))
															
 
																-		STARPU_CUDA_REPORT_ERROR(cures);
															
 
																-
															
 
																-	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
															
 
																-
															
 
																-	return 0;
															
 
																-}
															
 
																-
															
 
																-static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
															
 
																-{
															
 
																-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
															
 
																-}
															
 
																-
															
 
																-static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
															
 
																-{
															
 
																-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
															
 
																-}
															
 
																-
															
 
																-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
															
 
																-{
															
 
																-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
															
 
																-}
															
 
																-#endif // STARPU_USE_CUDA
															
 
																-
															
 
																-#ifdef STARPU_USE_OPENCL
															
 
																-static int copy_opencl_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
															
 
																-{
															
 
																-	struct starpu_bcsr_interface *src_bcsr = src_interface;
															
 
																-	struct starpu_bcsr_interface *dst_bcsr = dst_interface;
															
 
																-
															
 
																-	uint32_t nnz = src_bcsr->nnz;
															
 
																-	uint32_t nrow = src_bcsr->nrow;
															
 
																-	size_t elemsize = src_bcsr->elemsize;
															
 
																-
															
 
																-	uint32_t r = src_bcsr->r;
															
 
																-	uint32_t c = src_bcsr->c;
															
 
																-
															
 
																-        int err;
															
 
																-
															
 
																-	err = starpu_opencl_copy_async_sync(src_bcsr->nzval, src_node, 0, dst_bcsr->nzval, dst_node, 0, nnz*r*c*elemsize, NULL);
															
 
																-	if (STARPU_UNLIKELY(err))
															
 
																-		STARPU_OPENCL_REPORT_ERROR(err);
															
 
																-
															
 
																-	err = starpu_opencl_copy_async_sync((uintptr_t)src_bcsr->colind, src_node, 0, (uintptr_t)dst_bcsr->colind, dst_node, 0, nnz*sizeof(uint32_t), NULL);
															
 
																-	if (STARPU_UNLIKELY(err))
															
 
																-		STARPU_OPENCL_REPORT_ERROR(err);
															
 
																-
															
 
																-	err = starpu_opencl_copy_async_sync((uintptr_t)src_bcsr->rowptr, src_node, 0, (uintptr_t)dst_bcsr->rowptr, dst_node, 0, (nrow+1)*sizeof(uint32_t), NULL);
															
 
																-	if (STARPU_UNLIKELY(err))
															
 
																-		STARPU_OPENCL_REPORT_ERROR(err);
															
 
																-
															
 
																-	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
															
 
																-
															
 
																-	return 0;
															
 
																-}
															
 
																-
															
 
																-static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
															
 
																-{
															
 
																-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
															
 
																-}
															
 
																-
															
 
																-static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
															
 
																-{
															
 
																-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
															
 
																-}
															
 
																-
															
 
																-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
															
 
																-{
															
 
																-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
															
 
																-}
															
 
																-
															
 
																-#endif // STARPU_USE_OPENCL
															
 
																-
															
 
																-/* as not all platform easily have a BLAS lib installed ... */
															
 
																-static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
															
 
																+static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data)
															
 
																 {
															
 
																 	struct starpu_bcsr_interface *src_bcsr = (struct starpu_bcsr_interface *) src_interface;
															
 
																 	struct starpu_bcsr_interface *dst_bcsr = (struct starpu_bcsr_interface *) dst_interface;
															
@@ -425,13 +307,18 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBU
 
																 	uint32_t r = src_bcsr->r;
															
 
																 	uint32_t c = src_bcsr->c;
															
 
																-	memcpy((void *)dst_bcsr->nzval, (void *)src_bcsr->nzval, nnz*elemsize*r*c);
															
 
																+	int ret = 0;
															
 
																+
															
 
																+	if (starpu_interface_copy(src_bcsr->nzval, 0, src_node, dst_bcsr->nzval, 0, dst_node, nnz*elemsize*r*c, async_data))
															
 
																+		ret = -EAGAIN;
															
 
																-	memcpy((void *)dst_bcsr->colind, (void *)src_bcsr->colind, nnz*sizeof(uint32_t));
															
 
																+	if (starpu_interface_copy((uintptr_t)src_bcsr->colind, 0, src_node, (uintptr_t)dst_bcsr->colind, 0, dst_node, nnz*sizeof(uint32_t), async_data))
															
 
																+		ret = -EAGAIN;
															
 
																-	memcpy((void *)dst_bcsr->rowptr, (void *)src_bcsr->rowptr, (nrow+1)*sizeof(uint32_t));
															
 
																+	if (starpu_interface_copy((uintptr_t)src_bcsr->rowptr, 0, src_node, (uintptr_t)dst_bcsr->rowptr, 0, dst_node, (nrow+1)*sizeof(uint32_t), async_data))
															
 
																+		ret = -EAGAIN;
															
 
																 	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize*r*c + (nnz+nrow+1)*sizeof(uint32_t));
															
 
																-	return 0;
															
 
																+	return ret;
															
 
																 }
															
--- a/src/datawizard/interfaces/block_interface.c
+++ b/src/datawizard/interfaces/block_interface.c
@@ -44,7 +44,7 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARP
 
																 static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
															
 
																 #endif
															
 
																-static struct starpu_data_copy_methods block_copy_data_methods_s =
															
 
																+static const struct starpu_data_copy_methods block_copy_data_methods_s =
															
 
																 {
															
 
																 	.ram_to_ram = copy_ram_to_ram,
															
 
																 #ifdef STARPU_USE_CUDA
															
@@ -350,7 +350,7 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 
																 	}
															
 
																 	else
															
 
																 	{
															
 
																-		/* Default case: we transfer all lines one by one: ny*nz transfers */
															
 
																+		/* Default case: we transfer all blocks one by one: nz transfers */
															
 
																 		unsigned layer;
															
 
																 		for (layer = 0; layer < src_block->nz; layer++)
															
 
																 		{
															
@@ -420,7 +420,7 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 
																 	}
															
 
																 	else
															
 
																 	{
															
 
																-		/* Default case: we transfer all lines one by one: ny*nz transfers */
															
 
																+		/* Default case: we transfer all blocks one by one: nz 2D transfers */
															
 
																 		unsigned layer;
															
 
																 		for (layer = 0; layer < src_block->nz; layer++)
															
 
																 		{
															
@@ -514,8 +514,8 @@ static int copy_opencl_common(void *src_interface, unsigned src_node, void *dst_
 
																 		/* Is that a single contiguous buffer ? */
															
 
																 		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
															
 
																 		{
															
 
																-			ret = starpu_opencl_copy_async_sync(src_block->dev_handle, src_node, src_block->offset,
															
 
																-								dst_block->dev_handle, dst_node, dst_block->offset,
															
 
																+			ret = starpu_opencl_copy_async_sync(src_block->dev_handle, src_block->offset, src_node,
															
 
																+								dst_block->dev_handle, dst_block->offset, dst_node,
															
 
																 							       src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
															
 
																 							       event);
															
 
																                 }
															
@@ -535,10 +535,12 @@ static int copy_opencl_common(void *src_interface, unsigned src_node, void *dst_
 
																                         unsigned j;
															
 
																                         for(j=0 ; j<src_block->ny ; j++)
															
 
																 			{
															
 
																-				ret = starpu_opencl_copy_async_sync(src_block->dev_handle, src_node,
															
 
																+				ret = starpu_opencl_copy_async_sync(src_block->dev_handle,
															
 
																 								    src_block->offset + layer*src_block->ldz*src_block->elemsize + j*src_block->ldy*src_block->elemsize,
															
 
																-								    dst_block->dev_handle, dst_node,
															
 
																+								    src_node,
															
 
																+								    dst_block->dev_handle,
															
 
																 								    dst_block->offset + layer*dst_block->ldz*dst_block->elemsize + j*dst_block->ldy*dst_block->elemsize,
															
 
																+								    dst_node,
															
 
																 								       src_block->nx*src_block->elemsize,
															
 
																 								       event);
															
 
																                         }
															
--- a/src/datawizard/interfaces/coo_interface.c
+++ b/src/datawizard/interfaces/coo_interface.c
@@ -19,190 +19,36 @@
 
																 #include <datawizard/memalloc.h>
															
 
																 static int
															
 
																-copy_ram_to_ram(void *src_interface, STARPU_ATTRIBUTE_UNUSED unsigned src_node,
															
 
																-		void *dst_interface, STARPU_ATTRIBUTE_UNUSED unsigned dst_node)
															
 
																+copy_any_to_any(void *src_interface, unsigned src_node,
															
 
																+		void *dst_interface, unsigned dst_node, void *async_data)
															
 
																 {
															
 
																 	size_t size = 0;
															
 
																 	struct starpu_coo_interface *src_coo, *dst_coo;
															
 
																-
															
 
																-	src_coo = (struct starpu_coo_interface *) src_interface;
															
 
																-	dst_coo = (struct starpu_coo_interface *) dst_interface;
															
 
																-
															
 
																-	size = src_coo->n_values * sizeof(src_coo->columns[0]);
															
 
																-	memcpy((void *) dst_coo->columns, (void *) src_coo->columns, size);
															
 
																-
															
 
																-	/* sizeof(src_coo->columns[0]) == sizeof(src_coo->rows[0]) */
															
 
																-	memcpy((void *) dst_coo->rows, (void *) src_coo->rows, size);
															
 
																-
															
 
																-	size = src_coo->n_values * src_coo->elemsize;
															
 
																-	memcpy((void *) dst_coo->values, (void *) src_coo->values, size);
															
 
																-
															
 
																-	_STARPU_TRACE_DATA_COPY(src_node, dst_node,
															
 
																-		src_coo->n_values *
															
 
																-		(2 * sizeof(src_coo->rows[0]) + src_coo->elemsize));
															
 
																-
															
 
																-	return 0;
															
 
																-}
															
 
																-
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-static int
															
 
																-copy_cuda_async_sync(void *src_interface, unsigned src_node,
															
 
																-		     void *dst_interface, unsigned dst_node,
															
 
																-		     cudaStream_t stream, enum cudaMemcpyKind kind)
															
 
																-{
															
 
																-	int ret;
															
 
																-	size_t size = 0;
															
 
																-	struct starpu_coo_interface *src_coo, *dst_coo;
															
 
																-
															
 
																-	src_coo = (struct starpu_coo_interface *) src_interface;
															
 
																-	dst_coo = (struct starpu_coo_interface *) dst_interface;
															
 
																-
															
 
																-	size = src_coo->n_values * sizeof(src_coo->columns[0]);
															
 
																-	ret = starpu_cuda_copy_async_sync(
															
 
																-		(void *) src_coo->columns,
															
 
																-		src_node,
															
 
																-		(void *) dst_coo->columns,
															
 
																-		dst_node,
															
 
																-		size,
															
 
																-		stream,
															
 
																-		kind);
															
 
																-	if (ret == 0)
															
 
																-		stream = NULL;
															
 
																-
															
 
																-	/* sizeof(src_coo->columns[0]) == sizeof(src_coo->rows[0]) */
															
 
																-	ret = starpu_cuda_copy_async_sync(
															
 
																-		(void *) src_coo->rows,
															
 
																-		src_node,
															
 
																-		(void *) dst_coo->rows,
															
 
																-		dst_node,
															
 
																-		size,
															
 
																-		stream,
															
 
																-		kind);
															
 
																-	if (ret == 0)
															
 
																-		stream = NULL;
															
 
																-
															
 
																-	size = src_coo->n_values * src_coo->elemsize;
															
 
																-	ret = starpu_cuda_copy_async_sync(
															
 
																-		(void *) src_coo->values,
															
 
																-		src_node,
															
 
																-		(void *) dst_coo->values,
															
 
																-		dst_node,
															
 
																-		size,
															
 
																-		stream,
															
 
																-		kind);
															
 
																-
															
 
																-	_STARPU_TRACE_DATA_COPY(src_node, dst_node,
															
 
																-		src_coo->n_values *
															
 
																-		(2 * sizeof(src_coo->rows[0]) + src_coo->elemsize));
															
 
																-	return ret;
															
 
																-}
															
 
																-
															
 
																-static int
															
 
																-copy_ram_to_cuda(void *src_interface, unsigned src_node,
															
 
																-		 void *dst_interface, unsigned dst_node)
															
 
																-{
															
 
																-	return copy_cuda_async_sync(src_interface, src_node,
															
 
																-				    dst_interface, dst_node,
															
 
																-				    NULL, cudaMemcpyHostToDevice);
															
 
																-}
															
 
																-
															
 
																-static int
															
 
																-copy_cuda_to_ram(void *src_interface, unsigned src_node,
															
 
																-		 void *dst_interface, unsigned dst_node)
															
 
																-{
															
 
																-	return copy_cuda_async_sync(src_interface, src_node,
															
 
																-				    dst_interface, dst_node,
															
 
																-				    NULL, cudaMemcpyDeviceToHost);
															
 
																-}
															
 
																-
															
 
																-static int
															
 
																-copy_ram_to_cuda_async(void *src_interface, unsigned src_node,
															
 
																-		       void *dst_interface, unsigned dst_node,
															
 
																-		       cudaStream_t stream)
															
 
																-{
															
 
																-	return copy_cuda_async_sync(src_interface, src_node,
															
 
																-				    dst_interface, dst_node,
															
 
																-				    stream, cudaMemcpyHostToDevice);
															
 
																-}
															
 
																-
															
 
																-static int
															
 
																-copy_cuda_to_ram_async(void *src_interface, unsigned src_node,
															
 
																-		       void *dst_interface, unsigned dst_node,
															
 
																-		       cudaStream_t stream)
															
 
																-{
															
 
																-	return copy_cuda_async_sync(src_interface, src_node,
															
 
																-				    dst_interface, dst_node,
															
 
																-				    stream, cudaMemcpyDeviceToHost);
															
 
																-}
															
 
																-
															
 
																-static int
															
 
																-copy_cuda_to_cuda(void *src_interface, unsigned src_node,
															
 
																-		  void *dst_interface, unsigned dst_node)
															
 
																-{
															
 
																-	return copy_cuda_async_sync(src_interface, src_node,
															
 
																-				    dst_interface, dst_node,
															
 
																-				    NULL, cudaMemcpyDeviceToDevice);
															
 
																-}
															
 
																-
															
 
																-#ifdef NO_STRIDE
															
 
																-static int
															
 
																-copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,
															
 
																-			void *dst_interface, unsigned dst_node,
															
 
																-			cudaStream_t stream)
															
 
																-{
															
 
																-	return copy_cuda_async_sync(src_interface, src_node,
															
 
																-				    dst_interface, dst_node,
															
 
																-				    stream, cudaMemcpyDeviceToDevice);
															
 
																-}
															
 
																-#endif /* !NO_STRIDE */
															
 
																-#endif /* !STARPU_USE_CUDA */
															
 
																-
															
 
																-#ifdef STARPU_USE_OPENCL
															
 
																-static int
															
 
																-copy_opencl_common(void *src_interface, unsigned src_node,
															
 
																-			 void *dst_interface, unsigned dst_node,
															
 
																-			 cl_event *event)
															
 
																-{
															
 
																 	int ret = 0;
															
 
																-	size_t size = 0;
															
 
																-	struct starpu_coo_interface *src_coo, *dst_coo;
															
 
																 	src_coo = (struct starpu_coo_interface *) src_interface;
															
 
																 	dst_coo = (struct starpu_coo_interface *) dst_interface;
															
 
																-
															
 
																 	size = src_coo->n_values * sizeof(src_coo->columns[0]);
															
 
																-	ret = starpu_opencl_copy_async_sync(
															
 
																-		(uintptr_t) src_coo->columns,
															
 
																-		src_node,
															
 
																-		0,
															
 
																-		(uintptr_t) dst_coo->columns,
															
 
																-		dst_node,
															
 
																-		0,
															
 
																-		size,
															
 
																-		NULL);
															
 
																+	if (starpu_interface_copy(
															
 
																+		(uintptr_t) src_coo->columns, 0, src_node,
															
 
																+		(uintptr_t) dst_coo->columns, 0, dst_node,
															
 
																+		size, async_data))
															
 
																+		ret = -EAGAIN;
															
 
																 	/* sizeof(src_coo->columns[0]) == sizeof(src_coo->rows[0]) */
															
 
																-	ret = starpu_opencl_copy_async_sync(
															
 
																-		(uintptr_t) src_coo->rows,
															
 
																-		src_node,
															
 
																-		0,
															
 
																-		(uintptr_t) dst_coo->rows,
															
 
																-		dst_node,
															
 
																-		0,
															
 
																-		size,
															
 
																-		NULL);
															
 
																+	if (starpu_interface_copy(
															
 
																+		(uintptr_t) src_coo->rows, 0, src_node,
															
 
																+		(uintptr_t) dst_coo->rows, 0, dst_node,
															
 
																+		size, async_data))
															
 
																+		ret = -EAGAIN;
															
 
																 	size = src_coo->n_values * src_coo->elemsize;
															
 
																-	ret = starpu_opencl_copy_async_sync(
															
 
																-		src_coo->values,
															
 
																-		src_node,
															
 
																-		0,
															
 
																-		(uintptr_t) dst_coo->values,
															
 
																-		dst_node,
															
 
																-		0,
															
 
																-		size,
															
 
																-		event);
															
 
																+	if (starpu_interface_copy(
															
 
																+		src_coo->values, 0, src_node,
															
 
																+		dst_coo->values, 0, dst_node,
															
 
																+		size, async_data))
															
 
																+		ret = -EAGAIN;
															
 
																 	_STARPU_TRACE_DATA_COPY(src_node, dst_node,
															
 
																 		src_coo->n_values *
															
@@ -211,83 +57,9 @@ copy_opencl_common(void *src_interface, unsigned src_node,
 
																 	return ret;
															
 
																 }
															
 
																-static int
															
 
																-copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
															
 
																-			 void *dst_interface, unsigned dst_node,
															
 
																-			 cl_event *event)
															
 
																-{
															
 
																-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
															
 
																-}
															
 
																-
															
 
																-static int
															
 
																-copy_opencl_to_ram_async(void *src_interface, unsigned src_node,
															
 
																-			 void *dst_interface, unsigned dst_node,
															
 
																-			 cl_event *event)
															
 
																-{
															
 
																-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
															
 
																-}
															
 
																-
															
 
																-static int
															
 
																-copy_opencl_to_opencl_async(void *src_interface, unsigned src_node,
															
 
																-			 void *dst_interface, unsigned dst_node,
															
 
																-			 cl_event *event)
															
 
																-{
															
 
																-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
															
 
																-}
															
 
																-
															
 
																-static int
															
 
																-copy_ram_to_opencl(void *src_interface, unsigned src_node,
															
 
																-		   void *dst_interface, unsigned dst_node)
															
 
																-{
															
 
																-	return copy_ram_to_opencl_async(src_interface, src_node,
															
 
																-					dst_interface, dst_node,
															
 
																-					NULL);
															
 
																-}
															
 
																-static int
															
 
																-copy_opencl_to_ram(void *src_interface, unsigned src_node,
															
 
																-		   void *dst_interface, unsigned dst_node)
															
 
																-{
															
 
																-	return copy_opencl_to_ram_async(src_interface, src_node,
															
 
																-					dst_interface, dst_node,
															
 
																-					NULL);
															
 
																-}
															
 
																-static int
															
 
																-copy_opencl_to_opencl(void *src_interface, unsigned src_node,
															
 
																-		   void *dst_interface, unsigned dst_node)
															
 
																-{
															
 
																-	return copy_opencl_to_opencl_async(src_interface, src_node,
															
 
																-					dst_interface, dst_node,
															
 
																-					NULL);
															
 
																-}
															
 
																-#endif /* !STARPU_USE_OPENCL */
															
 
																-
															
 
																-static struct starpu_data_copy_methods coo_copy_data_methods =
															
 
																+static const struct starpu_data_copy_methods coo_copy_data_methods =
															
 
																 {
															
 
																-	.ram_to_ram          = copy_ram_to_ram,
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-	.ram_to_cuda         = copy_ram_to_cuda,
															
 
																-	.cuda_to_ram         = copy_cuda_to_ram,
															
 
																-	.ram_to_cuda_async   = copy_ram_to_cuda_async,
															
 
																-	.cuda_to_ram_async   = copy_cuda_to_ram_async,
															
 
																-	.cuda_to_cuda        = copy_cuda_to_cuda,
															
 
																-#ifdef NO_STRIDE
															
 
																-	.cuda_to_cuda_async  = copy_cuda_to_cuda_async,
															
 
																-#endif
															
 
																-#else
															
 
																-#ifdef STARPU_SIMGRID
															
 
																-#ifdef NO_STRIDE
															
 
																-	/* Enable GPU-GPU transfers in simgrid */
															
 
																-	.cuda_to_cuda_async = 1,
															
 
																-#endif
															
 
																-#endif
															
 
																-#endif /* !STARPU_USE_CUDA */
															
 
																-#ifdef STARPU_USE_OPENCL
															
 
																-	.ram_to_opencl       = copy_ram_to_opencl,
															
 
																-	.opencl_to_ram       = copy_opencl_to_ram,
															
 
																-	.opencl_to_opencl    = copy_opencl_to_opencl,
															
 
																-	.ram_to_opencl_async = copy_ram_to_opencl_async,
															
 
																-	.opencl_to_opencl_async = copy_opencl_to_opencl_async,
															
 
																-#endif /* !STARPU_USE_OPENCL */
															
 
																+	.any_to_any          = copy_any_to_any,
															
 
																 };
															
 
																 static void
															
--- a/src/datawizard/interfaces/csr_interface.c
+++ b/src/datawizard/interfaces/csr_interface.c
@@ -28,42 +28,11 @@
 
																 #include <starpu_opencl.h>
															
 
																 #include <drivers/opencl/driver_opencl.h>
															
 
																-static int copy_ram_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
															
 
																-static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
															
 
																-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
															
 
																-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
															
 
																-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
															
 
																-static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
															
 
																-#endif
															
 
																-#ifdef STARPU_USE_OPENCL
															
 
																-static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
															
 
																-static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
															
 
																-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
															
 
																-#endif
															
 
																-
															
 
																-static struct starpu_data_copy_methods csr_copy_data_methods_s =
															
 
																+static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
															
 
																+
															
 
																+static const struct starpu_data_copy_methods csr_copy_data_methods_s =
															
 
																 {
															
 
																-	.ram_to_ram = copy_ram_to_ram,
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-	.ram_to_cuda = copy_ram_to_cuda,
															
 
																-	.cuda_to_ram = copy_cuda_to_ram,
															
 
																-	.cuda_to_cuda = copy_cuda_to_cuda,
															
 
																-	.ram_to_cuda_async = copy_ram_to_cuda_async,
															
 
																-	.cuda_to_ram_async = copy_cuda_to_ram_async,
															
 
																-	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
															
 
																-#else
															
 
																-#ifdef STARPU_SIMGRID
															
 
																-	/* Enable GPU-GPU transfers in simgrid */
															
 
																-	.cuda_to_cuda_async = 1,
															
 
																-#endif
															
 
																-#endif
															
 
																-#ifdef STARPU_USE_OPENCL
															
 
																-	.ram_to_opencl = copy_ram_to_opencl,
															
 
																-	.opencl_to_ram = copy_opencl_to_ram,
															
 
																-	.opencl_to_opencl = copy_opencl_to_opencl,
															
 
																-#endif
															
 
																+	.any_to_any = copy_any_to_any,
															
 
																 };
															
 
																 static void register_csr_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface);
															
@@ -293,188 +262,8 @@ static void free_csr_buffer_on_node(void *data_interface, unsigned node)
 
																 	starpu_free_buffer_on_node(node, (uintptr_t) csr_interface->rowptr, (nrow+1)*sizeof(uint32_t));
															
 
																 }
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-static int copy_cuda_async_sync(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind, cudaStream_t stream)
															
 
																-{
															
 
																-	struct starpu_csr_interface *src_csr = src_interface;
															
 
																-	struct starpu_csr_interface *dst_csr = dst_interface;
															
 
																-
															
 
																-	uint32_t nnz = src_csr->nnz;
															
 
																-	uint32_t nrow = src_csr->nrow;
															
 
																-	size_t elemsize = src_csr->elemsize;
															
 
																-
															
 
																-	cudaStream_t sstream = stream;
															
 
																-	int ret;
															
 
																-
															
 
																-	ret = starpu_cuda_copy_async_sync((void *)src_csr->nzval, src_node, (void *)dst_csr->nzval, dst_node, nnz*elemsize, sstream, kind);
															
 
																-	if (ret == 0) sstream = NULL;
															
 
																-
															
 
																-	ret = starpu_cuda_copy_async_sync((void *)src_csr->colind, src_node, (void *)dst_csr->colind, dst_node, nnz*sizeof(uint32_t), sstream, kind);
															
 
																-	if (ret == 0) sstream = NULL;
															
 
																-
															
 
																-	ret = starpu_cuda_copy_async_sync((void *)src_csr->rowptr, src_node, (void *)dst_csr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), sstream, kind);
															
 
																-
															
 
																-	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
															
 
																-	return ret;
															
 
																-}
															
 
																-
															
 
																-static int copy_cuda_peer_async(void *src_interface STARPU_ATTRIBUTE_UNUSED, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
															
 
																-				void *dst_interface STARPU_ATTRIBUTE_UNUSED, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream STARPU_ATTRIBUTE_UNUSED)
															
 
																-{
															
 
																-#ifdef HAVE_CUDA_MEMCPY_PEER
															
 
																-	struct starpu_csr_interface *src_csr = src_interface;
															
 
																-	struct starpu_csr_interface *dst_csr = dst_interface;
															
 
																-
															
 
																-	uint32_t nnz = src_csr->nnz;
															
 
																-	uint32_t nrow = src_csr->nrow;
															
 
																-	size_t elemsize = src_csr->elemsize;
															
 
																-
															
 
																-	cudaError_t cures;
															
 
																-
															
 
																-	int src_dev = _starpu_memory_node_get_devid(src_node);
															
 
																-	int dst_dev = _starpu_memory_node_get_devid(dst_node);
															
 
																-
															
 
																-	int synchronous_fallback = 0;
															
 
																-
															
 
																-	_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
															
 
																-	cures = cudaMemcpyPeerAsync((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize, stream);
															
 
																-	if (cures)
															
 
																-	{
															
 
																-		synchronous_fallback = 1;
															
 
																-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
															
 
																-		cures = cudaMemcpyPeer((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize);
															
 
																-		if (STARPU_UNLIKELY(cures))
															
 
																-			STARPU_CUDA_REPORT_ERROR(cures);
															
 
																-	}
															
 
																-
															
 
																-	if (!synchronous_fallback)
															
 
																-	{
															
 
																-		cures = cudaMemcpyPeerAsync((char *)dst_csr->colind, dst_dev, (char *)src_csr->colind, src_dev, nnz*sizeof(uint32_t), stream);
															
 
																-	}
															
 
																-
															
 
																-	if (synchronous_fallback || cures != cudaSuccess)
															
 
																-	{
															
 
																-		synchronous_fallback = 1;
															
 
																-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
															
 
																-		cures = cudaMemcpyPeer((char *)dst_csr->colind, dst_dev, (char *)src_csr->colind, src_dev, nnz*sizeof(uint32_t));
															
 
																-		if (STARPU_UNLIKELY(cures))
															
 
																-			STARPU_CUDA_REPORT_ERROR(cures);
															
 
																-	}
															
 
																-
															
 
																-	if (!synchronous_fallback)
															
 
																-	{
															
 
																-		cures = cudaMemcpyPeerAsync((char *)dst_csr->rowptr, dst_dev, (char *)src_csr->rowptr, src_dev, (nrow+1)*sizeof(uint32_t), stream);
															
 
																-	}
															
 
																-
															
 
																-	if (synchronous_fallback || cures != cudaSuccess)
															
 
																-	{
															
 
																-		synchronous_fallback = 1;
															
 
																-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
															
 
																-		cures = cudaMemcpyPeer((char *)dst_csr->rowptr, dst_dev, (char *)src_csr->rowptr, src_dev, (nrow+1)*sizeof(uint32_t));
															
 
																-		if (STARPU_UNLIKELY(cures))
															
 
																-			STARPU_CUDA_REPORT_ERROR(cures);
															
 
																-	}
															
 
																-
															
 
																-	if (synchronous_fallback)
															
 
																-	{
															
 
																-		_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
															
 
																-		return 0;
															
 
																-	}
															
 
																-	else
															
 
																-	{
															
 
																-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
															
 
																-		return -EAGAIN;
															
 
																-	}
															
 
																-#else
															
 
																-	/* Illegal without Peer tranfers */
															
 
																-	STARPU_ABORT();
															
 
																-	return 0;
															
 
																-#endif
															
 
																-}
															
 
																-
															
 
																-static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
															
 
																-{
															
 
																-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, NULL);
															
 
																-}
															
 
																-
															
 
																-static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
															
 
																-{
															
 
																-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, NULL);
															
 
																-}
															
 
																-
															
 
																-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
															
 
																-{
															
 
																-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, NULL);
															
 
																-}
															
 
																-
															
 
																-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
															
 
																-{
															
 
																-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, stream);
															
 
																-}
															
 
																-
															
 
																-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
															
 
																-{
															
 
																-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, stream);
															
 
																-}
															
 
																-
															
 
																-static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
															
 
																-{
															
 
																-	if (src_node == dst_node)
															
 
																-		return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, stream);
															
 
																-	else
															
 
																-		return copy_cuda_peer_async(src_interface, src_node, dst_interface, dst_node, stream);
															
 
																-}
															
 
																-
															
 
																-#endif // STARPU_USE_CUDA
															
 
																-
															
 
																-#ifdef STARPU_USE_OPENCL
															
 
																-static int copy_opencl_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
															
 
																-{
															
 
																-	struct starpu_csr_interface *src_csr = src_interface;
															
 
																-	struct starpu_csr_interface *dst_csr = dst_interface;
															
 
																-
															
 
																-	uint32_t nnz = src_csr->nnz;
															
 
																-	uint32_t nrow = src_csr->nrow;
															
 
																-	size_t elemsize = src_csr->elemsize;
															
 
																-
															
 
																-        int err;
															
 
																-
															
 
																-	err = starpu_opencl_copy_async_sync(src_csr->nzval, src_node, 0, dst_csr->nzval, dst_node, 0, nnz*elemsize, NULL);
															
 
																-	if (STARPU_UNLIKELY(err))
															
 
																-                STARPU_OPENCL_REPORT_ERROR(err);
															
 
																-
															
 
																-	err = starpu_opencl_copy_async_sync((uintptr_t)src_csr->colind, src_node, 0, (uintptr_t)dst_csr->colind, dst_node, 0, nnz*sizeof(uint32_t), NULL);
															
 
																-        if (STARPU_UNLIKELY(err))
															
 
																-                STARPU_OPENCL_REPORT_ERROR(err);
															
 
																-
															
 
																-	err = starpu_opencl_copy_async_sync((uintptr_t)src_csr->rowptr, src_node, 0, (uintptr_t)dst_csr->rowptr, dst_node, 0, (nrow+1)*sizeof(uint32_t), NULL);
															
 
																-	if (STARPU_UNLIKELY(err))
															
 
																-                STARPU_OPENCL_REPORT_ERROR(err);
															
 
																-
															
 
																-	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
															
 
																-
															
 
																-	return 0;
															
 
																-}
															
 
																-
															
 
																-static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
															
 
																-{
															
 
																-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
															
 
																-}
															
 
																-
															
 
																-static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
															
 
																-{
															
 
																-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
															
 
																-}
															
 
																-
															
 
																-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
															
 
																-{
															
 
																-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
															
 
																-}
															
 
																-
															
 
																-#endif // STARPU_USE_OPENCL
															
 
																-
															
 
																 /* as not all platform easily have a BLAS lib installed ... */
															
 
																-static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
															
 
																+static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data)
															
 
																 {
															
 
																 	struct starpu_csr_interface *src_csr = (struct starpu_csr_interface *) src_interface;
															
 
																 	struct starpu_csr_interface *dst_csr = (struct starpu_csr_interface *) dst_interface;
															
@@ -482,14 +271,18 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBU
 
																 	uint32_t nnz = src_csr->nnz;
															
 
																 	uint32_t nrow = src_csr->nrow;
															
 
																 	size_t elemsize = src_csr->elemsize;
															
 
																+	int ret = 0;
															
 
																-	memcpy((void *)dst_csr->nzval, (void *)src_csr->nzval, nnz*elemsize);
															
 
																+	if (starpu_interface_copy(src_csr->nzval, 0, src_node, dst_csr->nzval, 0, dst_node, nnz*elemsize, async_data))
															
 
																+		ret = -EAGAIN;
															
 
																-	memcpy((void *)dst_csr->colind, (void *)src_csr->colind, nnz*sizeof(uint32_t));
															
 
																+	if (starpu_interface_copy((uintptr_t)src_csr->colind, 0, src_node, (uintptr_t)dst_csr->colind, 0, dst_node, nnz*sizeof(uint32_t), async_data))
															
 
																+		ret = -EAGAIN;
															
 
																-	memcpy((void *)dst_csr->rowptr, (void *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t));
															
 
																+	if (starpu_interface_copy((uintptr_t)src_csr->rowptr, 0, src_node, (uintptr_t)dst_csr->rowptr, 0, dst_node, (nrow+1)*sizeof(uint32_t), async_data))
															
 
																+		ret = -EAGAIN;
															
 
																 	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
															
 
																-	return 0;
															
 
																+	return ret;
															
 
																 }
															
--- a/src/datawizard/interfaces/data_interface.c
+++ b/src/datawizard/interfaces/data_interface.c
@@ -290,41 +290,6 @@ void starpu_data_register(starpu_data_handle_t *handleptr, unsigned home_node,
 
																 	STARPU_ASSERT(handleptr);
															
 
																 	*handleptr = handle;
															
 
																-	int asynchronous_copy_disabled = starpu_asynchronous_copy_disabled();
															
 
																-	if (STARPU_UNLIKELY(asynchronous_copy_disabled))
															
 
																-	{
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-		ops->copy_methods->ram_to_cuda_async = NULL;
															
 
																-		ops->copy_methods->cuda_to_ram_async = NULL;
															
 
																-		ops->copy_methods->cuda_to_cuda_async = NULL;
															
 
																-#endif
															
 
																-#ifdef STARPU_USE_OPENCL
															
 
																-		ops->copy_methods->ram_to_opencl_async = NULL;
															
 
																-		ops->copy_methods->opencl_to_ram_async = NULL;
															
 
																-		ops->copy_methods->opencl_to_opencl_async = NULL;
															
 
																-#endif
															
 
																-	}
															
 
																-
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-	int asynchronous_cuda_copy_disabled = starpu_asynchronous_cuda_copy_disabled();
															
 
																-	if (STARPU_UNLIKELY(asynchronous_cuda_copy_disabled))
															
 
																-	{
															
 
																-		ops->copy_methods->ram_to_cuda_async = NULL;
															
 
																-		ops->copy_methods->cuda_to_ram_async = NULL;
															
 
																-		ops->copy_methods->cuda_to_cuda_async = NULL;
															
 
																-	}
															
 
																-#endif
															
 
																-
															
 
																-#ifdef STARPU_USE_OPENCL
															
 
																-	int asynchronous_opencl_copy_disabled = starpu_asynchronous_opencl_copy_disabled();
															
 
																-	if (STARPU_UNLIKELY(asynchronous_opencl_copy_disabled))
															
 
																-	{
															
 
																-		ops->copy_methods->ram_to_opencl_async = NULL;
															
 
																-		ops->copy_methods->opencl_to_ram_async = NULL;
															
 
																-		ops->copy_methods->opencl_to_opencl_async = NULL;
															
 
																-	}
															
 
																-#endif
															
 
																-
															
 
																 	/* fill the interface fields with the appropriate method */
															
 
																 	STARPU_ASSERT(ops->register_data_handle);
															
 
																 	ops->register_data_handle(handle, home_node, data_interface);
															
@@ -618,7 +583,8 @@ void starpu_data_unregister_no_coherency(starpu_data_handle_t handle)
 
																 	_starpu_data_unregister(handle, 0);
															
 
																 }
															
 
																-void starpu_data_unregister_submit(starpu_data_handle_t handle) {
															
 
																+void starpu_data_unregister_submit(starpu_data_handle_t handle)
															
 
																+{
															
 
																 	_starpu_spin_lock(&handle->header_lock);
															
 
																 	STARPU_ASSERT_MSG(!handle->lazy_unregister, "data must not be unregistered twice");
															
 
																 	handle->lazy_unregister = 1;
															
--- a/src/datawizard/interfaces/matrix_interface.c
+++ b/src/datawizard/interfaces/matrix_interface.c
@@ -48,7 +48,7 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARP
 
																 static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
															
 
																 #endif
															
 
																-static struct starpu_data_copy_methods matrix_copy_data_methods_s =
															
 
																+static const struct starpu_data_copy_methods matrix_copy_data_methods_s =
															
 
																 {
															
 
																 	.ram_to_ram = copy_ram_to_ram,
															
 
																 #ifdef STARPU_USE_CUDA
															
@@ -516,8 +516,8 @@ static int copy_opencl_common(void *src_interface, unsigned src_node, void *dst_
 
																 	STARPU_ASSERT_MSG((src_matrix->ld == src_matrix->nx) && (dst_matrix->ld == dst_matrix->nx), "XXX non contiguous buffers are not properly supported in OpenCL yet. (TODO)");
															
 
																-	ret = starpu_opencl_copy_async_sync(src_matrix->dev_handle, src_node, src_matrix->offset,
															
 
																-					    dst_matrix->dev_handle, dst_node, dst_matrix->offset,
															
 
																+	ret = starpu_opencl_copy_async_sync(src_matrix->dev_handle, src_matrix->offset, src_node,
															
 
																+					    dst_matrix->dev_handle, dst_matrix->offset, dst_node,
															
 
																 					    src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
															
 
																 					    event);
															
--- a/src/datawizard/interfaces/multiformat_interface.c
+++ b/src/datawizard/interfaces/multiformat_interface.c
@@ -42,7 +42,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 
																 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cl_event *event);
															
 
																 #endif
															
 
																-static struct starpu_data_copy_methods multiformat_copy_data_methods_s =
															
 
																+static const struct starpu_data_copy_methods multiformat_copy_data_methods_s =
															
 
																 {
															
 
																 	.ram_to_ram = copy_ram_to_ram,
															
 
																 #ifdef STARPU_USE_CUDA
															
--- a/src/datawizard/interfaces/variable_interface.c
+++ b/src/datawizard/interfaces/variable_interface.c