5 년 전 · 1849722415
--- a/ChangeLog
+++ b/ChangeLog
@@ -44,11 +44,16 @@ New features:
 
				   * Add starpu_data_dup_ro().
			
 
				 
			
 
				 Small changes:
			
 
				-  * Use the S4U interface of Simgrid instead of xbt and MSG.
			
 
				   * Add a synthetic energy efficiency testcase.
			
 
				 
			
 
				-StarPU 1.3.4 (git revision xxx)
			
 
				-==============================================
			
 
				+StarPU 1.3.5 (git revision xxx)
			
 
				+====================================================================
			
 
				+
			
 
				+Small changes:
			
 
				+  * Move MPI cache functions into the public API
			
 
				+
			
 
				+StarPU 1.3.4 (git revision c37a5d024cd997596da41f765557c58099baf896)
			
 
				+====================================================================
			
 
				 
			
 
				 Small features:
			
 
				   * New environment variables STARPU_BUS_STATS_FILE and
			
@@ -69,12 +74,17 @@ Small features:
 
				   * Add field starpu_conf::precedence_over_environment_variables to ignore
			
 
				     environment variables when parameters are set directly in starpu_conf
			
 
				   * Add starpu_data_get_coordinates_array
			
 
				+  * MPI: new functions starpu_mpi_interface_datatype_register() and
			
 
				+    starpu_mpi_interface_datatype_unregister() which take an enum
			
 
				+    starpu_data_interface_id instead of a starpu_data_handle_t
			
 
				+  * New script starpu_env to set up StarPU environment variables
			
 
				   * New STARPU_BACKOFF_MIN and STARPU_BACKOFF_MAX environment variables to the
			
 
				     exponential backoff limits of the number of cycles to pause while drivers
			
 
				     are spinning.
			
 
				   * Add STARPU_DISPLAY_BINDINGS environment variable and
			
 
				     starpu_display_bindings() function to display all bindings on the machine by
			
 
				     calling hwloc-ps
			
 
				+
			
 
				 Small changes:
			
 
				   * New configure option --disable-build-doc-pdf
			
 
				 
			
@@ -116,7 +126,7 @@ Small features:
 
				     STARPU_TASK_PROFILING_INFO
			
 
				   * New function starpu_create_callback_task() which creates and
			
 
				     submits an empty task with the specified callback
			
 
				-
			
 
				+  * Use the S4U interface of Simgrid instead of xbt and MSG.
			
 
				 
			
 
				 Small changes:
			
 
				    * Default modular worker queues to 2 tasks unless it's an heft
			
--- a/Makefile.am
+++ b/Makefile.am
@@ -199,7 +199,7 @@ ctags-local:
 
				 # The pmccabe tool, see <http://www.parisc-linux.org/~bame/pmccabe/>.
			
 
				 PMCCABE = pmccabe
			
 
				 
			
 
				-VC_URL = "https://gforge.inria.fr/scm/viewvc.php/trunk/%FILENAME%?view=markup&root=starpu"
			
 
				+VC_URL = "https://gitlab.inria.fr/starpu/starpu/-/blob/master/%FILENAME%"
			
 
				 
			
 
				 # Generate a cyclomatic complexity report.  Note that examples and tests are
			
 
				 # excluded because they're not particularly relevant, and more importantly
			
--- a/README
+++ b/README
@@ -87,15 +87,11 @@ advantage of their specificities in a portable fashion.
 
				 || III. Getting StarPU ||
			
 
				 ++=====================++
			
 
				 
			
 
				-StarPU is available on https://gforge.inria.fr/projects/starpu/.
			
 
				+StarPU is available on https://gitlab.inria.fr/starpu/starpu
			
 
				 
			
 
				-The project's SVN repository can be checked out through anonymous
			
 
				-access with the following command(s).
			
 
				+The Git repository can be checked out with the following command.
			
 
				 
			
 
				-$ svn checkout svn://scm.gforge.inria.fr/svn/starpu/trunk
			
 
				-$ svn checkout --username anonsvn https://scm.gforge.inria.fr/svn/starpu/trunk
			
 
				-
			
 
				-The password is 'anonsvn'
			
 
				+$ git clone git@gitlab.inria.fr:starpu/starpu.git
			
 
				 
			
 
				 ++=============================++
			
 
				 || IV. Building and Installing ||
			
--- a/configure.ac
+++ b/configure.ac
@@ -18,7 +18,7 @@
 
				 #
			
 
				 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				 #
			
 
				-AC_INIT([StarPU], [1.3.99], [starpu-devel@lists.gforge.inria.fr], [starpu], [http://starpu.gforge.inria.fr/])
			
 
				+AC_INIT([StarPU], [1.3.99], [starpu-devel@lists.gforge.inria.fr], [starpu], [http://gitlab.inria.fr/starpu/starpu])
			
 
				 AC_CONFIG_SRCDIR(include/starpu.h)
			
 
				 AC_CONFIG_AUX_DIR([build-aux])
			
 
				 
			
--- a/contrib/ci.inria.fr/disabled/Jenkinsfile-basic
+++ b/contrib/ci.inria.fr/disabled/Jenkinsfile-basic
@@ -24,7 +24,7 @@ pipeline
 
				 	// Trigger the build
			
 
				 	triggers
			
 
				 	{
			
 
				-		// Poll gforge explicitly every hour
			
 
				+		// Poll SCM explicitly every hour
			
 
				 		pollSCM('0 * * * *')
			
 
				 	}
			
 
				 
			
--- a/contrib/ci.inria.fr/disabled/Jenkinsfile-bsd
+++ b/contrib/ci.inria.fr/disabled/Jenkinsfile-bsd
@@ -24,7 +24,7 @@ pipeline
 
				 	// Trigger the build
			
 
				 	triggers
			
 
				 	{
			
 
				-		// Poll gforge explicitly every past-half hour
			
 
				+		// Poll SCM explicitly every past-half hour
			
 
				 		pollSCM('30 * * * *')
			
 
				 	}
			
 
				 
			
--- a/doc/doxygen/Makefile.am
+++ b/doc/doxygen/Makefile.am
@@ -304,8 +304,8 @@ endif
 
				 EXTRA_DIST += doxygen.cfg refman.tex \
			
 
				 	      $(chapters) $(images)
			
 
				 
			
 
				-# Rule to update documentation on web server. Should only be used locally.
			
 
				-PUBLISHHOST	?= gforge
			
 
				+# Rule to update documentation on web server. Should only be called from benchmarks dalton directory
			
 
				+PUBLISHDIR	?= /home/benchmarks/softs/starpu/starpu-scripts/mirror/files/doc
			
 
				 update-web: $(DOX_PDF)
			
 
				-	scp -pr starpu.pdf html $(PUBLISHHOST):/home/groups/starpu/htdocs/files/doc
			
 
				+	cp -pr starpu.pdf html $(PUBLISHDIR)
			
 
				 
			
--- a/doc/doxygen/chapters/000_introduction.doxy
+++ b/doc/doxygen/chapters/000_introduction.doxy
@@ -77,9 +77,9 @@ policies in a portable fashion (\ref HowToDefineANewSchedulingPolicy).
 
				 The remainder of this section describes the main concepts used in StarPU.
			
 
				 
			
 
				 A video is available on the StarPU website
			
 
				-http://starpu.gforge.inria.fr/ that presents these concepts in 26 minutes.
			
 
				+https://starpu.gitlabpages.inria.fr/ that presents these concepts in 26 minutes.
			
 
				 
			
 
				-Some tutorials are also available on http://starpu.gforge.inria.fr/tutorials/
			
 
				+Some tutorials are also available on https://starpu.gitlabpages.inria.fr/tutorials/
			
 
				 
			
 
				 // explain the notion of codelet and task (i.e. g(A, B)
			
 
				 
			
@@ -190,7 +190,7 @@ unregister it.
 
				 \section ResearchPapers Research Papers
			
 
				 
			
 
				 Research papers about StarPU can be found at
			
 
				-http://starpu.gforge.inria.fr/publications/.
			
 
				+https://starpu.gitlabpages.inria.fr/publications/.
			
 
				 
			
 
				 A good overview is available in the research report at
			
 
				 http://hal.archives-ouvertes.fr/inria-00467677.
			
--- a/doc/doxygen/chapters/101_building.doxy
+++ b/doc/doxygen/chapters/101_building.doxy
@@ -61,27 +61,22 @@ script <c>configure</c>.
 
				 \subsection GettingSources Getting Sources
			
 
				 
			
 
				 StarPU's sources can be obtained from the download page of
			
 
				-the StarPU website (http://starpu.gforge.inria.fr/files/).
			
 
				+the StarPU website (https://starpu.gitlabpages.inria.fr/files/).
			
 
				 
			
 
				 All releases and the development tree of StarPU are freely available
			
 
				-on Inria's gforge under the LGPL license. Some releases are available
			
 
				+on StarPU SCM server under the LGPL license. Some releases are available
			
 
				 under the BSD license.
			
 
				 
			
 
				-The latest release can be downloaded from the Inria's gforge (http://gforge.inria.fr/frs/?group_id=1570) or
			
 
				-directly from the StarPU download page (http://starpu.gforge.inria.fr/files/).
			
 
				+The latest release can be downloaded from the StarPU download page (https://starpu.gitlabpages.inria.fr/files/).
			
 
				 
			
 
				-The latest nightly snapshot can be downloaded from the StarPU gforge website (http://starpu.gforge.inria.fr/testing/).
			
 
				-
			
 
				-\verbatim
			
 
				-$ wget http://starpu.gforge.inria.fr/testing/starpu-nightly-latest.tar.gz
			
 
				-\endverbatim
			
 
				+The latest nightly snapshot can be downloaded from the StarPU website (https://starpu.gitlabpages.inria.fr/files/testing/).
			
 
				 
			
 
				 And finally, current development version is also accessible via git.
			
 
				 It should only be used if you need the very latest changes (i.e. less
			
 
				 than a day old!).
			
 
				 
			
 
				 \verbatim
			
 
				-$ git clone https://scm.gforge.inria.fr/anonscm/git/starpu/starpu.git
			
 
				+$ git clone git@gitlab.inria.fr:starpu/starpu.git
			
 
				 \endverbatim
			
 
				 
			
 
				 \subsection ConfiguringStarPU Configuring StarPU
			
@@ -139,7 +134,7 @@ $ make
 
				 Once everything is built, you may want to test the result. An
			
 
				 extensive set of regression tests is provided with StarPU. Running the
			
 
				 tests is done by calling <c>make check</c>. These tests are run every night
			
 
				-and the result from the main profile is publicly available (http://starpu.gforge.inria.fr/testing/master/).
			
 
				+and the result from the main profile is publicly available (https://starpu.gitlabpages.inria.fr/files/testing/master/).
			
 
				 
			
 
				 \verbatim
			
 
				 $ make check
			
--- a/doc/doxygen/chapters/410_mpi_support.doxy
+++ b/doc/doxygen/chapters/410_mpi_support.doxy
@@ -759,8 +759,16 @@ add fine-graph starpu_mpi_cache_flush() calls during the algorithm; the effect
 
				 for the data deallocation will be the same, but it will additionally release some
			
 
				 pressure from the StarPU-MPI cache hash table during task submission.
			
 
				 
			
 
				-One can determine whether a piece of is cached with starpu_mpi_cached_receive()
			
 
				-and starpu_mpi_cached_send().
			
 
				+One can determine whether a piece of data is cached with
			
 
				+starpu_mpi_cached_receive() and starpu_mpi_cached_send().
			
 
				+
			
 
				+Functions starpu_mpi_cached_receive_set() and
			
 
				+starpu_mpi_cached_send_set() are automatically called by
			
 
				+starpu_mpi_task_insert() but can also be called directly by the
			
 
				+application. Functions starpu_mpi_cached_send_clear() and
			
 
				+starpu_mpi_cached_receive_clear() must be called to clear data from
			
 
				+the cache. They are also automatically called when using
			
 
				+starpu_mpi_task_insert().
			
 
				 
			
 
				 The whole caching behavior can be disabled thanks to the \ref STARPU_MPI_CACHE
			
 
				 environment variable. The variable \ref STARPU_MPI_CACHE_STATS can be set to <c>1</c>
			
--- a/doc/doxygen_dev/Makefile.am
+++ b/doc/doxygen_dev/Makefile.am
@@ -245,8 +245,8 @@ endif
 
				 EXTRA_DIST += doxygen.cfg refman.tex \
			
 
				 	      $(chapters) $(images)
			
 
				 
			
 
				-# Rule to update documentation on web server. Should only be used locally.
			
 
				-PUBLISHHOST	?= gforge
			
 
				+# Rule to update documentation on web server. Should only be called from benchmarks dalton directory
			
 
				+PUBLISHDIR	?= /home/benchmarks/softs/starpu/starpu-scripts/mirror/files/doc
			
 
				 update-web: $(DOX_PDF)
			
 
				-	scp -pr starpu_dev.pdf html_dev $(PUBLISHHOST):/home/groups/starpu/htdocs/files/doc
			
 
				+	cp -pr starpu_dev.pdf html_dev $(PUBLISHDIR)
			
 
				 
			
--- a/include/starpu_task.h
+++ b/include/starpu_task.h
@@ -151,8 +151,9 @@ enum starpu_codelet_type
 
				 
			
 
				 enum starpu_task_status
			
 
				 {
			
 
				-	STARPU_TASK_INVALID,     /**< The task has just been initialized. */
			
 
				-#define STARPU_TASK_INVALID 0
			
 
				+	STARPU_TASK_INIT,        /**< The task has just been initialized. */
			
 
				+#define STARPU_TASK_INIT 0
			
 
				+#define STARPU_TASK_INVALID STARPU_TASK_INIT  /**< old name for STARPU_TASK_INIT */
			
 
				 	STARPU_TASK_BLOCKED,     /**< The task has just been
			
 
				 				    submitted, and its dependencies has not been checked yet. */
			
 
				 	STARPU_TASK_READY,       /**< The task is ready for execution. */
			
@@ -1295,7 +1296,7 @@ struct starpu_task
 
				 	.detach = 1,					\
			
 
				 	.destroy = 0,					\
			
 
				 	.regenerate = 0,				\
			
 
				-	.status = STARPU_TASK_INVALID,			\
			
 
				+	.status = STARPU_TASK_INIT,			\
			
 
				 	.profiling_info = NULL,				\
			
 
				 	.predicted = NAN,				\
			
 
				 	.predicted_transfer = NAN,			\
			
--- a/mpi/examples/Makefile.am
+++ b/mpi/examples/Makefile.am
@@ -128,6 +128,17 @@ starpu_mpi_EXAMPLES	+=	\
 
				 endif
			
 
				 
			
 
				 ##################
			
 
				+# Cache examples #
			
 
				+##################
			
 
				+examplebin_PROGRAMS +=		\
			
 
				+	cache/cache		\
			
 
				+	cache/cache_disable
			
 
				+starpu_mpi_EXAMPLES +=		\
			
 
				+	cache/cache		\
			
 
				+	cache/cache_disable
			
 
				+
			
 
				+
			
 
				+##################
			
 
				 # MPI LU example #
			
 
				 ##################
			
 
				 
			
--- a/mpi/tests/cache.c
+++ b/mpi/tests/cache.c
@@ -17,7 +17,6 @@
 
				 #include <starpu_mpi.h>
			
 
				 #include <math.h>
			
 
				 #include "helper.h"
			
 
				-#include <starpu_mpi_cache.h>
			
 
				 
			
 
				 void func_cpu(void *descr[], void *_args)
			
 
				 {
			
@@ -57,7 +56,7 @@ void test(struct starpu_codelet *codelet, enum starpu_data_access_mode mode, sta
 
				 	ret = starpu_mpi_task_insert(MPI_COMM_WORLD, codelet, mode, data, STARPU_EXECUTE_ON_NODE, 1, 0);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_task_insert");
			
 
				 
			
 
				-	cache = _starpu_mpi_cache_received_data_get(data);
			
 
				+	cache = starpu_mpi_cached_receive(data);
			
 
				 
			
 
				 	if (rank == 1)
			
 
				 	{
			
--- a/mpi/tests/cache_disable.c
+++ b/mpi/tests/cache_disable.c
@@ -17,7 +17,6 @@
 
				 #include <starpu_mpi.h>
			
 
				 #include <math.h>
			
 
				 #include "helper.h"
			
 
				-#include <starpu_mpi_cache.h>
			
 
				 
			
 
				 void func_cpu(void *descr[], void *_args)
			
 
				 {
			
@@ -63,7 +62,7 @@ int main(int argc, char **argv)
 
				 	ret = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet_r, STARPU_R, data, STARPU_EXECUTE_ON_NODE, 1, 0);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_task_insert");
			
 
				 
			
 
				-	in_cache = _starpu_mpi_cache_received_data_get(data);
			
 
				+	in_cache = starpu_mpi_cached_receive(data);
			
 
				 	if (rank == 1)
			
 
				 	{
			
 
				 		STARPU_ASSERT_MSG(in_cache == 1, "Data should be in cache\n");
			
@@ -73,7 +72,7 @@ int main(int argc, char **argv)
 
				 	starpu_mpi_cache_set(0);
			
 
				 
			
 
				 	// We check the data is no longer in the cache
			
 
				-	in_cache = _starpu_mpi_cache_received_data_get(data);
			
 
				+	in_cache = starpu_mpi_cached_receive(data);
			
 
				 	if (rank == 1)
			
 
				 	{
			
 
				 		STARPU_ASSERT_MSG(in_cache == 0, "Data should NOT be in cache\n");
			
@@ -81,7 +80,7 @@ int main(int argc, char **argv)
 
				 
			
 
				 	ret = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet_r, STARPU_R, data, STARPU_EXECUTE_ON_NODE, 1, 0);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_task_insert");
			
 
				-	in_cache = _starpu_mpi_cache_received_data_get(data);
			
 
				+	in_cache = starpu_mpi_cached_receive(data);
			
 
				 	if (rank == 1)
			
 
				 	{
			
 
				 		STARPU_ASSERT_MSG(in_cache == 0, "Data should NOT be in cache\n");
			
--- a/mpi/include/starpu_mpi.h
+++ b/mpi/include/starpu_mpi.h
@@ -422,12 +422,39 @@ void starpu_mpi_cache_flush_all_data(MPI_Comm comm);
 
				 int starpu_mpi_cached_receive(starpu_data_handle_t data_handle);
			
 
				 
			
 
				 /**
			
 
				+ * If \p data is already available in the reception cache, return 1
			
 
				+ * If \p data is NOT available in the reception cache, add it to the
			
 
				+ * cache and return 0
			
 
				+ * Return 0 if the communication cache is not enabled
			
 
				+ */
			
 
				+int starpu_mpi_cached_receive_set(starpu_data_handle_t data);
			
 
				+
			
 
				+/**
			
 
				+ * Remove \p data from the reception cache
			
 
				+ */
			
 
				+void starpu_mpi_cached_receive_clear(starpu_data_handle_t data);
			
 
				+
			
 
				+/**
			
 
				    Test whether \p data_handle is cached for emission to node \p dest,
			
 
				    i.e. the value was previously sent to \p dest, and not flushed
			
 
				    since then.
			
 
				 */
			
 
				 int starpu_mpi_cached_send(starpu_data_handle_t data_handle, int dest);
			
 
				 
			
 
				+/**
			
 
				+ * If \p data is already available in the emission cache for node
			
 
				+ * \p dest, return 1
			
 
				+ * If \p data is NOT available in the emission cache for node \p dest,
			
 
				+ * add it to the cache and return 0
			
 
				+ * Return 0 if the communication cache is not enabled
			
 
				+ */
			
 
				+int starpu_mpi_cached_send_set(starpu_data_handle_t data, int dest);
			
 
				+
			
 
				+/**
			
 
				+ * Remove \p data from the emission cache
			
 
				+ */
			
 
				+void starpu_mpi_cached_send_clear(starpu_data_handle_t data);
			
 
				+
			
 
				 /** @} */
			
 
				 
			
 
				 /**
			
--- a/mpi/src/starpu_mpi.c
+++ b/mpi/src/starpu_mpi.c
@@ -346,7 +346,7 @@ void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t da
 
				 	if (me == node)
			
 
				 	{
			
 
				 		_STARPU_MPI_DEBUG(1, "Migrating data %p from %d to %d\n", data_handle, rank, node);
			
 
				-		int already_received = _starpu_mpi_cache_received_data_set(data_handle);
			
 
				+		int already_received = starpu_mpi_cached_receive_set(data_handle);
			
 
				 		if (already_received == 0)
			
 
				 		{
			
 
				 			_STARPU_MPI_DEBUG(1, "Receiving data %p from %d\n", data_handle, rank);
			
@@ -356,7 +356,7 @@ void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t da
 
				 	else if (me == rank)
			
 
				 	{
			
 
				 		_STARPU_MPI_DEBUG(1, "Migrating data %p from %d to %d\n", data_handle, rank, node);
			
 
				-		int already_sent = _starpu_mpi_cache_sent_data_set(data_handle, node);
			
 
				+		int already_sent = starpu_mpi_cached_send_set(data_handle, node);
			
 
				 		if (already_sent == 0)
			
 
				 		{
			
 
				 			_STARPU_MPI_DEBUG(1, "Sending data %p to %d\n", data_handle, node);
			
@@ -389,7 +389,7 @@ void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle
 
				 	{
			
 
				 		MPI_Status status;
			
 
				 		_STARPU_MPI_DEBUG(1, "Migrating data %p from %d to %d\n", data_handle, rank, node);
			
 
				-		int already_received = _starpu_mpi_cache_received_data_set(data_handle);
			
 
				+		int already_received = starpu_mpi_cached_receive_set(data_handle);
			
 
				 		if (already_received == 0)
			
 
				 		{
			
 
				 			_STARPU_MPI_DEBUG(1, "Receiving data %p from %d\n", data_handle, rank);
			
@@ -399,7 +399,7 @@ void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle
 
				 	else if (me == rank)
			
 
				 	{
			
 
				 		_STARPU_MPI_DEBUG(1, "Migrating data %p from %d to %d\n", data_handle, rank, node);
			
 
				-		int already_sent = _starpu_mpi_cache_sent_data_set(data_handle, node);
			
 
				+		int already_sent = starpu_mpi_cached_send_set(data_handle, node);
			
 
				 		if (already_sent == 0)
			
 
				 		{
			
 
				 			_STARPU_MPI_DEBUG(1, "Sending data %p to %d\n", data_handle, node);
			
--- a/mpi/src/starpu_mpi_cache.c
+++ b/mpi/src/starpu_mpi_cache.c
@@ -172,7 +172,7 @@ static void _starpu_mpi_cache_data_remove_nolock(starpu_data_handle_t data_handl
 
				 /**************************************
			
 
				  * Received cache
			
 
				  **************************************/
			
 
				-void _starpu_mpi_cache_received_data_clear(starpu_data_handle_t data_handle)
			
 
				+void starpu_mpi_cached_receive_clear(starpu_data_handle_t data_handle)
			
 
				 {
			
 
				 	int mpi_rank = starpu_mpi_data_get_rank(data_handle);
			
 
				 	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
			
@@ -198,7 +198,7 @@ void _starpu_mpi_cache_received_data_clear(starpu_data_handle_t data_handle)
 
				 	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
			
 
				 }
			
 
				 
			
 
				-int _starpu_mpi_cache_received_data_set(starpu_data_handle_t data_handle)
			
 
				+int starpu_mpi_cached_receive_set(starpu_data_handle_t data_handle)
			
 
				 {
			
 
				 	int mpi_rank = starpu_mpi_data_get_rank(data_handle);
			
 
				 	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
			
@@ -226,7 +226,7 @@ int _starpu_mpi_cache_received_data_set(starpu_data_handle_t data_handle)
 
				 	return already_received;
			
 
				 }
			
 
				 
			
 
				-int _starpu_mpi_cache_received_data_get(starpu_data_handle_t data_handle)
			
 
				+int starpu_mpi_cached_receive(starpu_data_handle_t data_handle)
			
 
				 {
			
 
				 	int already_received;
			
 
				 	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
			
@@ -241,15 +241,10 @@ int _starpu_mpi_cache_received_data_get(starpu_data_handle_t data_handle)
 
				 	return already_received;
			
 
				 }
			
 
				 
			
 
				-int starpu_mpi_cached_receive(starpu_data_handle_t data_handle)
			
 
				-{
			
 
				-	return _starpu_mpi_cache_received_data_get(data_handle);
			
 
				-}
			
 
				-
			
 
				 /**************************************
			
 
				  * Send cache
			
 
				  **************************************/
			
 
				-void _starpu_mpi_cache_sent_data_clear(starpu_data_handle_t data_handle)
			
 
				+void starpu_mpi_cached_send_clear(starpu_data_handle_t data_handle)
			
 
				 {
			
 
				 	int n, size;
			
 
				 	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
			
@@ -271,7 +266,7 @@ void _starpu_mpi_cache_sent_data_clear(starpu_data_handle_t data_handle)
 
				 	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
			
 
				 }
			
 
				 
			
 
				-int _starpu_mpi_cache_sent_data_set(starpu_data_handle_t data_handle, int dest)
			
 
				+int starpu_mpi_cached_send_set(starpu_data_handle_t data_handle, int dest)
			
 
				 {
			
 
				 	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
			
 
				 
			
@@ -296,7 +291,7 @@ int _starpu_mpi_cache_sent_data_set(starpu_data_handle_t data_handle, int dest)
 
				 	return already_sent;
			
 
				 }
			
 
				 
			
 
				-int _starpu_mpi_cache_sent_data_get(starpu_data_handle_t data_handle, int dest)
			
 
				+int starpu_mpi_cached_send(starpu_data_handle_t data_handle, int dest)
			
 
				 {
			
 
				 	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
			
 
				 	int already_sent;
			
@@ -311,11 +306,6 @@ int _starpu_mpi_cache_sent_data_get(starpu_data_handle_t data_handle, int dest)
 
				 	return already_sent;
			
 
				 }
			
 
				 
			
 
				-int starpu_mpi_cached_send(starpu_data_handle_t data_handle, int dest)
			
 
				-{
			
 
				-	return _starpu_mpi_cache_sent_data_get(data_handle, dest);
			
 
				-}
			
 
				-
			
 
				 static void _starpu_mpi_cache_flush_nolock(starpu_data_handle_t data_handle)
			
 
				 {
			
 
				 	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
			
--- a/mpi/src/starpu_mpi_cache.h
+++ b/mpi/src/starpu_mpi_cache.h
@@ -32,22 +32,6 @@ void _starpu_mpi_cache_shutdown();
 
				 void _starpu_mpi_cache_data_init(starpu_data_handle_t data_handle);
			
 
				 void _starpu_mpi_cache_data_clear(starpu_data_handle_t data_handle);
			
 
				 
			
 
				-/*
			
 
				- * If the data is already available in the cache, return a pointer to the data
			
 
				- * If the data is NOT available in the cache, add it to the cache and return NULL
			
 
				- */
			
 
				-int _starpu_mpi_cache_received_data_set(starpu_data_handle_t data);
			
 
				-int _starpu_mpi_cache_received_data_get(starpu_data_handle_t data);
			
 
				-void _starpu_mpi_cache_received_data_clear(starpu_data_handle_t data);
			
 
				-
			
 
				-/*
			
 
				- * If the data is already available in the cache, return a pointer to the data
			
 
				- * If the data is NOT available in the cache, add it to the cache and return NULL
			
 
				- */
			
 
				-int _starpu_mpi_cache_sent_data_set(starpu_data_handle_t data, int dest);
			
 
				-int _starpu_mpi_cache_sent_data_get(starpu_data_handle_t data, int dest);
			
 
				-void _starpu_mpi_cache_sent_data_clear(starpu_data_handle_t data);
			
 
				-
			
 
				 void _starpu_mpi_cache_flush(starpu_data_handle_t data_handle);
			
 
				 
			
 
				 #ifdef __cplusplus
			
--- a/mpi/src/starpu_mpi_task_insert.c
+++ b/mpi/src/starpu_mpi_task_insert.c
@@ -112,7 +112,7 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
 
				 		if (do_execute && mpi_rank != STARPU_MPI_PER_NODE && mpi_rank != me)
			
 
				 		{
			
 
				 			/* The node is going to execute the codelet, but it does not own the data, it needs to receive the data from the owner node */
			
 
				-			int already_received = _starpu_mpi_cache_received_data_set(data);
			
 
				+			int already_received = starpu_mpi_cached_receive_set(data);
			
 
				 			if (already_received == 0)
			
 
				 			{
			
 
				 				if (data_tag == -1)
			
@@ -126,7 +126,7 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
 
				 		if (!do_execute && mpi_rank == me)
			
 
				 		{
			
 
				 			/* The node owns the data, but another node is going to execute the codelet, the node needs to send the data to the executee node. */
			
 
				-			int already_sent = _starpu_mpi_cache_sent_data_set(data, xrank);
			
 
				+			int already_sent = starpu_mpi_cached_send_set(data, xrank);
			
 
				 			if (already_sent == 0)
			
 
				 			{
			
 
				 				if (data_tag == -1)
			
@@ -182,8 +182,8 @@ void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum star
 
				 		if (mode & STARPU_W || mode & STARPU_REDUX)
			
 
				 		{
			
 
				 			/* The data has been modified, it MUST be removed from the cache */
			
 
				-			_starpu_mpi_cache_sent_data_clear(data);
			
 
				-			_starpu_mpi_cache_received_data_clear(data);
			
 
				+			starpu_mpi_cached_send_clear(data);
			
 
				+			starpu_mpi_cached_receive_clear(data);
			
 
				 		}
			
 
				 	}
			
 
				 	else
			
--- a/mpi/tests/Makefile.am
+++ b/mpi/tests/Makefile.am
@@ -96,8 +96,6 @@ starpu_mpi_TESTS =
 
				 
			
 
				 starpu_mpi_TESTS +=				\
			
 
				 	broadcast				\
			
 
				-	cache					\
			
 
				-	cache_disable				\
			
 
				 	callback				\
			
 
				 	driver					\
			
 
				 	early_request				\
			
@@ -192,8 +190,6 @@ noinst_PROGRAMS +=				\
 
				 	block_interface_pinned			\
			
 
				 	attr					\
			
 
				 	broadcast				\
			
 
				-	cache					\
			
 
				-	cache_disable				\
			
 
				 	callback				\
			
 
				 	matrix					\
			
 
				 	matrix2					\
			
--- a/src/core/dependencies/tags.c
+++ b/src/core/dependencies/tags.c
@@ -63,10 +63,21 @@ static struct _starpu_cg *create_cg_tag(unsigned ntags, struct _starpu_tag *tag)
 
				 
			
 
				 	cg->ntags = ntags;
			
 
				 	cg->remaining = ntags;
			
 
				+#ifdef STARPU_DEBUG
			
 
				+	cg->ndeps = ntags;
			
 
				+	cg->deps = NULL;
			
 
				+	cg->done = NULL;
			
 
				+#endif
			
 
				 	cg->cg_type = STARPU_CG_TAG;
			
 
				 
			
 
				 	cg->succ.tag = tag;
			
 
				 	tag->tag_successors.ndeps++;
			
 
				+#ifdef STARPU_DEBUG
			
 
				+	_STARPU_REALLOC(tag->tag_successors.deps, tag->tag_successors.ndeps * sizeof(tag->tag_successors.deps[0]));
			
 
				+	_STARPU_REALLOC(tag->tag_successors.done, tag->tag_successors.ndeps * sizeof(tag->tag_successors.done[0]));
			
 
				+	tag->tag_successors.deps[tag->tag_successors.ndeps-1] = cg;
			
 
				+	tag->tag_successors.done[tag->tag_successors.ndeps-1] = 0;
			
 
				+#endif
			
 
				 
			
 
				 	return cg;
			
 
				 }
			
@@ -364,10 +375,20 @@ void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t
 
				 	struct _starpu_cg *cg = create_cg_tag(ndeps, tag_child);
			
 
				 	_starpu_spin_unlock(&tag_child->lock);
			
 
				 
			
 
				+#ifdef STARPU_DEBUG
			
 
				+	_STARPU_MALLOC(cg->deps, ndeps * sizeof(cg->deps[0]));
			
 
				+	_STARPU_MALLOC(cg->done, ndeps * sizeof(cg->done[0]));
			
 
				+#endif
			
 
				+
			
 
				 	for (i = 0; i < ndeps; i++)
			
 
				 	{
			
 
				 		starpu_tag_t dep_id = array[i];
			
 
				 
			
 
				+#ifdef STARPU_DEBUG
			
 
				+		cg->deps[i] = (void*) (uintptr_t) dep_id;
			
 
				+		cg->done[i] = 0;
			
 
				+#endif
			
 
				+
			
 
				 		/* id depends on dep_id
			
 
				 		 * so cg should be among dep_id's successors*/
			
 
				 		_STARPU_TRACE_TAG_DEPS(id, dep_id);
			
--- a/src/core/simgrid.c
+++ b/src/core/simgrid.c
@@ -723,7 +723,7 @@ void _starpu_simgrid_submit_job(int workerid, int sched_ctx_id, struct _starpu_j
 
				 	{
			
 
				 		length = starpu_task_worker_expected_length(starpu_task, workerid, sched_ctx_id, j->nimpl);
			
 
				 		STARPU_ASSERT_MSG(!_STARPU_IS_ZERO(length) && !isnan(length),
			
 
				-				  "Codelet %s does not have a perfmodel (in directory %s), or is not calibrated enough, please re-run in non-simgrid mode until it is calibrated",
			
 
				+				  "Codelet %s does not have a perfmodel (in directory %s), or is not calibrated enough, please re-run in non-simgrid mode until it is calibrated, or fix the STARPU_HOSTNAME and STARPU_PERF_MODEL_DIR environment variables",
			
 
				 				  _starpu_job_get_model_name(j), _starpu_get_perf_model_dir_codelet());
			
 
				                 /* TODO: option to add variance according to performance model,
			
 
				                  * to be able to easily check scheduling robustness */
			
--- a/src/core/task.c
+++ b/src/core/task.c
@@ -288,8 +288,8 @@ void starpu_task_init(struct starpu_task *task)
 
				 
			
 
				 	task->detach = 1;
			
 
				 
			
 
				-#if STARPU_TASK_INVALID != 0
			
 
				-	task->status = STARPU_TASK_INVALID;
			
 
				+#if STARPU_TASK_INIT != 0
			
 
				+	task->status = STARPU_TASK_INIT;
			
 
				 #endif
			
 
				 
			
 
				 	task->predicted = NAN;
			
@@ -766,9 +766,9 @@ static int _starpu_task_submit_head(struct starpu_task *task)
 
				 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
			
 
				 
			
 
				 	if (task->status == STARPU_TASK_STOPPED || task->status == STARPU_TASK_FINISHED)
			
 
				-		task->status = STARPU_TASK_INVALID;
			
 
				+		task->status = STARPU_TASK_INIT;
			
 
				 	else
			
 
				-		STARPU_ASSERT(task->status == STARPU_TASK_INVALID);
			
 
				+		STARPU_ASSERT(task->status == STARPU_TASK_INIT);
			
 
				 
			
 
				 	if (j->internal)
			
 
				 	{
			
@@ -1067,7 +1067,7 @@ int _starpu_task_submit_conversion_task(struct starpu_task *task,
 
				 	_starpu_increment_nready_tasks_of_sched_ctx(j->task->sched_ctx, j->task->flops, j->task);
			
 
				 	_starpu_job_set_ordered_buffers(j);
			
 
				 
			
 
				-	STARPU_ASSERT(task->status == STARPU_TASK_INVALID);
			
 
				+	STARPU_ASSERT(task->status == STARPU_TASK_INIT);
			
 
				 	task->status = STARPU_TASK_READY;
			
 
				 	_starpu_profiling_set_task_push_start_time(task);
			
 
				 
			
@@ -1668,7 +1668,7 @@ struct starpu_task *starpu_task_ft_create_retry
 
				 	new_task->failed = 0;
			
 
				 	new_task->scheduled = 0;
			
 
				 	new_task->prefetched = 0;
			
 
				-	new_task->status = STARPU_TASK_INVALID;
			
 
				+	new_task->status = STARPU_TASK_INIT;
			
 
				 	new_task->profiling_info = NULL;
			
 
				 	new_task->prev = NULL;
			
 
				 	new_task->next = NULL;
			
--- a/src/core/task_bundle.c
+++ b/src/core/task_bundle.c
@@ -50,7 +50,7 @@ int starpu_task_bundle_insert(starpu_task_bundle_t bundle, struct starpu_task *t
 
				 		return -EPERM;
			
 
				 	}
			
 
				 
			
 
				-	if (task->status != STARPU_TASK_INVALID)
			
 
				+	if (task->status != STARPU_TASK_INIT)
			
 
				 	{
			
 
				 		/* The task has already been submitted, it's too late to put it
			
 
				 		 * into a bundle now. */
			
--- a/src/datawizard/user_interactions.c
+++ b/src/datawizard/user_interactions.c
@@ -227,7 +227,7 @@ int starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(starpu_dat
 
				 			*pre_sync_jobid = pre_sync_job->job_id;
			
 
				 
			
 
				 		wrapper->post_sync_task = starpu_task_create();
			
 
				-		wrapper->post_sync_task->name = "_starpu_data_acquire_cb_post";
			
 
				+		wrapper->post_sync_task->name = "_starpu_data_acquire_cb_release";
			
 
				 		wrapper->post_sync_task->detach = 1;
			
 
				 		wrapper->post_sync_task->type = STARPU_TASK_TYPE_DATA_ACQUIRE;
			
 
				 		post_sync_job = _starpu_get_job_associated_to_task(wrapper->post_sync_task);
			
--- a/src/sched_policies/deque_modeling_policy_data_aware.c
+++ b/src/sched_policies/deque_modeling_policy_data_aware.c
@@ -443,151 +443,6 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-/* TODO: factorize with dmda!! */
			
 
				-static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched_ctx_id)
			
 
				-{
			
 
				-	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
			
 
				-	int best = -1;
			
 
				-
			
 
				-	double best_exp_end_of_task = 0.0;
			
 
				-	double model_best = 0.0;
			
 
				-	double transfer_model_best = 0.0;
			
 
				-
			
 
				-	int ntasks_best = -1;
			
 
				-	double ntasks_best_end = 0.0;
			
 
				-	int calibrating = 0;
			
 
				-
			
 
				-	/* A priori, we know all estimations */
			
 
				-	int unknown = 0;
			
 
				-
			
 
				-	unsigned best_impl = 0;
			
 
				-	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
			
 
				-
			
 
				-	struct starpu_sched_ctx_iterator it;
			
 
				-
			
 
				-	double now = starpu_timing_now();
			
 
				-
			
 
				-	workers->init_iterator_for_parallel_tasks(workers, &it, task);
			
 
				-	while(workers->has_next(workers, &it))
			
 
				-	{
			
 
				-		unsigned nimpl;
			
 
				-		unsigned impl_mask;
			
 
				-		unsigned worker = workers->get_next(workers, &it);
			
 
				-		struct _starpu_fifo_taskq *fifo  = &dt->queue_array[worker];
			
 
				-		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker, sched_ctx_id);
			
 
				-
			
 
				-		/* Sometimes workers didn't take the tasks as early as we expected */
			
 
				-		double exp_start = isnan(fifo->exp_start) ? now + fifo->pipeline_len : STARPU_MAX(fifo->exp_start, now);
			
 
				-
			
 
				-		if (!starpu_worker_can_execute_task_impl(worker, task, &impl_mask))
			
 
				-			continue;
			
 
				-
			
 
				-		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
			
 
				-		{
			
 
				-			if (!(impl_mask & (1U << nimpl)))
			
 
				-			{
			
 
				-				/* no one on that queue may execute this task */
			
 
				-				continue;
			
 
				-			}
			
 
				-
			
 
				-			double exp_end;
			
 
				-			double local_length = starpu_task_worker_expected_length(task, worker, sched_ctx_id, nimpl);
			
 
				-			double local_penalty = starpu_task_expected_data_transfer_time_for(task, worker);
			
 
				-			double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
			
 
				-
			
 
				-			//_STARPU_DEBUG("Scheduler dm: task length (%lf) worker (%u) kernel (%u) \n", local_length,worker,nimpl);
			
 
				-
			
 
				-			/*
			
 
				-			 * This implements a default greedy scheduler for the
			
 
				-			 * case of tasks which have no performance model, or
			
 
				-			 * whose performance model is not calibrated yet.
			
 
				-			 *
			
 
				-			 * It simply uses the number of tasks already pushed to
			
 
				-			 * the workers, divided by the relative performance of
			
 
				-			 * a CPU and of a GPU.
			
 
				-			 *
			
 
				-			 * This is always computed, but the ntasks_best
			
 
				-			 * selection is only really used if the task indeed has
			
 
				-			 * no performance model, or is not calibrated yet.
			
 
				-			 */
			
 
				-			if (ntasks_best == -1
			
 
				-
			
 
				-			    /* Always compute the greedy decision, at least for
			
 
				-			     * the tasks with no performance model. */
			
 
				-			    || (!calibrating && ntasks_end < ntasks_best_end)
			
 
				-
			
 
				-			    /* The performance model of this task is not
			
 
				-			     * calibrated on this worker, try to run it there
			
 
				-			     * to calibrate it there. */
			
 
				-			    || (!calibrating && isnan(local_length))
			
 
				-
			
 
				-			    /* the performance model of this task is not
			
 
				-			     * calibrated on this worker either, rather run it
			
 
				-			     * there if this one is low on scheduled tasks. */
			
 
				-			    || (calibrating && isnan(local_length) && ntasks_end < ntasks_best_end)
			
 
				-				)
			
 
				-			{
			
 
				-				ntasks_best_end = ntasks_end;
			
 
				-				ntasks_best = worker;
			
 
				-				best_impl = nimpl;
			
 
				-			}
			
 
				-
			
 
				-			if (isnan(local_length))
			
 
				-			{
			
 
				-				/* we are calibrating, we want to speed-up calibration time
			
 
				-				 * so we privilege non-calibrated tasks (but still
			
 
				-				 * greedily distribute them to avoid dumb schedules) */
			
 
				-				static int warned;
			
 
				-				if (!warned)
			
 
				-				{
			
 
				-					warned = 1;
			
 
				-					_STARPU_DISP("Warning: performance model for %s not finished calibrating on worker %u, using a dumb scheduling heuristic for now\n", starpu_task_get_name(task), worker);
			
 
				-				}
			
 
				-				calibrating = 1;
			
 
				-			}
			
 
				-
			
 
				-			if (isnan(local_length) || _STARPU_IS_ZERO(local_length))
			
 
				-				/* there is no prediction available for that task
			
 
				-				 * with that arch yet, so switch to a greedy strategy */
			
 
				-				unknown = 1;
			
 
				-
			
 
				-			if (unknown)
			
 
				-				continue;
			
 
				-
			
 
				-			exp_end = exp_start + fifo->exp_len + local_length;
			
 
				-
			
 
				-			if (best == -1 || exp_end < best_exp_end_of_task)
			
 
				-			{
			
 
				-				/* a better solution was found */
			
 
				-				best_exp_end_of_task = exp_end;
			
 
				-				best = worker;
			
 
				-				model_best = local_length;
			
 
				-				transfer_model_best = local_penalty;
			
 
				-				best_impl = nimpl;
			
 
				-			}
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	if (unknown)
			
 
				-	{
			
 
				-		best = ntasks_best;
			
 
				-		model_best = 0.0;
			
 
				-		transfer_model_best = 0.0;
			
 
				-#ifdef STARPU_VERBOSE
			
 
				-		dt->eager_task_cnt++;
			
 
				-#endif
			
 
				-	}
			
 
				-
			
 
				-	//_STARPU_DEBUG("Scheduler dm: kernel (%u)\n", best_impl);
			
 
				-
			
 
				-	starpu_task_set_implementation(task, best_impl);
			
 
				-
			
 
				-	starpu_sched_task_break(task);
			
 
				-	/* we should now have the best worker in variable "best" */
			
 
				-	return push_task_on_best_worker(task, best,
			
 
				-					model_best, transfer_model_best, prio, sched_ctx_id);
			
 
				-}
			
 
				-
			
 
				 /* TODO: factorise CPU computations, expensive with a lot of cores */
			
 
				 static void compute_all_performance_predictions(struct starpu_task *task,
			
 
				 						unsigned nworkers,
			
@@ -677,15 +532,19 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 
				 			{
			
 
				 				/* TODO : conversion time */
			
 
				 				local_task_length[worker_ctx][nimpl] = starpu_task_bundle_expected_length(bundle, perf_arch, nimpl);
			
 
				-				local_data_penalty[worker_ctx][nimpl] = starpu_task_bundle_expected_data_transfer_time(bundle, memory_node);
			
 
				-				local_energy[worker_ctx][nimpl] = starpu_task_bundle_expected_energy(bundle, perf_arch,nimpl);
			
 
				+				if (local_data_penalty)
			
 
				+					local_data_penalty[worker_ctx][nimpl] = starpu_task_bundle_expected_data_transfer_time(bundle, memory_node);
			
 
				+				if (local_energy)
			
 
				+					local_energy[worker_ctx][nimpl] = starpu_task_bundle_expected_energy(bundle, perf_arch,nimpl);
			
 
				 
			
 
				 			}
			
 
				 			else
			
 
				 			{
			
 
				 				local_task_length[worker_ctx][nimpl] = starpu_task_worker_expected_length(task, workerid, sched_ctx_id, nimpl);
			
 
				-				local_data_penalty[worker_ctx][nimpl] = starpu_task_expected_data_transfer_time_for(task, workerid);
			
 
				-				local_energy[worker_ctx][nimpl] = starpu_task_worker_expected_energy(task, workerid, sched_ctx_id,nimpl);
			
 
				+				if (local_data_penalty)
			
 
				+					local_data_penalty[worker_ctx][nimpl] = starpu_task_expected_data_transfer_time_for(task, workerid);
			
 
				+				if (local_energy)
			
 
				+					local_energy[worker_ctx][nimpl] = starpu_task_worker_expected_energy(task, workerid, sched_ctx_id,nimpl);
			
 
				 				double conversion_time = starpu_task_expected_conversion_time(task, perf_arch, nimpl);
			
 
				 				if (conversion_time > 0.0)
			
 
				 					local_task_length[worker_ctx][nimpl] += conversion_time;
			
@@ -742,7 +601,10 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 
				 			if (unknown)
			
 
				 				continue;
			
 
				 
			
 
				-			double task_starting_time = STARPU_MAX(exp_start + prev_exp_len, now + local_data_penalty[worker_ctx][nimpl]); 
			
 
				+			double task_starting_time = exp_start + prev_exp_len;
			
 
				+			if (local_data_penalty)
			
 
				+				task_starting_time = STARPU_MAX(task_starting_time,
			
 
				+					now + local_data_penalty[worker_ctx][nimpl]);
			
 
				 
			
 
				 			exp_end[worker_ctx][nimpl] = task_starting_time + local_task_length[worker_ctx][nimpl];
			
 
				 
			
@@ -753,8 +615,9 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 
				 				nimpl_best = nimpl;
			
 
				 			}
			
 
				 
			
 
				-			if (isnan(local_energy[worker_ctx][nimpl]))
			
 
				-				local_energy[worker_ctx][nimpl] = 0.;
			
 
				+			if (local_energy)
			
 
				+				if (isnan(local_energy[worker_ctx][nimpl]))
			
 
				+					local_energy[worker_ctx][nimpl] = 0.;
			
 
				 
			
 
				 		}
			
 
				 		worker_ctx++;
			
@@ -774,7 +637,7 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 
				 	*max_exp_endp_of_workers = max_exp_end_of_workers;
			
 
				 }
			
 
				 
			
 
				-static double _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sched_ctx_id, unsigned simulate, unsigned sorted_decision)
			
 
				+static double _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sched_ctx_id, unsigned da, unsigned simulate, unsigned sorted_decision)
			
 
				 {
			
 
				 	/* find the queue */
			
 
				 	int best = -1, best_in_ctx = -1;
			
@@ -812,8 +675,8 @@ static double _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned
 
				 					    exp_end,
			
 
				 					    &max_exp_end_of_workers,
			
 
				 					    &min_exp_end_of_task,
			
 
				-					    local_data_penalty,
			
 
				-					    local_energy,
			
 
				+					    da ? local_data_penalty : NULL,
			
 
				+					    da ? local_energy : NULL,
			
 
				 					    &forced_best,
			
 
				 					    &forced_impl, sched_ctx_id, sorted_decision);
			
 
				 
			
@@ -840,11 +703,14 @@ static double _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned
 
				 					/* no one on that queue may execute this task */
			
 
				 					continue;
			
 
				 				}
			
 
				-				fitness[worker_ctx][nimpl] = dt->alpha * __s_alpha__value *(exp_end[worker_ctx][nimpl] - min_exp_end_of_task)
			
 
				-					+ dt->beta * __s_beta__value *(local_data_penalty[worker_ctx][nimpl])
			
 
				-					+ dt->_gamma * __s_gamma__value *(local_energy[worker_ctx][nimpl]);
			
 
				+				if (da)
			
 
				+					fitness[worker_ctx][nimpl] = dt->alpha * __s_alpha__value *(exp_end[worker_ctx][nimpl] - min_exp_end_of_task)
			
 
				+						+ dt->beta * __s_beta__value *(local_data_penalty[worker_ctx][nimpl])
			
 
				+						+ dt->_gamma * __s_gamma__value *(local_energy[worker_ctx][nimpl]);
			
 
				+				else
			
 
				+					fitness[worker_ctx][nimpl] = exp_end[worker_ctx][nimpl] - min_exp_end_of_task;
			
 
				 
			
 
				-				if (exp_end[worker_ctx][nimpl] > max_exp_end_of_workers)
			
 
				+				if (da && exp_end[worker_ctx][nimpl] > max_exp_end_of_workers)
			
 
				 				{
			
 
				 					/* This placement will make the computation
			
 
				 					 * longer, take into account the idle
			
@@ -886,15 +752,17 @@ static double _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned
 
				 		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(best_in_ctx, sched_ctx_id);
			
 
				 		unsigned memory_node = starpu_worker_get_memory_node(best);
			
 
				 		model_best = starpu_task_expected_length(task, perf_arch, selected_impl);
			
 
				-		transfer_model_best = starpu_task_expected_data_transfer_time(memory_node, task);
			
 
				+		if (da)
			
 
				+			transfer_model_best = starpu_task_expected_data_transfer_time(memory_node, task);
			
 
				 	}
			
 
				 	else
			
 
				 	{
			
 
				 		model_best = local_task_length[best_in_ctx][selected_impl];
			
 
				-		transfer_model_best = local_data_penalty[best_in_ctx][selected_impl];
			
 
				+		if (da)
			
 
				+			transfer_model_best = local_data_penalty[best_in_ctx][selected_impl];
			
 
				 	}
			
 
				 
			
 
				-	//_STARPU_DEBUG("Scheduler dmda: kernel (%u)\n", best_impl);
			
 
				+	//_STARPU_DEBUG("Scheduler dmda: kernel (%u)\n", selected_impl);
			
 
				 	starpu_task_set_implementation(task, selected_impl);
			
 
				 
			
 
				 	starpu_sched_task_break(task);
			
@@ -911,7 +779,7 @@ static double _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned
 
				 
			
 
				 static int dmda_push_sorted_decision_task(struct starpu_task *task)
			
 
				 {
			
 
				-	return _dmda_push_task(task, 1, task->sched_ctx, 0, 1);
			
 
				+	return _dmda_push_task(task, 1, task->sched_ctx, 1, 0, 1);
			
 
				 }
			
 
				 
			
 
				 static int dmda_push_sorted_task(struct starpu_task *task)
			
@@ -919,35 +787,40 @@ static int dmda_push_sorted_task(struct starpu_task *task)
 
				 #ifdef STARPU_DEVEL
			
 
				 #warning TODO: after defining a scheduling window, use that instead of empty_ctx_tasks
			
 
				 #endif
			
 
				-	return _dmda_push_task(task, 1, task->sched_ctx, 0, 0);
			
 
				+	return _dmda_push_task(task, 1, task->sched_ctx, 1, 0, 0);
			
 
				 }
			
 
				 
			
 
				 static int dm_push_task(struct starpu_task *task)
			
 
				 {
			
 
				-	return _dm_push_task(task, 0, task->sched_ctx);
			
 
				+	return _dmda_push_task(task, 0, task->sched_ctx, 0, 0, 0);
			
 
				+}
			
 
				+
			
 
				+static double dm_simulate_push_task(struct starpu_task *task)
			
 
				+{
			
 
				+	return _dmda_push_task(task, 0, task->sched_ctx, 0, 1, 0);
			
 
				 }
			
 
				 
			
 
				 static int dmda_push_task(struct starpu_task *task)
			
 
				 {
			
 
				 	STARPU_ASSERT(task);
			
 
				-	return _dmda_push_task(task, 0, task->sched_ctx, 0, 0);
			
 
				+	return _dmda_push_task(task, 0, task->sched_ctx, 1, 0, 0);
			
 
				 }
			
 
				 static double dmda_simulate_push_task(struct starpu_task *task)
			
 
				 {
			
 
				 	STARPU_ASSERT(task);
			
 
				-	return _dmda_push_task(task, 0, task->sched_ctx, 1, 0);
			
 
				+	return _dmda_push_task(task, 0, task->sched_ctx, 1, 1, 0);
			
 
				 }
			
 
				 
			
 
				 static double dmda_simulate_push_sorted_task(struct starpu_task *task)
			
 
				 {
			
 
				 	STARPU_ASSERT(task);
			
 
				-	return _dmda_push_task(task, 1, task->sched_ctx, 1, 0);
			
 
				+	return _dmda_push_task(task, 1, task->sched_ctx, 1, 1, 0);
			
 
				 }
			
 
				 
			
 
				 static double dmda_simulate_push_sorted_decision_task(struct starpu_task *task)
			
 
				 {
			
 
				 	STARPU_ASSERT(task);
			
 
				-	return _dmda_push_task(task, 1, task->sched_ctx, 1, 1);
			
 
				+	return _dmda_push_task(task, 1, task->sched_ctx, 1, 1, 1);
			
 
				 }
			
 
				 
			
 
				 #ifdef NOTIFY_READY_SOON
			
@@ -1092,7 +965,7 @@ static void dmda_pre_exec_hook(struct starpu_task *task, unsigned sched_ctx_id)
 
				 	starpu_worker_unlock_self();
			
 
				 }
			
 
				 
			
 
				-static void dmda_push_task_notify(struct starpu_task *task, int workerid, int perf_workerid, unsigned sched_ctx_id)
			
 
				+static void _dm_push_task_notify(struct starpu_task *task, int workerid, int perf_workerid, unsigned sched_ctx_id, int da)
			
 
				 {
			
 
				 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
			
 
				 	struct _starpu_fifo_taskq *fifo = &dt->queue_array[workerid];
			
@@ -1100,8 +973,11 @@ static void dmda_push_task_notify(struct starpu_task *task, int workerid, int pe
 
				 	/* Compute the expected penalty */
			
 
				 	double predicted = starpu_task_worker_expected_length(task, perf_workerid, STARPU_NMAX_SCHED_CTXS,
			
 
				 						       starpu_task_get_implementation(task));
			
 
				+	double predicted_transfer = NAN;
			
 
				+
			
 
				+	if (da)
			
 
				+		predicted_transfer = starpu_task_expected_data_transfer_time_for(task, workerid);
			
 
				 
			
 
				-	double predicted_transfer = starpu_task_expected_data_transfer_time_for(task, workerid);
			
 
				 	double now = starpu_timing_now();
			
 
				 
			
 
				 	/* Update the predictions */
			
@@ -1110,32 +986,35 @@ static void dmda_push_task_notify(struct starpu_task *task, int workerid, int pe
 
				 	fifo->exp_start = isnan(fifo->exp_start) ? now + fifo->pipeline_len : STARPU_MAX(fifo->exp_start, now);
			
 
				 	fifo->exp_end = fifo->exp_start + fifo->exp_len;
			
 
				 
			
 
				-	/* If there is no prediction available, we consider the task has a null length */
			
 
				-	if (!isnan(predicted_transfer))
			
 
				+	if (da)
			
 
				 	{
			
 
				-		if (now + predicted_transfer < fifo->exp_end)
			
 
				+		/* If there is no prediction available, we consider the task has a null length */
			
 
				+		if (!isnan(predicted_transfer))
			
 
				 		{
			
 
				-			/* We may hope that the transfer will be finished by
			
 
				-			 * the start of the task. */
			
 
				-			predicted_transfer = 0;
			
 
				-		}
			
 
				-		else
			
 
				-		{
			
 
				-			/* The transfer will not be finished by then, take the
			
 
				-			 * remainder into account */
			
 
				-			predicted_transfer = (now + predicted_transfer) - fifo->exp_end;
			
 
				-		}
			
 
				-		task->predicted_transfer = predicted_transfer;
			
 
				-		fifo->exp_end += predicted_transfer;
			
 
				-		fifo->exp_len += predicted_transfer;
			
 
				-		if(dt->num_priorities != -1)
			
 
				-		{
			
 
				-			int i;
			
 
				-			int task_prio = _starpu_normalize_prio(task->priority, dt->num_priorities, task->sched_ctx);
			
 
				-			for(i = 0; i <= task_prio; i++)
			
 
				-				fifo->exp_len_per_priority[i] += predicted_transfer;
			
 
				-		}
			
 
				+			if (now + predicted_transfer < fifo->exp_end)
			
 
				+			{
			
 
				+				/* We may hope that the transfer will be finished by
			
 
				+				 * the start of the task. */
			
 
				+				predicted_transfer = 0;
			
 
				+			}
			
 
				+			else
			
 
				+			{
			
 
				+				/* The transfer will not be finished by then, take the
			
 
				+				 * remainder into account */
			
 
				+				predicted_transfer = (now + predicted_transfer) - fifo->exp_end;
			
 
				+			}
			
 
				+			task->predicted_transfer = predicted_transfer;
			
 
				+			fifo->exp_end += predicted_transfer;
			
 
				+			fifo->exp_len += predicted_transfer;
			
 
				+			if(dt->num_priorities != -1)
			
 
				+			{
			
 
				+				int i;
			
 
				+				int task_prio = _starpu_normalize_prio(task->priority, dt->num_priorities, task->sched_ctx);
			
 
				+				for(i = 0; i <= task_prio; i++)
			
 
				+					fifo->exp_len_per_priority[i] += predicted_transfer;
			
 
				+			}
			
 
				 
			
 
				+		}
			
 
				 	}
			
 
				 
			
 
				 	/* If there is no prediction available, we consider the task has a null length */
			
@@ -1166,6 +1045,16 @@ static void dmda_push_task_notify(struct starpu_task *task, int workerid, int pe
 
				 	starpu_worker_unlock(workerid);
			
 
				 }
			
 
				 
			
 
				+static void dm_push_task_notify(struct starpu_task *task, int workerid, int perf_workerid, unsigned sched_ctx_id)
			
 
				+{
			
 
				+	_dm_push_task_notify(task, workerid, perf_workerid, sched_ctx_id, 0);
			
 
				+}
			
 
				+
			
 
				+static void dmda_push_task_notify(struct starpu_task *task, int workerid, int perf_workerid, unsigned sched_ctx_id)
			
 
				+{
			
 
				+	_dm_push_task_notify(task, workerid, perf_workerid, sched_ctx_id, 1);
			
 
				+}
			
 
				+
			
 
				 static void dmda_post_exec_hook(struct starpu_task * task, unsigned sched_ctx_id)
			
 
				 {
			
 
				 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
			
@@ -1183,7 +1072,8 @@ struct starpu_sched_policy _starpu_sched_dm_policy =
 
				 	.add_workers = dmda_add_workers ,
			
 
				 	.remove_workers = dmda_remove_workers,
			
 
				 	.push_task = dm_push_task,
			
 
				-	.simulate_push_task = NULL,
			
 
				+	.simulate_push_task = dm_simulate_push_task,
			
 
				+	.push_task_notify = dm_push_task_notify,
			
 
				 	.pop_task = dmda_pop_task,
			
 
				 	.pre_exec_hook = dmda_pre_exec_hook,
			
 
				 	.post_exec_hook = dmda_post_exec_hook,
			
--- a/tools/gdbinit
+++ b/tools/gdbinit
@@ -40,7 +40,7 @@ define starpu-print-task
 
				   set $job = (struct _starpu_job *)$task->starpu_private
			
 
				   set $status=0
			
 
				   if $task->status == 0
			
 
				-    set $status="STARPU_TASK_INVALID"
			
 
				+    set $status="STARPU_TASK_INIT"
			
 
				   end
			
 
				   if $task->status == 1
			
 
				     set $status="STARPU_TASK_BLOCKED"
			
@@ -98,10 +98,11 @@ define starpu-print-task
 
				       if ! $job->job_successors->done[$n]
			
 
				         set $cg = $job->job_successors->deps[$n]
			
 
				         set $m = 0
			
 
				+        printf "\t\tcg:\t\t\t<%u>\n", $cg->ndeps
			
 
				 	while $m < $cg->ndeps
			
 
				 	  if ! $cg->done[$m]
			
 
				 	    set $depj = (struct _starpu_job *) $cg->deps[$m]
			
 
				-            printf "\t\ttask %p\n", $depj->task
			
 
				+            printf "\t\t\ttask %p\n", $depj->task
			
 
				 	  end
			
 
				 	  set $m = $m + 1
			
 
				 	end
			
@@ -219,7 +220,31 @@ define starpu-print-tag
 
				   if $tag_struct->state == STARPU_DONE
			
 
				      set $status="STARPU_DONE"
			
 
				   end
			
 
				-  printf "tag %d state %s\n", $arg0, $status
			
 
				+  printf "tag %d ((struct _starpu_tag *) %p)\n", $arg0, $tag_struct
			
 
				+  printf "\tstate %s\n", $status
			
 
				+  printf "\tdeps %d\n", $tag_struct->tag_successors.ndeps
			
 
				+  printf "\tcompleted %d\n", $tag_struct->tag_successors.ndeps_completed
			
 
				+  printf "\tndeps_remaining:\t\t<%u>\n", $tag_struct->tag_successors->ndeps - $tag_struct->tag_successors->ndeps_completed
			
 
				+  if _starpu_debug
			
 
				+    set $n = 0
			
 
				+    while $n < $tag_struct->tag_successors->ndeps
			
 
				+      if ! $tag_struct->tag_successors->done[$n]
			
 
				+        set $cg = $tag_struct->tag_successors->deps[$n]
			
 
				+        set $m = 0
			
 
				+        printf "\t\tcg:\t\t\t<%u>\n", $cg->ndeps
			
 
				+	while $m < $cg->ndeps
			
 
				+	  if ! $cg->done[$m]
			
 
				+	    set $dept = (starpu_tag_t) $cg->deps[$m]
			
 
				+            printf "\t\t\ttag %u\n", $dept
			
 
				+	  end
			
 
				+	  set $m = $m + 1
			
 
				+	end
			
 
				+      end
			
 
				+      set $n = $n + 1
			
 
				+    end
			
 
				+  end
			
 
				+  printf "\tndeps_completed:\t\t<%u>\n", $tag_struct->tag_successors->ndeps_completed
			
 
				+  printf "\tnsuccs:\t\t\t\t<%u>\n", $tag_struct->tag_successors->nsuccs
			
 
				 end
			
 
				 
			
 
				 define starpu-tags