Browse Source

Merge branch 'master' into fpga

Nathalie Furmento 5 years ago
parent
commit
1849722415

+ 14 - 4
ChangeLog

@@ -44,11 +44,16 @@ New features:
   * Add starpu_data_dup_ro().
 
 Small changes:
-  * Use the S4U interface of Simgrid instead of xbt and MSG.
   * Add a synthetic energy efficiency testcase.
 
-StarPU 1.3.4 (git revision xxx)
-==============================================
+StarPU 1.3.5 (git revision xxx)
+====================================================================
+
+Small changes:
+  * Move MPI cache functions into the public API
+
+StarPU 1.3.4 (git revision c37a5d024cd997596da41f765557c58099baf896)
+====================================================================
 
 Small features:
   * New environment variables STARPU_BUS_STATS_FILE and
@@ -69,12 +74,17 @@ Small features:
   * Add field starpu_conf::precedence_over_environment_variables to ignore
     environment variables when parameters are set directly in starpu_conf
   * Add starpu_data_get_coordinates_array
+  * MPI: new functions starpu_mpi_interface_datatype_register() and
+    starpu_mpi_interface_datatype_unregister() which take an enum
+    starpu_data_interface_id instead of a starpu_data_handle_t
+  * New script starpu_env to set up StarPU environment variables
   * New STARPU_BACKOFF_MIN and STARPU_BACKOFF_MAX environment variables to set
     the exponential backoff limits of the number of cycles to pause while drivers
     are spinning.
   * Add STARPU_DISPLAY_BINDINGS environment variable and
     starpu_display_bindings() function to display all bindings on the machine by
     calling hwloc-ps
+
 Small changes:
   * New configure option --disable-build-doc-pdf
 
@@ -116,7 +126,7 @@ Small features:
     STARPU_TASK_PROFILING_INFO
   * New function starpu_create_callback_task() which creates and
     submits an empty task with the specified callback
-
+  * Use the S4U interface of Simgrid instead of xbt and MSG.
 
 Small changes:
    * Default modular worker queues to 2 tasks unless it's an heft

+ 1 - 1
Makefile.am

@@ -199,7 +199,7 @@ ctags-local:
 # The pmccabe tool, see <http://www.parisc-linux.org/~bame/pmccabe/>.
 PMCCABE = pmccabe
 
-VC_URL = "https://gforge.inria.fr/scm/viewvc.php/trunk/%FILENAME%?view=markup&root=starpu"
+VC_URL = "https://gitlab.inria.fr/starpu/starpu/-/blob/master/%FILENAME%"
 
 # Generate a cyclomatic complexity report.  Note that examples and tests are
 # excluded because they're not particularly relevant, and more importantly

+ 3 - 7
README

@@ -87,15 +87,11 @@ advantage of their specificities in a portable fashion.
 || III. Getting StarPU ||
 ++=====================++
 
-StarPU is available on https://gforge.inria.fr/projects/starpu/.
+StarPU is available on https://gitlab.inria.fr/starpu/starpu
 
-The project's SVN repository can be checked out through anonymous
-access with the following command(s).
+The Git repository can be checked out with the following command.
 
-$ svn checkout svn://scm.gforge.inria.fr/svn/starpu/trunk
-$ svn checkout --username anonsvn https://scm.gforge.inria.fr/svn/starpu/trunk
-
-The password is 'anonsvn'
+$ git clone git@gitlab.inria.fr:starpu/starpu.git
 
 ++=============================++
 || IV. Building and Installing ||

+ 1 - 1
configure.ac

@@ -18,7 +18,7 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 #
-AC_INIT([StarPU], [1.3.99], [starpu-devel@lists.gforge.inria.fr], [starpu], [http://starpu.gforge.inria.fr/])
+AC_INIT([StarPU], [1.3.99], [starpu-devel@lists.gforge.inria.fr], [starpu], [http://gitlab.inria.fr/starpu/starpu])
 AC_CONFIG_SRCDIR(include/starpu.h)
 AC_CONFIG_AUX_DIR([build-aux])
 

+ 1 - 1
contrib/ci.inria.fr/disabled/Jenkinsfile-basic

@@ -24,7 +24,7 @@ pipeline
 	// Trigger the build
 	triggers
 	{
-		// Poll gforge explicitly every hour
+		// Poll SCM explicitly every hour
 		pollSCM('0 * * * *')
 	}
 

+ 1 - 1
contrib/ci.inria.fr/disabled/Jenkinsfile-bsd

@@ -24,7 +24,7 @@ pipeline
 	// Trigger the build
 	triggers
 	{
-		// Poll gforge explicitly every past-half hour
+		// Poll SCM explicitly every past-half hour
 		pollSCM('30 * * * *')
 	}
 

+ 3 - 3
doc/doxygen/Makefile.am

@@ -304,8 +304,8 @@ endif
 EXTRA_DIST += doxygen.cfg refman.tex \
 	      $(chapters) $(images)
 
-# Rule to update documentation on web server. Should only be used locally.
-PUBLISHHOST	?= gforge
+# Rule to update documentation on web server. Should only be called from benchmarks dalton directory
+PUBLISHDIR	?= /home/benchmarks/softs/starpu/starpu-scripts/mirror/files/doc
 update-web: $(DOX_PDF)
-	scp -pr starpu.pdf html $(PUBLISHHOST):/home/groups/starpu/htdocs/files/doc
+	cp -pr starpu.pdf html $(PUBLISHDIR)
 

+ 3 - 3
doc/doxygen/chapters/000_introduction.doxy

@@ -77,9 +77,9 @@ policies in a portable fashion (\ref HowToDefineANewSchedulingPolicy).
 The remainder of this section describes the main concepts used in StarPU.
 
 A video is available on the StarPU website
-http://starpu.gforge.inria.fr/ that presents these concepts in 26 minutes.
+https://starpu.gitlabpages.inria.fr/ that presents these concepts in 26 minutes.
 
-Some tutorials are also available on http://starpu.gforge.inria.fr/tutorials/
+Some tutorials are also available on https://starpu.gitlabpages.inria.fr/tutorials/
 
 // explain the notion of codelet and task (i.e. g(A, B)
 
@@ -190,7 +190,7 @@ unregister it.
 \section ResearchPapers Research Papers
 
 Research papers about StarPU can be found at
-http://starpu.gforge.inria.fr/publications/.
+https://starpu.gitlabpages.inria.fr/publications/.
 
 A good overview is available in the research report at
 http://hal.archives-ouvertes.fr/inria-00467677.

+ 6 - 11
doc/doxygen/chapters/101_building.doxy

@@ -61,27 +61,22 @@ script <c>configure</c>.
 \subsection GettingSources Getting Sources
 
 StarPU's sources can be obtained from the download page of
-the StarPU website (http://starpu.gforge.inria.fr/files/).
+the StarPU website (https://starpu.gitlabpages.inria.fr/files/).
 
 All releases and the development tree of StarPU are freely available
-on Inria's gforge under the LGPL license. Some releases are available
+on the StarPU SCM server under the LGPL license. Some releases are available
 under the BSD license.
 
-The latest release can be downloaded from the Inria's gforge (http://gforge.inria.fr/frs/?group_id=1570) or
-directly from the StarPU download page (http://starpu.gforge.inria.fr/files/).
+The latest release can be downloaded from the StarPU download page (https://starpu.gitlabpages.inria.fr/files/).
 
-The latest nightly snapshot can be downloaded from the StarPU gforge website (http://starpu.gforge.inria.fr/testing/).
-
-\verbatim
-$ wget http://starpu.gforge.inria.fr/testing/starpu-nightly-latest.tar.gz
-\endverbatim
+The latest nightly snapshot can be downloaded from the StarPU website (https://starpu.gitlabpages.inria.fr/files/testing/).
 
 And finally, the current development version is also accessible via git.
 It should only be used if you need the very latest changes (i.e. less
 than a day old!).
 
 \verbatim
-$ git clone https://scm.gforge.inria.fr/anonscm/git/starpu/starpu.git
+$ git clone git@gitlab.inria.fr:starpu/starpu.git
 \endverbatim
 
 \subsection ConfiguringStarPU Configuring StarPU
@@ -139,7 +134,7 @@ $ make
 Once everything is built, you may want to test the result. An
 extensive set of regression tests is provided with StarPU. Running the
 tests is done by calling <c>make check</c>. These tests are run every night
-and the result from the main profile is publicly available (http://starpu.gforge.inria.fr/testing/master/).
+and the result from the main profile is publicly available (https://starpu.gitlabpages.inria.fr/files/testing/master/).
 
 \verbatim
 $ make check

+ 10 - 2
doc/doxygen/chapters/410_mpi_support.doxy

@@ -759,8 +759,16 @@ add fine-graph starpu_mpi_cache_flush() calls during the algorithm; the effect
 for the data deallocation will be the same, but it will additionally release some
 pressure from the StarPU-MPI cache hash table during task submission.
 
-One can determine whether a piece of is cached with starpu_mpi_cached_receive()
-and starpu_mpi_cached_send().
+One can determine whether a piece of data is cached with
+starpu_mpi_cached_receive() and starpu_mpi_cached_send().
+
+Functions starpu_mpi_cached_receive_set() and
+starpu_mpi_cached_send_set() are automatically called by
+starpu_mpi_task_insert() but can also be called directly by the
+application. Functions starpu_mpi_cached_send_clear() and
+starpu_mpi_cached_receive_clear() must be called to clear data from
+the cache. They are also automatically called when using
+starpu_mpi_task_insert().
 
 The whole caching behavior can be disabled thanks to the \ref STARPU_MPI_CACHE
 environment variable. The variable \ref STARPU_MPI_CACHE_STATS can be set to <c>1</c>

+ 3 - 3
doc/doxygen_dev/Makefile.am

@@ -245,8 +245,8 @@ endif
 EXTRA_DIST += doxygen.cfg refman.tex \
 	      $(chapters) $(images)
 
-# Rule to update documentation on web server. Should only be used locally.
-PUBLISHHOST	?= gforge
+# Rule to update documentation on web server. Should only be called from benchmarks dalton directory
+PUBLISHDIR	?= /home/benchmarks/softs/starpu/starpu-scripts/mirror/files/doc
 update-web: $(DOX_PDF)
-	scp -pr starpu_dev.pdf html_dev $(PUBLISHHOST):/home/groups/starpu/htdocs/files/doc
+	cp -pr starpu_dev.pdf html_dev $(PUBLISHDIR)
 

+ 4 - 3
include/starpu_task.h

@@ -151,8 +151,9 @@ enum starpu_codelet_type
 
 enum starpu_task_status
 {
-	STARPU_TASK_INVALID,     /**< The task has just been initialized. */
-#define STARPU_TASK_INVALID 0
+	STARPU_TASK_INIT,        /**< The task has just been initialized. */
+#define STARPU_TASK_INIT 0
+#define STARPU_TASK_INVALID STARPU_TASK_INIT  /**< old name for STARPU_TASK_INIT */
 	STARPU_TASK_BLOCKED,     /**< The task has just been
 				    submitted, and its dependencies have not been checked yet. */
 	STARPU_TASK_READY,       /**< The task is ready for execution. */
@@ -1295,7 +1296,7 @@ struct starpu_task
 	.detach = 1,					\
 	.destroy = 0,					\
 	.regenerate = 0,				\
-	.status = STARPU_TASK_INVALID,			\
+	.status = STARPU_TASK_INIT,			\
 	.profiling_info = NULL,				\
 	.predicted = NAN,				\
 	.predicted_transfer = NAN,			\

+ 11 - 0
mpi/examples/Makefile.am

@@ -128,6 +128,17 @@ starpu_mpi_EXAMPLES	+=	\
 endif
 
 ##################
+# Cache examples #
+##################
+examplebin_PROGRAMS +=		\
+	cache/cache		\
+	cache/cache_disable
+starpu_mpi_EXAMPLES +=		\
+	cache/cache		\
+	cache/cache_disable
+
+
+##################
 # MPI LU example #
 ##################
 

+ 1 - 2
mpi/tests/cache.c

@@ -17,7 +17,6 @@
 #include <starpu_mpi.h>
 #include <math.h>
 #include "helper.h"
-#include <starpu_mpi_cache.h>
 
 void func_cpu(void *descr[], void *_args)
 {
@@ -57,7 +56,7 @@ void test(struct starpu_codelet *codelet, enum starpu_data_access_mode mode, sta
 	ret = starpu_mpi_task_insert(MPI_COMM_WORLD, codelet, mode, data, STARPU_EXECUTE_ON_NODE, 1, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_task_insert");
 
-	cache = _starpu_mpi_cache_received_data_get(data);
+	cache = starpu_mpi_cached_receive(data);
 
 	if (rank == 1)
 	{

+ 3 - 4
mpi/tests/cache_disable.c

@@ -17,7 +17,6 @@
 #include <starpu_mpi.h>
 #include <math.h>
 #include "helper.h"
-#include <starpu_mpi_cache.h>
 
 void func_cpu(void *descr[], void *_args)
 {
@@ -63,7 +62,7 @@ int main(int argc, char **argv)
 	ret = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet_r, STARPU_R, data, STARPU_EXECUTE_ON_NODE, 1, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_task_insert");
 
-	in_cache = _starpu_mpi_cache_received_data_get(data);
+	in_cache = starpu_mpi_cached_receive(data);
 	if (rank == 1)
 	{
 		STARPU_ASSERT_MSG(in_cache == 1, "Data should be in cache\n");
@@ -73,7 +72,7 @@ int main(int argc, char **argv)
 	starpu_mpi_cache_set(0);
 
 	// We check the data is no longer in the cache
-	in_cache = _starpu_mpi_cache_received_data_get(data);
+	in_cache = starpu_mpi_cached_receive(data);
 	if (rank == 1)
 	{
 		STARPU_ASSERT_MSG(in_cache == 0, "Data should NOT be in cache\n");
@@ -81,7 +80,7 @@ int main(int argc, char **argv)
 
 	ret = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet_r, STARPU_R, data, STARPU_EXECUTE_ON_NODE, 1, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_task_insert");
-	in_cache = _starpu_mpi_cache_received_data_get(data);
+	in_cache = starpu_mpi_cached_receive(data);
 	if (rank == 1)
 	{
 		STARPU_ASSERT_MSG(in_cache == 0, "Data should NOT be in cache\n");

+ 27 - 0
mpi/include/starpu_mpi.h

@@ -422,12 +422,39 @@ void starpu_mpi_cache_flush_all_data(MPI_Comm comm);
 int starpu_mpi_cached_receive(starpu_data_handle_t data_handle);
 
 /**
+ * If \p data is already available in the reception cache, return 1
+ * If \p data is NOT available in the reception cache, add it to the
+ * cache and return 0
+ * Return 0 if the communication cache is not enabled
+ */
+int starpu_mpi_cached_receive_set(starpu_data_handle_t data);
+
+/**
+ * Remove \p data from the reception cache
+ */
+void starpu_mpi_cached_receive_clear(starpu_data_handle_t data);
+
+/**
    Test whether \p data_handle is cached for emission to node \p dest,
    i.e. the value was previously sent to \p dest, and not flushed
    since then.
 */
 int starpu_mpi_cached_send(starpu_data_handle_t data_handle, int dest);
 
+/**
+ * If \p data is already available in the emission cache for node
+ * \p dest, return 1
+ * If \p data is NOT available in the emission cache for node \p dest,
+ * add it to the cache and return 0
+ * Return 0 if the communication cache is not enabled
+ */
+int starpu_mpi_cached_send_set(starpu_data_handle_t data, int dest);
+
+/**
+ * Remove \p data from the emission cache
+ */
+void starpu_mpi_cached_send_clear(starpu_data_handle_t data);
+
 /** @} */
 
 /**

+ 4 - 4
mpi/src/starpu_mpi.c

@@ -346,7 +346,7 @@ void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t da
 	if (me == node)
 	{
 		_STARPU_MPI_DEBUG(1, "Migrating data %p from %d to %d\n", data_handle, rank, node);
-		int already_received = _starpu_mpi_cache_received_data_set(data_handle);
+		int already_received = starpu_mpi_cached_receive_set(data_handle);
 		if (already_received == 0)
 		{
 			_STARPU_MPI_DEBUG(1, "Receiving data %p from %d\n", data_handle, rank);
@@ -356,7 +356,7 @@ void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t da
 	else if (me == rank)
 	{
 		_STARPU_MPI_DEBUG(1, "Migrating data %p from %d to %d\n", data_handle, rank, node);
-		int already_sent = _starpu_mpi_cache_sent_data_set(data_handle, node);
+		int already_sent = starpu_mpi_cached_send_set(data_handle, node);
 		if (already_sent == 0)
 		{
 			_STARPU_MPI_DEBUG(1, "Sending data %p to %d\n", data_handle, node);
@@ -389,7 +389,7 @@ void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle
 	{
 		MPI_Status status;
 		_STARPU_MPI_DEBUG(1, "Migrating data %p from %d to %d\n", data_handle, rank, node);
-		int already_received = _starpu_mpi_cache_received_data_set(data_handle);
+		int already_received = starpu_mpi_cached_receive_set(data_handle);
 		if (already_received == 0)
 		{
 			_STARPU_MPI_DEBUG(1, "Receiving data %p from %d\n", data_handle, rank);
@@ -399,7 +399,7 @@ void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle
 	else if (me == rank)
 	{
 		_STARPU_MPI_DEBUG(1, "Migrating data %p from %d to %d\n", data_handle, rank, node);
-		int already_sent = _starpu_mpi_cache_sent_data_set(data_handle, node);
+		int already_sent = starpu_mpi_cached_send_set(data_handle, node);
 		if (already_sent == 0)
 		{
 			_STARPU_MPI_DEBUG(1, "Sending data %p to %d\n", data_handle, node);

+ 6 - 16
mpi/src/starpu_mpi_cache.c

@@ -172,7 +172,7 @@ static void _starpu_mpi_cache_data_remove_nolock(starpu_data_handle_t data_handl
 /**************************************
  * Received cache
  **************************************/
-void _starpu_mpi_cache_received_data_clear(starpu_data_handle_t data_handle)
+void starpu_mpi_cached_receive_clear(starpu_data_handle_t data_handle)
 {
 	int mpi_rank = starpu_mpi_data_get_rank(data_handle);
 	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
@@ -198,7 +198,7 @@ void _starpu_mpi_cache_received_data_clear(starpu_data_handle_t data_handle)
 	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
 }
 
-int _starpu_mpi_cache_received_data_set(starpu_data_handle_t data_handle)
+int starpu_mpi_cached_receive_set(starpu_data_handle_t data_handle)
 {
 	int mpi_rank = starpu_mpi_data_get_rank(data_handle);
 	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
@@ -226,7 +226,7 @@ int _starpu_mpi_cache_received_data_set(starpu_data_handle_t data_handle)
 	return already_received;
 }
 
-int _starpu_mpi_cache_received_data_get(starpu_data_handle_t data_handle)
+int starpu_mpi_cached_receive(starpu_data_handle_t data_handle)
 {
 	int already_received;
 	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
@@ -241,15 +241,10 @@ int _starpu_mpi_cache_received_data_get(starpu_data_handle_t data_handle)
 	return already_received;
 }
 
-int starpu_mpi_cached_receive(starpu_data_handle_t data_handle)
-{
-	return _starpu_mpi_cache_received_data_get(data_handle);
-}
-
 /**************************************
  * Send cache
  **************************************/
-void _starpu_mpi_cache_sent_data_clear(starpu_data_handle_t data_handle)
+void starpu_mpi_cached_send_clear(starpu_data_handle_t data_handle)
 {
 	int n, size;
 	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
@@ -271,7 +266,7 @@ void _starpu_mpi_cache_sent_data_clear(starpu_data_handle_t data_handle)
 	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
 }
 
-int _starpu_mpi_cache_sent_data_set(starpu_data_handle_t data_handle, int dest)
+int starpu_mpi_cached_send_set(starpu_data_handle_t data_handle, int dest)
 {
 	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
 
@@ -296,7 +291,7 @@ int _starpu_mpi_cache_sent_data_set(starpu_data_handle_t data_handle, int dest)
 	return already_sent;
 }
 
-int _starpu_mpi_cache_sent_data_get(starpu_data_handle_t data_handle, int dest)
+int starpu_mpi_cached_send(starpu_data_handle_t data_handle, int dest)
 {
 	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
 	int already_sent;
@@ -311,11 +306,6 @@ int _starpu_mpi_cache_sent_data_get(starpu_data_handle_t data_handle, int dest)
 	return already_sent;
 }
 
-int starpu_mpi_cached_send(starpu_data_handle_t data_handle, int dest)
-{
-	return _starpu_mpi_cache_sent_data_get(data_handle, dest);
-}
-
 static void _starpu_mpi_cache_flush_nolock(starpu_data_handle_t data_handle)
 {
 	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;

+ 0 - 16
mpi/src/starpu_mpi_cache.h

@@ -32,22 +32,6 @@ void _starpu_mpi_cache_shutdown();
 void _starpu_mpi_cache_data_init(starpu_data_handle_t data_handle);
 void _starpu_mpi_cache_data_clear(starpu_data_handle_t data_handle);
 
-/*
- * If the data is already available in the cache, return a pointer to the data
- * If the data is NOT available in the cache, add it to the cache and return NULL
- */
-int _starpu_mpi_cache_received_data_set(starpu_data_handle_t data);
-int _starpu_mpi_cache_received_data_get(starpu_data_handle_t data);
-void _starpu_mpi_cache_received_data_clear(starpu_data_handle_t data);
-
-/*
- * If the data is already available in the cache, return a pointer to the data
- * If the data is NOT available in the cache, add it to the cache and return NULL
- */
-int _starpu_mpi_cache_sent_data_set(starpu_data_handle_t data, int dest);
-int _starpu_mpi_cache_sent_data_get(starpu_data_handle_t data, int dest);
-void _starpu_mpi_cache_sent_data_clear(starpu_data_handle_t data);
-
 void _starpu_mpi_cache_flush(starpu_data_handle_t data_handle);
 
 #ifdef __cplusplus

+ 4 - 4
mpi/src/starpu_mpi_task_insert.c

@@ -112,7 +112,7 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
 		if (do_execute && mpi_rank != STARPU_MPI_PER_NODE && mpi_rank != me)
 		{
 			/* The node is going to execute the codelet, but it does not own the data, it needs to receive the data from the owner node */
-			int already_received = _starpu_mpi_cache_received_data_set(data);
+			int already_received = starpu_mpi_cached_receive_set(data);
 			if (already_received == 0)
 			{
 				if (data_tag == -1)
@@ -126,7 +126,7 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
 		if (!do_execute && mpi_rank == me)
 		{
 			/* The node owns the data, but another node is going to execute the codelet, the node needs to send the data to the executee node. */
-			int already_sent = _starpu_mpi_cache_sent_data_set(data, xrank);
+			int already_sent = starpu_mpi_cached_send_set(data, xrank);
 			if (already_sent == 0)
 			{
 				if (data_tag == -1)
@@ -182,8 +182,8 @@ void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum star
 		if (mode & STARPU_W || mode & STARPU_REDUX)
 		{
 			/* The data has been modified, it MUST be removed from the cache */
-			_starpu_mpi_cache_sent_data_clear(data);
-			_starpu_mpi_cache_received_data_clear(data);
+			starpu_mpi_cached_send_clear(data);
+			starpu_mpi_cached_receive_clear(data);
 		}
 	}
 	else

+ 0 - 4
mpi/tests/Makefile.am

@@ -96,8 +96,6 @@ starpu_mpi_TESTS =
 
 starpu_mpi_TESTS +=				\
 	broadcast				\
-	cache					\
-	cache_disable				\
 	callback				\
 	driver					\
 	early_request				\
@@ -192,8 +190,6 @@ noinst_PROGRAMS +=				\
 	block_interface_pinned			\
 	attr					\
 	broadcast				\
-	cache					\
-	cache_disable				\
 	callback				\
 	matrix					\
 	matrix2					\

+ 21 - 0
src/core/dependencies/tags.c

@@ -63,10 +63,21 @@ static struct _starpu_cg *create_cg_tag(unsigned ntags, struct _starpu_tag *tag)
 
 	cg->ntags = ntags;
 	cg->remaining = ntags;
+#ifdef STARPU_DEBUG
+	cg->ndeps = ntags;
+	cg->deps = NULL;
+	cg->done = NULL;
+#endif
 	cg->cg_type = STARPU_CG_TAG;
 
 	cg->succ.tag = tag;
 	tag->tag_successors.ndeps++;
+#ifdef STARPU_DEBUG
+	_STARPU_REALLOC(tag->tag_successors.deps, tag->tag_successors.ndeps * sizeof(tag->tag_successors.deps[0]));
+	_STARPU_REALLOC(tag->tag_successors.done, tag->tag_successors.ndeps * sizeof(tag->tag_successors.done[0]));
+	tag->tag_successors.deps[tag->tag_successors.ndeps-1] = cg;
+	tag->tag_successors.done[tag->tag_successors.ndeps-1] = 0;
+#endif
 
 	return cg;
 }
@@ -364,10 +375,20 @@ void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t
 	struct _starpu_cg *cg = create_cg_tag(ndeps, tag_child);
 	_starpu_spin_unlock(&tag_child->lock);
 
+#ifdef STARPU_DEBUG
+	_STARPU_MALLOC(cg->deps, ndeps * sizeof(cg->deps[0]));
+	_STARPU_MALLOC(cg->done, ndeps * sizeof(cg->done[0]));
+#endif
+
 	for (i = 0; i < ndeps; i++)
 	{
 		starpu_tag_t dep_id = array[i];
 
+#ifdef STARPU_DEBUG
+		cg->deps[i] = (void*) (uintptr_t) dep_id;
+		cg->done[i] = 0;
+#endif
+
 		/* id depends on dep_id
 		 * so cg should be among dep_id's successors*/
 		_STARPU_TRACE_TAG_DEPS(id, dep_id);

+ 1 - 1
src/core/simgrid.c

@@ -723,7 +723,7 @@ void _starpu_simgrid_submit_job(int workerid, int sched_ctx_id, struct _starpu_j
 	{
 		length = starpu_task_worker_expected_length(starpu_task, workerid, sched_ctx_id, j->nimpl);
 		STARPU_ASSERT_MSG(!_STARPU_IS_ZERO(length) && !isnan(length),
-				  "Codelet %s does not have a perfmodel (in directory %s), or is not calibrated enough, please re-run in non-simgrid mode until it is calibrated",
+				  "Codelet %s does not have a perfmodel (in directory %s), or is not calibrated enough, please re-run in non-simgrid mode until it is calibrated, or fix the STARPU_HOSTNAME and STARPU_PERF_MODEL_DIR environment variables",
 				  _starpu_job_get_model_name(j), _starpu_get_perf_model_dir_codelet());
                 /* TODO: option to add variance according to performance model,
                  * to be able to easily check scheduling robustness */

+ 6 - 6
src/core/task.c

@@ -288,8 +288,8 @@ void starpu_task_init(struct starpu_task *task)
 
 	task->detach = 1;
 
-#if STARPU_TASK_INVALID != 0
-	task->status = STARPU_TASK_INVALID;
+#if STARPU_TASK_INIT != 0
+	task->status = STARPU_TASK_INIT;
 #endif
 
 	task->predicted = NAN;
@@ -766,9 +766,9 @@ static int _starpu_task_submit_head(struct starpu_task *task)
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
 
 	if (task->status == STARPU_TASK_STOPPED || task->status == STARPU_TASK_FINISHED)
-		task->status = STARPU_TASK_INVALID;
+		task->status = STARPU_TASK_INIT;
 	else
-		STARPU_ASSERT(task->status == STARPU_TASK_INVALID);
+		STARPU_ASSERT(task->status == STARPU_TASK_INIT);
 
 	if (j->internal)
 	{
@@ -1067,7 +1067,7 @@ int _starpu_task_submit_conversion_task(struct starpu_task *task,
 	_starpu_increment_nready_tasks_of_sched_ctx(j->task->sched_ctx, j->task->flops, j->task);
 	_starpu_job_set_ordered_buffers(j);
 
-	STARPU_ASSERT(task->status == STARPU_TASK_INVALID);
+	STARPU_ASSERT(task->status == STARPU_TASK_INIT);
 	task->status = STARPU_TASK_READY;
 	_starpu_profiling_set_task_push_start_time(task);
 
@@ -1668,7 +1668,7 @@ struct starpu_task *starpu_task_ft_create_retry
 	new_task->failed = 0;
 	new_task->scheduled = 0;
 	new_task->prefetched = 0;
-	new_task->status = STARPU_TASK_INVALID;
+	new_task->status = STARPU_TASK_INIT;
 	new_task->profiling_info = NULL;
 	new_task->prev = NULL;
 	new_task->next = NULL;

+ 1 - 1
src/core/task_bundle.c

@@ -50,7 +50,7 @@ int starpu_task_bundle_insert(starpu_task_bundle_t bundle, struct starpu_task *t
 		return -EPERM;
 	}
 
-	if (task->status != STARPU_TASK_INVALID)
+	if (task->status != STARPU_TASK_INIT)
 	{
 		/* The task has already been submitted, it's too late to put it
 		 * into a bundle now. */

+ 1 - 1
src/datawizard/user_interactions.c

@@ -227,7 +227,7 @@ int starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(starpu_dat
 			*pre_sync_jobid = pre_sync_job->job_id;
 
 		wrapper->post_sync_task = starpu_task_create();
-		wrapper->post_sync_task->name = "_starpu_data_acquire_cb_post";
+		wrapper->post_sync_task->name = "_starpu_data_acquire_cb_release";
 		wrapper->post_sync_task->detach = 1;
 		wrapper->post_sync_task->type = STARPU_TASK_TYPE_DATA_ACQUIRE;
 		post_sync_job = _starpu_get_job_associated_to_task(wrapper->post_sync_task);

+ 85 - 195
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -443,151 +443,6 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 	return ret;
 }
 
-/* TODO: factorize with dmda!! */
-static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched_ctx_id)
-{
-	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
-	int best = -1;
-
-	double best_exp_end_of_task = 0.0;
-	double model_best = 0.0;
-	double transfer_model_best = 0.0;
-
-	int ntasks_best = -1;
-	double ntasks_best_end = 0.0;
-	int calibrating = 0;
-
-	/* A priori, we know all estimations */
-	int unknown = 0;
-
-	unsigned best_impl = 0;
-	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
-
-	struct starpu_sched_ctx_iterator it;
-
-	double now = starpu_timing_now();
-
-	workers->init_iterator_for_parallel_tasks(workers, &it, task);
-	while(workers->has_next(workers, &it))
-	{
-		unsigned nimpl;
-		unsigned impl_mask;
-		unsigned worker = workers->get_next(workers, &it);
-		struct _starpu_fifo_taskq *fifo  = &dt->queue_array[worker];
-		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker, sched_ctx_id);
-
-		/* Sometimes workers didn't take the tasks as early as we expected */
-		double exp_start = isnan(fifo->exp_start) ? now + fifo->pipeline_len : STARPU_MAX(fifo->exp_start, now);
-
-		if (!starpu_worker_can_execute_task_impl(worker, task, &impl_mask))
-			continue;
-
-		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
-		{
-			if (!(impl_mask & (1U << nimpl)))
-			{
-				/* no one on that queue may execute this task */
-				continue;
-			}
-
-			double exp_end;
-			double local_length = starpu_task_worker_expected_length(task, worker, sched_ctx_id, nimpl);
-			double local_penalty = starpu_task_expected_data_transfer_time_for(task, worker);
-			double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
-
-			//_STARPU_DEBUG("Scheduler dm: task length (%lf) worker (%u) kernel (%u) \n", local_length,worker,nimpl);
-
-			/*
-			 * This implements a default greedy scheduler for the
-			 * case of tasks which have no performance model, or
-			 * whose performance model is not calibrated yet.
-			 *
-			 * It simply uses the number of tasks already pushed to
-			 * the workers, divided by the relative performance of
-			 * a CPU and of a GPU.
-			 *
-			 * This is always computed, but the ntasks_best
-			 * selection is only really used if the task indeed has
-			 * no performance model, or is not calibrated yet.
-			 */
-			if (ntasks_best == -1
-
-			    /* Always compute the greedy decision, at least for
-			     * the tasks with no performance model. */
-			    || (!calibrating && ntasks_end < ntasks_best_end)
-
-			    /* The performance model of this task is not
-			     * calibrated on this worker, try to run it there
-			     * to calibrate it there. */
-			    || (!calibrating && isnan(local_length))
-
-			    /* the performance model of this task is not
-			     * calibrated on this worker either, rather run it
-			     * there if this one is low on scheduled tasks. */
-			    || (calibrating && isnan(local_length) && ntasks_end < ntasks_best_end)
-				)
-			{
-				ntasks_best_end = ntasks_end;
-				ntasks_best = worker;
-				best_impl = nimpl;
-			}
-
-			if (isnan(local_length))
-			{
-				/* we are calibrating, we want to speed-up calibration time
-				 * so we privilege non-calibrated tasks (but still
-				 * greedily distribute them to avoid dumb schedules) */
-				static int warned;
-				if (!warned)
-				{
-					warned = 1;
-					_STARPU_DISP("Warning: performance model for %s not finished calibrating on worker %u, using a dumb scheduling heuristic for now\n", starpu_task_get_name(task), worker);
-				}
-				calibrating = 1;
-			}
-
-			if (isnan(local_length) || _STARPU_IS_ZERO(local_length))
-				/* there is no prediction available for that task
-				 * with that arch yet, so switch to a greedy strategy */
-				unknown = 1;
-
-			if (unknown)
-				continue;
-
-			exp_end = exp_start + fifo->exp_len + local_length;
-
-			if (best == -1 || exp_end < best_exp_end_of_task)
-			{
-				/* a better solution was found */
-				best_exp_end_of_task = exp_end;
-				best = worker;
-				model_best = local_length;
-				transfer_model_best = local_penalty;
-				best_impl = nimpl;
-			}
-		}
-	}
-
-	if (unknown)
-	{
-		best = ntasks_best;
-		model_best = 0.0;
-		transfer_model_best = 0.0;
-#ifdef STARPU_VERBOSE
-		dt->eager_task_cnt++;
-#endif
-	}
-
-	//_STARPU_DEBUG("Scheduler dm: kernel (%u)\n", best_impl);
-
-	starpu_task_set_implementation(task, best_impl);
-
-	starpu_sched_task_break(task);
-	/* we should now have the best worker in variable "best" */
-	return push_task_on_best_worker(task, best,
-					model_best, transfer_model_best, prio, sched_ctx_id);
-}
-
 /* TODO: factorise CPU computations, expensive with a lot of cores */
 static void compute_all_performance_predictions(struct starpu_task *task,
 						unsigned nworkers,
@@ -677,15 +532,19 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 			{
 				/* TODO : conversion time */
 				local_task_length[worker_ctx][nimpl] = starpu_task_bundle_expected_length(bundle, perf_arch, nimpl);
-				local_data_penalty[worker_ctx][nimpl] = starpu_task_bundle_expected_data_transfer_time(bundle, memory_node);
-				local_energy[worker_ctx][nimpl] = starpu_task_bundle_expected_energy(bundle, perf_arch,nimpl);
+				if (local_data_penalty)
+					local_data_penalty[worker_ctx][nimpl] = starpu_task_bundle_expected_data_transfer_time(bundle, memory_node);
+				if (local_energy)
+					local_energy[worker_ctx][nimpl] = starpu_task_bundle_expected_energy(bundle, perf_arch,nimpl);
 
 			}
 			else
 			{
 				local_task_length[worker_ctx][nimpl] = starpu_task_worker_expected_length(task, workerid, sched_ctx_id, nimpl);
-				local_data_penalty[worker_ctx][nimpl] = starpu_task_expected_data_transfer_time_for(task, workerid);
-				local_energy[worker_ctx][nimpl] = starpu_task_worker_expected_energy(task, workerid, sched_ctx_id,nimpl);
+				if (local_data_penalty)
+					local_data_penalty[worker_ctx][nimpl] = starpu_task_expected_data_transfer_time_for(task, workerid);
+				if (local_energy)
+					local_energy[worker_ctx][nimpl] = starpu_task_worker_expected_energy(task, workerid, sched_ctx_id,nimpl);
 				double conversion_time = starpu_task_expected_conversion_time(task, perf_arch, nimpl);
 				if (conversion_time > 0.0)
 					local_task_length[worker_ctx][nimpl] += conversion_time;
@@ -742,7 +601,10 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 			if (unknown)
 				continue;
 
-			double task_starting_time = STARPU_MAX(exp_start + prev_exp_len, now + local_data_penalty[worker_ctx][nimpl]); 
+			double task_starting_time = exp_start + prev_exp_len;
+			if (local_data_penalty)
+				task_starting_time = STARPU_MAX(task_starting_time,
+					now + local_data_penalty[worker_ctx][nimpl]);
 
 			exp_end[worker_ctx][nimpl] = task_starting_time + local_task_length[worker_ctx][nimpl];
 
@@ -753,8 +615,9 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 				nimpl_best = nimpl;
 			}
 
-			if (isnan(local_energy[worker_ctx][nimpl]))
-				local_energy[worker_ctx][nimpl] = 0.;
+			if (local_energy)
+				if (isnan(local_energy[worker_ctx][nimpl]))
+					local_energy[worker_ctx][nimpl] = 0.;
 
 		}
 		worker_ctx++;
@@ -774,7 +637,7 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 	*max_exp_endp_of_workers = max_exp_end_of_workers;
 }
 
-static double _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sched_ctx_id, unsigned simulate, unsigned sorted_decision)
+static double _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sched_ctx_id, unsigned da, unsigned simulate, unsigned sorted_decision)
 {
 	/* find the queue */
 	int best = -1, best_in_ctx = -1;
@@ -812,8 +675,8 @@ static double _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned
 					    exp_end,
 					    &max_exp_end_of_workers,
 					    &min_exp_end_of_task,
-					    local_data_penalty,
-					    local_energy,
+					    da ? local_data_penalty : NULL,
+					    da ? local_energy : NULL,
 					    &forced_best,
 					    &forced_impl, sched_ctx_id, sorted_decision);
 
@@ -840,11 +703,14 @@ static double _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned
 					/* no one on that queue may execute this task */
 					continue;
 				}
-				fitness[worker_ctx][nimpl] = dt->alpha * __s_alpha__value *(exp_end[worker_ctx][nimpl] - min_exp_end_of_task)
-					+ dt->beta * __s_beta__value *(local_data_penalty[worker_ctx][nimpl])
-					+ dt->_gamma * __s_gamma__value *(local_energy[worker_ctx][nimpl]);
+				if (da)
+					fitness[worker_ctx][nimpl] = dt->alpha * __s_alpha__value *(exp_end[worker_ctx][nimpl] - min_exp_end_of_task)
+						+ dt->beta * __s_beta__value *(local_data_penalty[worker_ctx][nimpl])
+						+ dt->_gamma * __s_gamma__value *(local_energy[worker_ctx][nimpl]);
+				else
+					fitness[worker_ctx][nimpl] = exp_end[worker_ctx][nimpl] - min_exp_end_of_task;
 
-				if (exp_end[worker_ctx][nimpl] > max_exp_end_of_workers)
+				if (da && exp_end[worker_ctx][nimpl] > max_exp_end_of_workers)
 				{
 					/* This placement will make the computation
 					 * longer, take into account the idle
@@ -886,15 +752,17 @@ static double _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned
 		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(best_in_ctx, sched_ctx_id);
 		unsigned memory_node = starpu_worker_get_memory_node(best);
 		model_best = starpu_task_expected_length(task, perf_arch, selected_impl);
-		transfer_model_best = starpu_task_expected_data_transfer_time(memory_node, task);
+		if (da)
+			transfer_model_best = starpu_task_expected_data_transfer_time(memory_node, task);
 	}
 	else
 	{
 		model_best = local_task_length[best_in_ctx][selected_impl];
-		transfer_model_best = local_data_penalty[best_in_ctx][selected_impl];
+		if (da)
+			transfer_model_best = local_data_penalty[best_in_ctx][selected_impl];
 	}
 
-	//_STARPU_DEBUG("Scheduler dmda: kernel (%u)\n", best_impl);
+	//_STARPU_DEBUG("Scheduler dmda: kernel (%u)\n", selected_impl);
 	starpu_task_set_implementation(task, selected_impl);
 
 	starpu_sched_task_break(task);
@@ -911,7 +779,7 @@ static double _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned
 
 static int dmda_push_sorted_decision_task(struct starpu_task *task)
 {
-	return _dmda_push_task(task, 1, task->sched_ctx, 0, 1);
+	return _dmda_push_task(task, 1, task->sched_ctx, 1, 0, 1);
 }
 
 static int dmda_push_sorted_task(struct starpu_task *task)
@@ -919,35 +787,40 @@ static int dmda_push_sorted_task(struct starpu_task *task)
 #ifdef STARPU_DEVEL
 #warning TODO: after defining a scheduling window, use that instead of empty_ctx_tasks
 #endif
-	return _dmda_push_task(task, 1, task->sched_ctx, 0, 0);
+	return _dmda_push_task(task, 1, task->sched_ctx, 1, 0, 0);
 }
 
 static int dm_push_task(struct starpu_task *task)
 {
-	return _dm_push_task(task, 0, task->sched_ctx);
+	return _dmda_push_task(task, 0, task->sched_ctx, 0, 0, 0);
+}
+
+static double dm_simulate_push_task(struct starpu_task *task)
+{
+	return _dmda_push_task(task, 0, task->sched_ctx, 0, 1, 0);
 }
 
 static int dmda_push_task(struct starpu_task *task)
 {
 	STARPU_ASSERT(task);
-	return _dmda_push_task(task, 0, task->sched_ctx, 0, 0);
+	return _dmda_push_task(task, 0, task->sched_ctx, 1, 0, 0);
 }
 static double dmda_simulate_push_task(struct starpu_task *task)
 {
 	STARPU_ASSERT(task);
-	return _dmda_push_task(task, 0, task->sched_ctx, 1, 0);
+	return _dmda_push_task(task, 0, task->sched_ctx, 1, 1, 0);
 }
 
 static double dmda_simulate_push_sorted_task(struct starpu_task *task)
 {
 	STARPU_ASSERT(task);
-	return _dmda_push_task(task, 1, task->sched_ctx, 1, 0);
+	return _dmda_push_task(task, 1, task->sched_ctx, 1, 1, 0);
 }
 
 static double dmda_simulate_push_sorted_decision_task(struct starpu_task *task)
 {
 	STARPU_ASSERT(task);
-	return _dmda_push_task(task, 1, task->sched_ctx, 1, 1);
+	return _dmda_push_task(task, 1, task->sched_ctx, 1, 1, 1);
 }
 
 #ifdef NOTIFY_READY_SOON
@@ -1092,7 +965,7 @@ static void dmda_pre_exec_hook(struct starpu_task *task, unsigned sched_ctx_id)
 	starpu_worker_unlock_self();
 }
 
-static void dmda_push_task_notify(struct starpu_task *task, int workerid, int perf_workerid, unsigned sched_ctx_id)
+static void _dm_push_task_notify(struct starpu_task *task, int workerid, int perf_workerid, unsigned sched_ctx_id, int da)
 {
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	struct _starpu_fifo_taskq *fifo = &dt->queue_array[workerid];
@@ -1100,8 +973,11 @@ static void dmda_push_task_notify(struct starpu_task *task, int workerid, int pe
 	/* Compute the expected penalty */
 	double predicted = starpu_task_worker_expected_length(task, perf_workerid, STARPU_NMAX_SCHED_CTXS,
 						       starpu_task_get_implementation(task));
+	double predicted_transfer = NAN;
+
+	if (da)
+		predicted_transfer = starpu_task_expected_data_transfer_time_for(task, workerid);
 
-	double predicted_transfer = starpu_task_expected_data_transfer_time_for(task, workerid);
 	double now = starpu_timing_now();
 
 	/* Update the predictions */
@@ -1110,32 +986,35 @@ static void dmda_push_task_notify(struct starpu_task *task, int workerid, int pe
 	fifo->exp_start = isnan(fifo->exp_start) ? now + fifo->pipeline_len : STARPU_MAX(fifo->exp_start, now);
 	fifo->exp_end = fifo->exp_start + fifo->exp_len;
 
-	/* If there is no prediction available, we consider the task has a null length */
-	if (!isnan(predicted_transfer))
+	if (da)
 	{
-		if (now + predicted_transfer < fifo->exp_end)
+		/* If there is no transfer time prediction available, we consider the transfer has a null length */
+		if (!isnan(predicted_transfer))
 		{
-			/* We may hope that the transfer will be finished by
-			 * the start of the task. */
-			predicted_transfer = 0;
-		}
-		else
-		{
-			/* The transfer will not be finished by then, take the
-			 * remainder into account */
-			predicted_transfer = (now + predicted_transfer) - fifo->exp_end;
-		}
-		task->predicted_transfer = predicted_transfer;
-		fifo->exp_end += predicted_transfer;
-		fifo->exp_len += predicted_transfer;
-		if(dt->num_priorities != -1)
-		{
-			int i;
-			int task_prio = _starpu_normalize_prio(task->priority, dt->num_priorities, task->sched_ctx);
-			for(i = 0; i <= task_prio; i++)
-				fifo->exp_len_per_priority[i] += predicted_transfer;
-		}
+			if (now + predicted_transfer < fifo->exp_end)
+			{
+				/* We may hope that the transfer will be finished by
+				 * the start of the task. */
+				predicted_transfer = 0;
+			}
+			else
+			{
+				/* The transfer will not be finished by then, take the
+				 * remainder into account */
+				predicted_transfer = (now + predicted_transfer) - fifo->exp_end;
+			}
+			task->predicted_transfer = predicted_transfer;
+			fifo->exp_end += predicted_transfer;
+			fifo->exp_len += predicted_transfer;
+			if(dt->num_priorities != -1)
+			{
+				int i;
+				int task_prio = _starpu_normalize_prio(task->priority, dt->num_priorities, task->sched_ctx);
+				for(i = 0; i <= task_prio; i++)
+					fifo->exp_len_per_priority[i] += predicted_transfer;
+			}
 
+		}
 	}
 
 	/* If there is no prediction available, we consider the task has a null length */
@@ -1166,6 +1045,16 @@ static void dmda_push_task_notify(struct starpu_task *task, int workerid, int pe
 	starpu_worker_unlock(workerid);
 }
 
+static void dm_push_task_notify(struct starpu_task *task, int workerid, int perf_workerid, unsigned sched_ctx_id)
+{
+	_dm_push_task_notify(task, workerid, perf_workerid, sched_ctx_id, 0);
+}
+
+static void dmda_push_task_notify(struct starpu_task *task, int workerid, int perf_workerid, unsigned sched_ctx_id)
+{
+	_dm_push_task_notify(task, workerid, perf_workerid, sched_ctx_id, 1);
+}
+
 static void dmda_post_exec_hook(struct starpu_task * task, unsigned sched_ctx_id)
 {
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
@@ -1183,7 +1072,8 @@ struct starpu_sched_policy _starpu_sched_dm_policy =
 	.add_workers = dmda_add_workers ,
 	.remove_workers = dmda_remove_workers,
 	.push_task = dm_push_task,
-	.simulate_push_task = NULL,
+	.simulate_push_task = dm_simulate_push_task,
+	.push_task_notify = dm_push_task_notify,
 	.pop_task = dmda_pop_task,
 	.pre_exec_hook = dmda_pre_exec_hook,
 	.post_exec_hook = dmda_post_exec_hook,

+ 28 - 3
tools/gdbinit

@@ -40,7 +40,7 @@ define starpu-print-task
   set $job = (struct _starpu_job *)$task->starpu_private
   set $status=0
   if $task->status == 0
-    set $status="STARPU_TASK_INVALID"
+    set $status="STARPU_TASK_INIT"
   end
   if $task->status == 1
     set $status="STARPU_TASK_BLOCKED"
@@ -98,10 +98,11 @@ define starpu-print-task
       if ! $job->job_successors->done[$n]
         set $cg = $job->job_successors->deps[$n]
         set $m = 0
+        printf "\t\tcg:\t\t\t<%u>\n", $cg->ndeps
 	while $m < $cg->ndeps
 	  if ! $cg->done[$m]
 	    set $depj = (struct _starpu_job *) $cg->deps[$m]
-            printf "\t\ttask %p\n", $depj->task
+            printf "\t\t\ttask %p\n", $depj->task
 	  end
 	  set $m = $m + 1
 	end
@@ -219,7 +220,31 @@ define starpu-print-tag
   if $tag_struct->state == STARPU_DONE
      set $status="STARPU_DONE"
   end
-  printf "tag %d state %s\n", $arg0, $status
+  printf "tag %d ((struct _starpu_tag *) %p)\n", $arg0, $tag_struct
+  printf "\tstate %s\n", $status
+  printf "\tdeps %d\n", $tag_struct->tag_successors.ndeps
+  printf "\tcompleted %d\n", $tag_struct->tag_successors.ndeps_completed
+  printf "\tndeps_remaining:\t\t<%u>\n", $tag_struct->tag_successors->ndeps - $tag_struct->tag_successors->ndeps_completed
+  if _starpu_debug
+    set $n = 0
+    while $n < $tag_struct->tag_successors->ndeps
+      if ! $tag_struct->tag_successors->done[$n]
+        set $cg = $tag_struct->tag_successors->deps[$n]
+        set $m = 0
+        printf "\t\tcg:\t\t\t<%u>\n", $cg->ndeps
+	while $m < $cg->ndeps
+	  if ! $cg->done[$m]
+	    set $dept = (starpu_tag_t) $cg->deps[$m]
+            printf "\t\t\ttag %u\n", $dept
+	  end
+	  set $m = $m + 1
+	end
+      end
+      set $n = $n + 1
+    end
+  end
+  printf "\tndeps_completed:\t\t<%u>\n", $tag_struct->tag_successors->ndeps_completed
+  printf "\tnsuccs:\t\t\t\t<%u>\n", $tag_struct->tag_successors->nsuccs
 end
 
 define starpu-tags