浏览代码

nmad/src: bring up modifications from native mpi implementation

Nathalie Furmento 8 年之前
父节点
当前提交
36d56bdec7

+ 34 - 6
nmad/src/Makefile.am

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2009-2012  Université de Bordeaux
-# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  Centre National de la Recherche Scientifique
+# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -24,7 +24,31 @@ CLEANFILES = *.gcno *.gcda *.linkinfo
 AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS) $(NMAD_CFLAGS)
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS) $(NMAD_LIBS)
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/src/ -I$(top_builddir)/src -I$(top_builddir)/include -I$(top_srcdir)/mpi/include -I$(top_srcdir)/mpi/src
-AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(NMAD_LDFLAGS)
+AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS) $(NMAD_LDFLAGS)
+
+ldflags =
+
+if STARPU_HAVE_WINDOWS
+
+LC_MESSAGES=C
+export LC_MESSAGES
+
+ldflags += -Xlinker --output-def -Xlinker .libs/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.def
+
+if STARPU_HAVE_MS_LIB
+.libs/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.lib: libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la dolib
+	./dolib "$(STARPU_MS_LIB)" $(STARPU_MS_LIB_ARCH) .libs/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.def @STARPU_EFFECTIVE_VERSION@ $(libstarpumpi_so_version) .libs/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.lib
+all-local: .libs/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.lib
+endif STARPU_HAVE_MS_LIB
+
+install-exec-hook:
+	$(INSTALL) .libs/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.def $(DESTDIR)$(libdir)
+if STARPU_HAVE_MS_LIB
+	$(INSTALL) .libs/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.lib $(DESTDIR)$(libdir)
+	$(INSTALL) .libs/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.exp $(DESTDIR)$(libdir)
+endif STARPU_HAVE_MS_LIB
+
+endif STARPU_HAVE_WINDOWS
 
 lib_LTLIBRARIES = libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 
@@ -38,12 +62,13 @@ noinst_HEADERS =					\
 	starpu_mpi_stats.h				\
 	starpu_mpi_datatype.h				\
 	starpu_mpi_cache.h				\
+	starpu_mpi_select_node.h			\
 	starpu_mpi_cache_stats.h			\
-	starpu_mpi_collective.c				\
-	starpu_mpi_select_node.h
+	starpu_mpi_task_insert.h			\
+	starpu_mpi_init.h
 
 libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_SOURCES =	\
-	starpu_mpi.c					\
+	starpu_mpi_nmad.c				\
 	starpu_mpi_helper.c				\
 	starpu_mpi_datatype.c				\
 	starpu_mpi_task_insert.c			\
@@ -52,7 +77,10 @@ libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_SOURCES =	\
 	starpu_mpi_private.c				\
 	starpu_mpi_cache.c				\
 	starpu_mpi_select_node.c			\
-	starpu_mpi_cache_stats.c
+	starpu_mpi_cache_stats.c			\
+	starpu_mpi_fortran.c				\
+	starpu_mpi_task_insert_fortran.c		\
+	starpu_mpi_init.c
 
 showcheck:
 	-cat /dev/null

+ 256 - 150
nmad/src/starpu_mpi_cache.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011, 2012, 2013, 2014, 2015  Centre National de la Recherche Scientifique
- * Copyright (C) 2011-2014  Université de Bordeaux
+ * Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
+ * Copyright (C) 2011-2017  Université de Bordeaux
  * Copyright (C) 2014 INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -28,12 +28,16 @@
 struct _starpu_data_entry
 {
 	UT_hash_handle hh;
-	void *data;
+	starpu_data_handle_t data_handle;
 };
 
-static struct _starpu_data_entry **_cache_sent_data = NULL;
-static struct _starpu_data_entry **_cache_received_data = NULL;
+static starpu_pthread_mutex_t _cache_mutex;
+static struct _starpu_data_entry *_cache_data = NULL;
 int _starpu_cache_enabled=1;
+static MPI_Comm _starpu_cache_comm;
+static int _starpu_cache_comm_size;
+
+static void _starpu_mpi_cache_flush_nolock(starpu_data_handle_t data_handle);
 
 int starpu_mpi_cache_is_enabled()
 {
@@ -51,10 +55,8 @@ int starpu_mpi_cache_set(int enabled)
 		if (_starpu_cache_enabled)
 		{
 			// We need to clean the cache
-			int world_size;
-			starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
-			MPI_Comm_size(MPI_COMM_WORLD, &world_size);
-			_starpu_mpi_cache_free(world_size);
+			starpu_mpi_cache_flush_all_data(_starpu_cache_comm);
+			_starpu_mpi_cache_shutdown();
 		}
 		_starpu_cache_enabled = 0;
 	}
@@ -63,9 +65,6 @@ int starpu_mpi_cache_set(int enabled)
 
 void _starpu_mpi_cache_init(MPI_Comm comm)
 {
-	int nb_nodes;
-	int i;
-
 	_starpu_cache_enabled = starpu_get_env_number("STARPU_MPI_CACHE");
 	if (_starpu_cache_enabled == -1)
 	{
@@ -74,219 +73,326 @@ void _starpu_mpi_cache_init(MPI_Comm comm)
 
 	if (_starpu_cache_enabled == 0)
 	{
-		if (!_starpu_silent) fprintf(stderr,"Warning: StarPU MPI Communication cache is disabled\n");
+		_STARPU_DISP("Warning: StarPU MPI Communication cache is disabled\n");
 		return;
 	}
 
-	MPI_Comm_size(comm, &nb_nodes);
-	_STARPU_MPI_DEBUG(2, "Initialising htable for cache\n");
-	_cache_sent_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
-	for(i=0 ; i<nb_nodes ; i++) _cache_sent_data[i] = NULL;
-	_cache_received_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
-	for(i=0 ; i<nb_nodes ; i++) _cache_received_data[i] = NULL;
-	_starpu_mpi_cache_stats_init(comm);
+	_starpu_cache_comm = comm;
+	starpu_mpi_comm_size(comm, &_starpu_cache_comm_size);
+	_starpu_mpi_cache_stats_init();
+	STARPU_PTHREAD_MUTEX_INIT(&_cache_mutex, NULL);
 }
 
-static
-void _starpu_mpi_cache_empty_tables(int world_size)
+void _starpu_mpi_cache_shutdown()
 {
-	int i;
+	if (_starpu_cache_enabled == 0)
+		return;
 
-	if (_starpu_cache_enabled == 0) return;
+	struct _starpu_data_entry *entry, *tmp;
 
-	_STARPU_MPI_DEBUG(2, "Clearing htable for cache\n");
+	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
+	HASH_ITER(hh, _cache_data, entry, tmp)
+	{
+		HASH_DEL(_cache_data, entry);
+		free(entry);
+	}
+	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
+	STARPU_PTHREAD_MUTEX_DESTROY(&_cache_mutex);
+	free(_cache_data);
+	_starpu_mpi_cache_stats_shutdown();
+}
 
-	for(i=0 ; i<world_size ; i++)
+void _starpu_mpi_cache_data_clear(starpu_data_handle_t data_handle)
+{
+	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
+
+	if (_starpu_cache_enabled == 1)
 	{
-		struct _starpu_data_entry *entry, *tmp;
-		HASH_ITER(hh, _cache_sent_data[i], entry, tmp)
-		{
-			HASH_DEL(_cache_sent_data[i], entry);
-			free(entry);
-		}
-		HASH_ITER(hh, _cache_received_data[i], entry, tmp)
+		struct _starpu_data_entry *entry;
+		STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
+		_starpu_mpi_cache_flush_nolock(data_handle);
+		HASH_FIND_PTR(_cache_data, &data_handle, entry);
+		if (entry != NULL)
 		{
-			HASH_DEL(_cache_received_data[i], entry);
-			_starpu_mpi_cache_stats_dec(i, (starpu_data_handle_t) entry->data);
+			HASH_DEL(_cache_data, entry);
 			free(entry);
 		}
+		STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
 	}
+
+	free(mpi_data->cache_sent);
 }
 
-void _starpu_mpi_cache_free(int world_size)
+void _starpu_mpi_cache_data_init(starpu_data_handle_t data_handle)
 {
-	if (_starpu_cache_enabled == 0) return;
+	int i;
+	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
+
+	if (_starpu_cache_enabled == 0)
+		return;
 
-	_starpu_mpi_cache_empty_tables(world_size);
-	free(_cache_sent_data);
-	free(_cache_received_data);
-	_starpu_mpi_cache_stats_free();
+	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
+	mpi_data->cache_received = 0;
+	_STARPU_MALLOC(mpi_data->cache_sent, _starpu_cache_comm_size*sizeof(mpi_data->cache_sent[0]));
+	for(i=0 ; i<_starpu_cache_comm_size ; i++)
+	{
+		mpi_data->cache_sent[i] = 0;
+	}
+	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
 }
 
-void _starpu_mpi_cache_sent_data_clear(starpu_data_handle_t data)
+static void _starpu_mpi_cache_data_add_nolock(starpu_data_handle_t data_handle)
 {
-	int n, size;
-	MPI_Comm comm = ((struct _starpu_mpi_data *) data->mpi_data)->comm;
+	struct _starpu_data_entry *entry;
 
-	MPI_Comm_size(comm, &size);
+	if (_starpu_cache_enabled == 0)
+		return;
 
-	for(n=0 ; n<size ; n++)
+	HASH_FIND_PTR(_cache_data, &data_handle, entry);
+	if (entry == NULL)
 	{
-		struct _starpu_data_entry *already_sent;
-		HASH_FIND_PTR(_cache_sent_data[n], &data, already_sent);
-		if (already_sent)
-		{
-			_STARPU_MPI_DEBUG(2, "Clearing send cache for data %p\n", data);
-			HASH_DEL(_cache_sent_data[n], already_sent);
-			free(already_sent);
-		}
+		_STARPU_MPI_MALLOC(entry, sizeof(*entry));
+		entry->data_handle = data_handle;
+		HASH_ADD_PTR(_cache_data, data_handle, entry);
 	}
 }
 
-void _starpu_mpi_cache_received_data_clear(starpu_data_handle_t data)
+static void _starpu_mpi_cache_data_remove_nolock(starpu_data_handle_t data_handle)
 {
-	int mpi_rank = starpu_mpi_data_get_rank(data);
-	struct _starpu_data_entry *already_received;
+	struct _starpu_data_entry *entry;
+
+	if (_starpu_cache_enabled == 0)
+		return;
 
-	HASH_FIND_PTR(_cache_received_data[mpi_rank], &data, already_received);
-	if (already_received)
+	HASH_FIND_PTR(_cache_data, &data_handle, entry);
+	if (entry)
+	{
+		HASH_DEL(_cache_data, entry);
+		free(entry);
+	}
+}
+
+/**************************************
+ * Received cache
+ **************************************/
+void _starpu_mpi_cache_received_data_clear(starpu_data_handle_t data_handle)
+{
+	int mpi_rank = starpu_mpi_data_get_rank(data_handle);
+	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
+
+	if (_starpu_cache_enabled == 0)
+		return;
+
+	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
+	STARPU_ASSERT(mpi_data->magic == 42);
+	STARPU_MPI_ASSERT_MSG(mpi_rank < _starpu_cache_comm_size, "Node %d invalid. Max node is %d\n", mpi_rank, _starpu_cache_comm_size);
+
+	if (mpi_data->cache_received == 1)
 	{
 #ifdef STARPU_DEVEL
 #  warning TODO: Somebody else will write to the data, so discard our cached copy if any. starpu_mpi could just remember itself.
 #endif
-		_STARPU_MPI_DEBUG(2, "Clearing receive cache for data %p\n", data);
-		HASH_DEL(_cache_received_data[mpi_rank], already_received);
-		_starpu_mpi_cache_stats_dec(mpi_rank, data);
-		free(already_received);
-		starpu_data_invalidate_submit(data);
+		_STARPU_MPI_DEBUG(2, "Clearing receive cache for data %p\n", data_handle);
+		mpi_data->cache_received = 0;
+		starpu_data_invalidate_submit(data_handle);
+		_starpu_mpi_cache_data_remove_nolock(data_handle);
+		_starpu_mpi_cache_stats_dec(mpi_rank, data_handle);
 	}
+	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
 }
 
-void starpu_mpi_cache_flush_all_data(MPI_Comm comm)
+int _starpu_mpi_cache_received_data_set(starpu_data_handle_t data_handle)
 {
-	int nb_nodes, i;
-	int mpi_rank, my_rank;
+	int mpi_rank = starpu_mpi_data_get_rank(data_handle);
+	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
 
-	if (_starpu_cache_enabled == 0) return;
+	if (_starpu_cache_enabled == 0)
+		return 0;
 
-	MPI_Comm_size(comm, &nb_nodes);
-	MPI_Comm_rank(comm, &my_rank);
+	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
+	STARPU_ASSERT(mpi_data->magic == 42);
+	STARPU_MPI_ASSERT_MSG(mpi_rank < _starpu_cache_comm_size, "Node %d invalid. Max node is %d\n", mpi_rank, _starpu_cache_comm_size);
 
-	for(i=0 ; i<nb_nodes ; i++)
+	int already_received = mpi_data->cache_received;
+	if (already_received == 0)
 	{
-		struct _starpu_data_entry *entry, *tmp;
-		HASH_ITER(hh, _cache_sent_data[i], entry, tmp)
-		{
-			mpi_rank = starpu_mpi_data_get_rank((starpu_data_handle_t) entry->data);
-			if (mpi_rank != my_rank && mpi_rank != -1)
-				starpu_data_invalidate_submit((starpu_data_handle_t) entry->data);
-			HASH_DEL(_cache_sent_data[i], entry);
-			free(entry);
-		}
-		HASH_ITER(hh, _cache_received_data[i], entry, tmp)
-		{
-			mpi_rank = starpu_mpi_data_get_rank((starpu_data_handle_t) entry->data);
-			if (mpi_rank != my_rank && mpi_rank != -1)
-				starpu_data_invalidate_submit((starpu_data_handle_t) entry->data);
-			HASH_DEL(_cache_received_data[i], entry);
-			_starpu_mpi_cache_stats_dec(i, (starpu_data_handle_t) entry->data);
-			free(entry);
-		}
+		_STARPU_MPI_DEBUG(2, "Noting that data %p has already been received by %d\n", data_handle, mpi_rank);
+		mpi_data->cache_received = 1;
+		_starpu_mpi_cache_data_add_nolock(data_handle);
+		_starpu_mpi_cache_stats_inc(mpi_rank, data_handle);
 	}
+	else
+	{
+		_STARPU_MPI_DEBUG(2, "Do not receive data %p from node %d as it is already available\n", data_handle, mpi_rank);
+	}
+	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
+	return already_received;
 }
 
-void _starpu_mpi_cache_flush(starpu_data_handle_t data_handle)
+int _starpu_mpi_cache_received_data_get(starpu_data_handle_t data_handle)
 {
-	struct _starpu_data_entry *avail;
-	int i, my_rank, nb_nodes;
-	int mpi_rank;
-	MPI_Comm comm = ((struct _starpu_mpi_data *) data_handle->mpi_data)->comm;
+	int already_received;
+	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
 
-	if (_starpu_cache_enabled == 0) return;
+	if (_starpu_cache_enabled == 0)
+		return 0;
 
-	MPI_Comm_size(comm, &nb_nodes);
-	MPI_Comm_rank(comm, &my_rank);
-	mpi_rank = starpu_mpi_data_get_rank(data_handle);
+	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
+	STARPU_ASSERT(mpi_data->magic == 42);
+	already_received = mpi_data->cache_received;
+	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
+	return already_received;
+}
 
-	for(i=0 ; i<nb_nodes ; i++)
+int starpu_mpi_cached_receive(starpu_data_handle_t data_handle)
+{
+	return _starpu_mpi_cache_received_data_get(data_handle);
+}
+
+/**************************************
+ * Send cache
+ **************************************/
+void _starpu_mpi_cache_sent_data_clear(starpu_data_handle_t data_handle)
+{
+	int n, size;
+	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
+
+	if (_starpu_cache_enabled == 0)
+		return;
+
+	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
+	starpu_mpi_comm_size(mpi_data->node_tag.comm, &size);
+	for(n=0 ; n<size ; n++)
 	{
-		HASH_FIND_PTR(_cache_sent_data[i], &data_handle, avail);
-		if (avail)
+		if (mpi_data->cache_sent[n] == 1)
 		{
 			_STARPU_MPI_DEBUG(2, "Clearing send cache for data %p\n", data_handle);
-			HASH_DEL(_cache_sent_data[i], avail);
-			free(avail);
-		}
-		HASH_FIND_PTR(_cache_received_data[i], &data_handle, avail);
-		if (avail)
-		{
-			_STARPU_MPI_DEBUG(2, "Clearing send cache for data %p\n", data_handle);
-			HASH_DEL(_cache_received_data[i], avail);
-			_starpu_mpi_cache_stats_dec(i, data_handle);
-			free(avail);
+			mpi_data->cache_sent[n] = 0;
+			_starpu_mpi_cache_data_remove_nolock(data_handle);
 		}
 	}
+	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
 }
 
-void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle)
+int _starpu_mpi_cache_sent_data_set(starpu_data_handle_t data_handle, int dest)
 {
-	int my_rank, mpi_rank;
-	_starpu_mpi_cache_flush( data_handle);
+	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
 
-	MPI_Comm_rank(comm, &my_rank);
-	mpi_rank = starpu_mpi_data_get_rank(data_handle);
-	if (mpi_rank != my_rank && mpi_rank != -1)
-		starpu_data_invalidate_submit(data_handle);
-}
+	if (_starpu_cache_enabled == 0)
+		return 0;
 
-void *_starpu_mpi_cache_received_data_set(starpu_data_handle_t data)
-{
-	int mpi_rank = starpu_mpi_data_get_rank(data);
-	if (_starpu_cache_enabled == 0) return NULL;
+	STARPU_MPI_ASSERT_MSG(dest < _starpu_cache_comm_size, "Node %d invalid. Max node is %d\n", dest, _starpu_cache_comm_size);
 
-	struct _starpu_data_entry *already_received;
-	HASH_FIND_PTR(_cache_received_data[mpi_rank], &data, already_received);
-	if (already_received == NULL)
+	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
+	int already_sent = mpi_data->cache_sent[dest];
+	if (mpi_data->cache_sent[dest] == 0)
 	{
-		struct _starpu_data_entry *entry = (struct _starpu_data_entry *)malloc(sizeof(*entry));
-		entry->data = data;
-		HASH_ADD_PTR(_cache_received_data[mpi_rank], data, entry);
-		_starpu_mpi_cache_stats_inc(mpi_rank, data);
+		mpi_data->cache_sent[dest] = 1;
+		_starpu_mpi_cache_data_add_nolock(data_handle);
+		_STARPU_MPI_DEBUG(2, "Noting that data %p has already been sent to %d\n", data_handle, dest);
 	}
 	else
 	{
-		_STARPU_MPI_DEBUG(2, "Do not receive data %p from node %d as it is already available\n", data, mpi_rank);
+		_STARPU_MPI_DEBUG(2, "Do not send data %p to node %d as it has already been sent\n", data_handle, dest);
 	}
-	return already_received;
+	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
+	return already_sent;
 }
 
-void *_starpu_mpi_cache_received_data_get(starpu_data_handle_t data)
+int _starpu_mpi_cache_sent_data_get(starpu_data_handle_t data_handle, int dest)
 {
-	int mpi_rank = starpu_mpi_data_get_rank(data);
-	struct _starpu_data_entry *already_received;
+	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
+	int already_sent;
 
-	if (_starpu_cache_enabled == 0) return NULL;
-	HASH_FIND_PTR(_cache_received_data[mpi_rank], &data, already_received);
-	return already_received;
+	if (_starpu_cache_enabled == 0)
+		return 0;
+
+	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
+	STARPU_MPI_ASSERT_MSG(dest < _starpu_cache_comm_size, "Node %d invalid. Max node is %d\n", dest, _starpu_cache_comm_size);
+	already_sent = mpi_data->cache_sent[dest];
+	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
+	return already_sent;
 }
 
-void *_starpu_mpi_cache_sent_data_set(starpu_data_handle_t data, int dest)
+int starpu_mpi_cached_send(starpu_data_handle_t data_handle, int dest)
 {
-	if (_starpu_cache_enabled == 0) return NULL;
+	return _starpu_mpi_cache_sent_data_get(data_handle, dest);
+}
 
-	struct _starpu_data_entry *already_sent;
-	HASH_FIND_PTR(_cache_sent_data[dest], &data, already_sent);
-	if (already_sent == NULL)
+static void _starpu_mpi_cache_flush_nolock(starpu_data_handle_t data_handle)
+{
+	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
+	int i, nb_nodes;
+
+	if (_starpu_cache_enabled == 0)
+		return;
+
+	starpu_mpi_comm_size(mpi_data->node_tag.comm, &nb_nodes);
+	for(i=0 ; i<nb_nodes ; i++)
 	{
-		struct _starpu_data_entry *entry = (struct _starpu_data_entry *)malloc(sizeof(*entry));
-		entry->data = data;
-		HASH_ADD_PTR(_cache_sent_data[dest], data, entry);
-		_STARPU_MPI_DEBUG(2, "Noting that data %p has already been sent to %d\n", data, dest);
+		if (mpi_data->cache_sent[i] == 1)
+		{
+			_STARPU_MPI_DEBUG(2, "Clearing send cache for data %p\n", data_handle);
+			mpi_data->cache_sent[i] = 0;
+			_starpu_mpi_cache_stats_dec(i, data_handle);
+		}
 	}
-	else
+
+	if (mpi_data->cache_received == 1)
 	{
-		_STARPU_MPI_DEBUG(2, "Do not send data %p to node %d as it has already been sent\n", data, dest);
+		int mpi_rank = starpu_mpi_data_get_rank(data_handle);
+		_STARPU_MPI_DEBUG(2, "Clearing received cache for data %p\n", data_handle);
+		mpi_data->cache_received = 0;
+		_starpu_mpi_cache_stats_dec(mpi_rank, data_handle);
 	}
-	return already_sent;
 }
 
+void _starpu_mpi_cache_flush(starpu_data_handle_t data_handle)
+{
+	if (_starpu_cache_enabled == 0)
+		return;
+
+	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
+	_starpu_mpi_cache_flush_nolock(data_handle);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
+}
+
+static void _starpu_mpi_cache_flush_and_invalidate_nolock(MPI_Comm comm, starpu_data_handle_t data_handle)
+{
+	int my_rank, mpi_rank;
+
+	_starpu_mpi_cache_flush_nolock(data_handle);
+
+	starpu_mpi_comm_rank(comm, &my_rank);
+	mpi_rank = starpu_mpi_data_get_rank(data_handle);
+	if (mpi_rank != my_rank && mpi_rank != -1)
+		starpu_data_invalidate_submit(data_handle);
+}
+
+void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle)
+{
+	if (_starpu_cache_enabled == 0)
+		return;
+
+	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
+	_starpu_mpi_cache_flush_and_invalidate_nolock(comm, data_handle);
+	_starpu_mpi_cache_data_remove_nolock(data_handle);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
+}
+
+void starpu_mpi_cache_flush_all_data(MPI_Comm comm)
+{
+	struct _starpu_data_entry *entry, *tmp;
+
+	if (_starpu_cache_enabled == 0)
+		return;
+
+	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
+	HASH_ITER(hh, _cache_data, entry, tmp)
+	{
+		_starpu_mpi_cache_flush_and_invalidate_nolock(comm, entry->data_handle);
+		HASH_DEL(_cache_data, entry);
+		free(entry);
+	}
+	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
+}

+ 8 - 5
nmad/src/starpu_mpi_cache.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011, 2012, 2013, 2014, 2015  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  * Copyright (C) 2011-2014, 2017  Université de Bordeaux
  * Copyright (C) 2014 INRIA
  *
@@ -30,21 +30,24 @@ extern "C"
 
 extern int _starpu_cache_enabled;
 void _starpu_mpi_cache_init(MPI_Comm comm);
-void _starpu_mpi_cache_free(int world_size);
+void _starpu_mpi_cache_shutdown();
+void _starpu_mpi_cache_data_init(starpu_data_handle_t data_handle);
+void _starpu_mpi_cache_data_clear(starpu_data_handle_t data_handle);
 
 /*
  * If the data is already available in the cache, return a pointer to the data
  * If the data is NOT available in the cache, add it to the cache and return NULL
  */
-void *_starpu_mpi_cache_received_data_set(starpu_data_handle_t data);
-void *_starpu_mpi_cache_received_data_get(starpu_data_handle_t data);
+int _starpu_mpi_cache_received_data_set(starpu_data_handle_t data);
+int _starpu_mpi_cache_received_data_get(starpu_data_handle_t data);
 void _starpu_mpi_cache_received_data_clear(starpu_data_handle_t data);
 
 /*
  * If the data is already available in the cache, return a pointer to the data
  * If the data is NOT available in the cache, add it to the cache and return NULL
  */
-void *_starpu_mpi_cache_sent_data_set(starpu_data_handle_t data, int dest);
+int _starpu_mpi_cache_sent_data_set(starpu_data_handle_t data, int dest);
+int _starpu_mpi_cache_sent_data_get(starpu_data_handle_t data, int dest);
 void _starpu_mpi_cache_sent_data_clear(starpu_data_handle_t data);
 
 void _starpu_mpi_cache_flush(starpu_data_handle_t data_handle);

+ 10 - 17
nmad/src/starpu_mpi_cache_stats.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2014, 2015  Centre National de la Recherche Scientifique
+ * Copyright (C) 2014, 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -19,39 +19,34 @@
 #include <stdio.h>
 #include <starpu_mpi_private.h>
 
-/* measure the amount of data transfers between each pair of MPI nodes */
-static size_t *comm_cache_amount;
-static int world_size;
 static int stats_enabled=0;
 
-void _starpu_mpi_cache_stats_init(MPI_Comm comm)
+void _starpu_mpi_cache_stats_init()
 {
 	stats_enabled = starpu_get_env_number("STARPU_MPI_CACHE_STATS");
 	if (stats_enabled == -1)
 	{
 		stats_enabled = 0;
 	}
-	if (stats_enabled == 0) return;
+	if (stats_enabled == 0)
+		return;
 
-	if (!_starpu_silent) fprintf(stderr,"Warning: StarPU is executed with STARPU_MPI_CACHE_STATS=1, which slows down a bit\n");
+	_STARPU_DISP("Warning: StarPU is executed with STARPU_MPI_CACHE_STATS=1, which slows down a bit\n");
 
-	MPI_Comm_size(comm, &world_size);
-	_STARPU_MPI_DEBUG(1, "allocating for %d nodes\n", world_size);
-
-	comm_cache_amount = (size_t *) calloc(world_size, sizeof(size_t));
 }
 
-void _starpu_mpi_cache_stats_free()
+void _starpu_mpi_cache_stats_shutdown()
 {
-	if (stats_enabled == 0) return;
-	free(comm_cache_amount);
+	if (stats_enabled == 0)
+		return;
 }
 
 void _starpu_mpi_cache_stats_update(unsigned dst, starpu_data_handle_t data_handle, int count)
 {
 	size_t size;
 
-	if (stats_enabled == 0) return;
+	if (stats_enabled == 0)
+		return;
 
 	size = starpu_data_get_size(data_handle);
 
@@ -63,7 +58,5 @@ void _starpu_mpi_cache_stats_update(unsigned dst, starpu_data_handle_t data_hand
 	{
 		_STARPU_MPI_MSG("[communication cache] - %10ld from %d\n", (long)size, dst);
 	}
-
-	comm_cache_amount[dst] += count * size;
 }
 

+ 3 - 3
nmad/src/starpu_mpi_cache_stats.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2014, 2015  Centre National de la Recherche Scientifique
+ * Copyright (C) 2014, 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -26,8 +26,8 @@ extern "C"
 {
 #endif
 
-void _starpu_mpi_cache_stats_init(MPI_Comm comm);
-void _starpu_mpi_cache_stats_free();
+void _starpu_mpi_cache_stats_init();
+void _starpu_mpi_cache_stats_shutdown();
 
 void _starpu_mpi_cache_stats_update(unsigned dst, starpu_data_handle_t data_handle, int count);
 

+ 51 - 58
nmad/src/starpu_mpi_collective.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011, 2012, 2013, 2015  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -39,61 +39,81 @@ void _callback_collective(void *arg)
 	}
 }
 
-int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg)
+static
+int _callback_set(int rank, starpu_data_handle_t *data_handles, int count, int root, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg, void (**callback_func)(void *), struct _callback_arg **callback_arg)
 {
-	int rank;
-	int x;
-	struct _callback_arg *callback_arg = NULL;
-	void (*callback_func)(void *) = NULL;
 	void (*callback)(void *);
 
-	MPI_Comm_rank(comm, &rank);
-
 	callback = (rank == root) ? scallback : rcallback;
-	if (callback)
+	if (*callback)
 	{
-		callback_func = _callback_collective;
-		callback_arg = malloc(sizeof(struct _callback_arg));
-		callback_arg->count = 0;
-		callback_arg->nb = 0;
-		callback_arg->callback = (rank == root) ? scallback : rcallback;
-		callback_arg->arg = (rank == root) ? sarg : rarg;
+		int x;
+
+		*callback_func = _callback_collective;
+
+		_STARPU_MPI_MALLOC(*callback_arg, sizeof(struct _callback_arg));
+		(*callback_arg)->count = 0;
+		(*callback_arg)->nb = 0;
+		(*callback_arg)->callback = (rank == root) ? scallback : rcallback;
+		(*callback_arg)->arg = (rank == root) ? sarg : rarg;
 
 		for(x = 0; x < count ; x++)
 		{
 			if (data_handles[x])
 			{
 				int owner = starpu_mpi_data_get_rank(data_handles[x]);
-				int mpi_tag = starpu_mpi_data_get_tag(data_handles[x]);
-				STARPU_ASSERT_MSG(mpi_tag >= 0, "Invalid tag for data handle");
+				int data_tag = starpu_mpi_data_get_tag(data_handles[x]);
+				STARPU_ASSERT_MSG(data_tag >= 0, "Invalid tag for data handle");
 				if ((rank == root) && (owner != root))
 				{
-					callback_arg->count ++;
+					(*callback_arg)->count ++;
 				}
 				if ((rank != root) && (owner == rank))
 				{
-					callback_arg->count ++;
+					(*callback_arg)->count ++;
 				}
 			}
 		}
+
+		if (!(*callback_arg)->count)
+		{
+			free(*callback_arg);
+			return 1;
+		}
 	}
 
+	return 0;
+}
+
+int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg)
+{
+	int rank;
+	int x;
+	struct _callback_arg *callback_arg = NULL;
+	void (*callback_func)(void *) = NULL;
+
+	starpu_mpi_comm_rank(comm, &rank);
+
+	x = _callback_set(rank, data_handles, count, root, scallback, sarg, rcallback, rarg, &callback_func, &callback_arg);
+	if (x == 1)
+		return 0;
+
 	for(x = 0; x < count ; x++)
 	{
 		if (data_handles[x])
 		{
 			int owner = starpu_mpi_data_get_rank(data_handles[x]);
-			int mpi_tag = starpu_mpi_data_get_tag(data_handles[x]);
-			STARPU_ASSERT_MSG(mpi_tag >= 0, "Invalid tag for data handle");
+			int data_tag = starpu_mpi_data_get_tag(data_handles[x]);
+			STARPU_ASSERT_MSG(data_tag >= 0, "Invalid tag for data handle");
 			if ((rank == root) && (owner != root))
 			{
 				//fprintf(stderr, "[%d] Sending data[%d] to %d\n", rank, x, owner);
-				starpu_mpi_isend_detached(data_handles[x], owner, mpi_tag, comm, callback_func, callback_arg);
+				starpu_mpi_isend_detached(data_handles[x], owner, data_tag, comm, callback_func, callback_arg);
 			}
 			if ((rank != root) && (owner == rank))
 			{
 				//fprintf(stderr, "[%d] Receiving data[%d] from %d\n", rank, x, root);
-				starpu_mpi_irecv_detached(data_handles[x], root, mpi_tag, comm, callback_func, callback_arg);
+				starpu_mpi_irecv_detached(data_handles[x], root, data_tag, comm, callback_func, callback_arg);
 			}
 		}
 	}
@@ -106,56 +126,29 @@ int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, in
 	int x;
 	struct _callback_arg *callback_arg = NULL;
 	void (*callback_func)(void *) = NULL;
-	void (*callback)(void *);
 
-	MPI_Comm_rank(comm, &rank);
-
-	callback = (rank == root) ? scallback : rcallback;
-	if (callback)
-	{
-		callback_func = _callback_collective;
+	starpu_mpi_comm_rank(comm, &rank);
 
-		callback_arg = malloc(sizeof(struct _callback_arg));
-		callback_arg->count = 0;
-		callback_arg->nb = 0;
-		callback_arg->callback = callback;
-		callback_arg->arg = (rank == root) ? sarg : rarg;
-
-		for(x = 0; x < count ; x++)
-		{
-			if (data_handles[x])
-			{
-				int owner = starpu_mpi_data_get_rank(data_handles[x]);
-				int mpi_tag = starpu_mpi_data_get_tag(data_handles[x]);
-				STARPU_ASSERT_MSG(mpi_tag >= 0, "Invalid tag for data handle");
-				if ((rank == root) && (owner != root))
-				{
-					callback_arg->count ++;
-				}
-				if ((rank != root) && (owner == rank))
-				{
-					callback_arg->count ++;
-				}
-			}
-		}
-	}
+	x = _callback_set(rank, data_handles, count, root, scallback, sarg, rcallback, rarg, &callback_func, &callback_arg);
+	if (x == 1)
+		return 0;
 
 	for(x = 0; x < count ; x++)
 	{
 		if (data_handles[x])
 		{
 			int owner = starpu_mpi_data_get_rank(data_handles[x]);
-			int mpi_tag = starpu_mpi_data_get_tag(data_handles[x]);
-			STARPU_ASSERT_MSG(mpi_tag >= 0, "Invalid tag for data handle");
+			int data_tag = starpu_mpi_data_get_tag(data_handles[x]);
+			STARPU_ASSERT_MSG(data_tag >= 0, "Invalid tag for data handle");
 			if ((rank == root) && (owner != root))
 			{
 				//fprintf(stderr, "[%d] Receiving data[%d] from %d\n", rank, x, owner);
-				starpu_mpi_irecv_detached(data_handles[x], owner, mpi_tag, comm, callback_func, callback_arg);
+				starpu_mpi_irecv_detached(data_handles[x], owner, data_tag, comm, callback_func, callback_arg);
 			}
 			if ((rank != root) && (owner == rank))
 			{
 				//fprintf(stderr, "[%d] Sending data[%d] to %d\n", rank, x, root);
-				starpu_mpi_isend_detached(data_handles[x], root, mpi_tag, comm, callback_func, callback_arg);
+				starpu_mpi_isend_detached(data_handles[x], root, data_tag, comm, callback_func, callback_arg);
 			}
 		}
 	}

+ 50 - 68
nmad/src/starpu_mpi_datatype.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2011  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2011, 2015  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -18,10 +18,6 @@
 #include <starpu_mpi_datatype.h>
 #include <common/uthash.h>
 #include <datawizard/coherency.h>
-#include <starpu_mpi_private.h>
-
-typedef void (*handle_to_datatype_func)(starpu_data_handle_t, MPI_Datatype *);
-typedef void (*handle_free_datatype_func)(MPI_Datatype *);
 
 struct _starpu_mpi_datatype_funcs
 {
@@ -44,7 +40,6 @@ void _starpu_mpi_datatype_shutdown(void)
 	STARPU_PTHREAD_MUTEX_DESTROY(&_starpu_mpi_datatype_funcs_table_mutex);
 }
 
-
 /*
  * 	Matrix
  */
@@ -148,28 +143,36 @@ static void handle_to_datatype_void(starpu_data_handle_t data_handle STARPU_ATTR
  *	Generic
  */
 
-static handle_to_datatype_func handle_to_datatype_funcs[STARPU_MAX_INTERFACE_ID] =
+static starpu_mpi_datatype_allocate_func_t handle_to_datatype_funcs[STARPU_MAX_INTERFACE_ID] =
 {
 	[STARPU_MATRIX_INTERFACE_ID]	= handle_to_datatype_matrix,
 	[STARPU_BLOCK_INTERFACE_ID]	= handle_to_datatype_block,
 	[STARPU_VECTOR_INTERFACE_ID]	= handle_to_datatype_vector,
-	[STARPU_CSR_INTERFACE_ID]	= NULL,
-	[STARPU_BCSR_INTERFACE_ID]	= NULL,
+	[STARPU_CSR_INTERFACE_ID]	= NULL, /* Sent through pack/unpack operations */
+	[STARPU_BCSR_INTERFACE_ID]	= NULL, /* Sent through pack/unpack operations */
 	[STARPU_VARIABLE_INTERFACE_ID]	= handle_to_datatype_variable,
 	[STARPU_VOID_INTERFACE_ID]	= handle_to_datatype_void,
 	[STARPU_MULTIFORMAT_INTERFACE_ID] = NULL,
 };
 
-void _starpu_mpi_handle_allocate_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype, int *user_datatype)
+void _starpu_mpi_datatype_allocate(starpu_data_handle_t data_handle, struct _starpu_mpi_req *req)
 {
 	enum starpu_data_interface_id id = starpu_data_get_interface_id(data_handle);
 
 	if (id < STARPU_MAX_INTERFACE_ID)
 	{
-		handle_to_datatype_func func = handle_to_datatype_funcs[id];
-		STARPU_ASSERT_MSG(func, "Handle To Datatype Function not defined for StarPU data interface %d", id);
-		func(data_handle, datatype);
-		*user_datatype = 0;
+		starpu_mpi_datatype_allocate_func_t func = handle_to_datatype_funcs[id];
+		if (func)
+		{
+			func(data_handle, &req->datatype);
+			req->registered_datatype = 1;
+		}
+		else
+		{
+			/* The datatype is predefined by StarPU but it will be sent as a memory area */
+			req->datatype = MPI_BYTE;
+			req->registered_datatype = 0;
+		}
 	}
 	else
 	{
@@ -180,16 +183,27 @@ void _starpu_mpi_handle_allocate_datatype(starpu_data_handle_t data_handle, MPI_
 		if (table)
 		{
 			STARPU_ASSERT_MSG(table->allocate_datatype_func, "Handle To Datatype Function not defined for StarPU data interface %d", id);
-			table->allocate_datatype_func(data_handle, datatype);
-			*user_datatype = 0;
+			table->allocate_datatype_func(data_handle, &req->datatype);
+			req->registered_datatype = 1;
 		}
 		else
 		{
 			/* The datatype is not predefined by StarPU */
-			*datatype = MPI_BYTE;
-			*user_datatype = 1;
+			req->datatype = MPI_BYTE;
+			req->registered_datatype = 0;
 		}
 	}
+#ifdef STARPU_VERBOSE
+	{
+		char datatype_name[MPI_MAX_OBJECT_NAME];
+		int datatype_name_len;
+		MPI_Type_get_name(req->datatype, datatype_name, &datatype_name_len);
+		if (datatype_name_len == 0)
+			req->datatype_name = strdup("User defined datatype");
+		else
+			req->datatype_name = strdup(datatype_name);
+	}
+#endif
 }
 
 static void _starpu_mpi_handle_free_simple_datatype(MPI_Datatype *datatype)
@@ -199,17 +213,20 @@ static void _starpu_mpi_handle_free_simple_datatype(MPI_Datatype *datatype)
 
 static void _starpu_mpi_handle_free_complex_datatype(MPI_Datatype *datatype)
 {
-	int num_ints, num_adds, num_datatypes, combiner, i;
-	int *array_of_ints;
-	MPI_Aint *array_of_adds;
-	MPI_Datatype *array_of_datatypes;
+	int num_ints, num_adds, num_datatypes, combiner;
 
 	MPI_Type_get_envelope(*datatype, &num_ints, &num_adds, &num_datatypes, &combiner);
 	if (combiner != MPI_COMBINER_NAMED)
 	{
-		array_of_ints = (int *) malloc(num_ints * sizeof(int));
-		array_of_adds = (MPI_Aint *) malloc(num_adds * sizeof(MPI_Aint));
-		array_of_datatypes = (MPI_Datatype *) malloc(num_datatypes * sizeof(MPI_Datatype));
+		int *array_of_ints;
+		MPI_Aint *array_of_adds;
+		MPI_Datatype *array_of_datatypes;
+		int i;
+
+		_STARPU_MPI_MALLOC(array_of_ints, num_ints * sizeof(int));
+		_STARPU_MPI_MALLOC(array_of_adds, num_adds * sizeof(MPI_Aint));
+		_STARPU_MPI_MALLOC(array_of_datatypes, num_datatypes * sizeof(MPI_Datatype));
+
 		MPI_Type_get_contents(*datatype, num_ints, num_adds, num_datatypes, array_of_ints, array_of_adds, array_of_datatypes);
 		for(i=0 ; i<num_datatypes ; i++)
 		{
@@ -222,27 +239,27 @@ static void _starpu_mpi_handle_free_complex_datatype(MPI_Datatype *datatype)
 	}
 }
 
-static handle_free_datatype_func handle_free_datatype_funcs[STARPU_MAX_INTERFACE_ID] =
+static starpu_mpi_datatype_free_func_t handle_free_datatype_funcs[STARPU_MAX_INTERFACE_ID] =
 {
 	[STARPU_MATRIX_INTERFACE_ID]	= _starpu_mpi_handle_free_simple_datatype,
 	[STARPU_BLOCK_INTERFACE_ID]	= _starpu_mpi_handle_free_complex_datatype,
 	[STARPU_VECTOR_INTERFACE_ID]	= _starpu_mpi_handle_free_simple_datatype,
-	[STARPU_CSR_INTERFACE_ID]	= NULL,
-	[STARPU_BCSR_INTERFACE_ID]	= NULL,
+	[STARPU_CSR_INTERFACE_ID]	= NULL,  /* Sent through pack/unpack operations */
+	[STARPU_BCSR_INTERFACE_ID]	= NULL,  /* Sent through pack/unpack operations */
 	[STARPU_VARIABLE_INTERFACE_ID]	= _starpu_mpi_handle_free_simple_datatype,
 	[STARPU_VOID_INTERFACE_ID]      = _starpu_mpi_handle_free_simple_datatype,
 	[STARPU_MULTIFORMAT_INTERFACE_ID] = NULL,
 };
 
-void _starpu_mpi_handle_free_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+void _starpu_mpi_datatype_free(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
 {
 	enum starpu_data_interface_id id = starpu_data_get_interface_id(data_handle);
 
 	if (id < STARPU_MAX_INTERFACE_ID)
 	{
-		handle_free_datatype_func func = handle_free_datatype_funcs[id];
-		STARPU_ASSERT_MSG(func, "Handle free datatype function not defined for StarPU data interface %d", id);
-		func(datatype);
+		starpu_mpi_datatype_free_func_t func = handle_free_datatype_funcs[id];
+		if (func)
+			func(datatype);
 	}
 	else
 	{
@@ -260,41 +277,6 @@ void _starpu_mpi_handle_free_datatype(starpu_data_handle_t data_handle, MPI_Data
 	/* else the datatype is not predefined by StarPU */
 }
 
-char *_starpu_mpi_datatype(MPI_Datatype datatype)
-{
-     if (datatype == MPI_DATATYPE_NULL) return "MPI_DATATYPE_NULL";
-     if (datatype == MPI_CHAR) return "MPI_CHAR";
-     if (datatype == MPI_UNSIGNED_CHAR) return "MPI_UNSIGNED_CHAR";
-     if (datatype == MPI_BYTE) return "MPI_BYTE";
-     if (datatype == MPI_SHORT) return "MPI_SHORT";
-     if (datatype == MPI_UNSIGNED_SHORT) return "MPI_UNSIGNED_SHORT";
-     if (datatype == MPI_INT) return "MPI_INT";
-     if (datatype == MPI_UNSIGNED) return "MPI_UNSIGNED";
-     if (datatype == MPI_LONG) return "MPI_LONG";
-     if (datatype == MPI_UNSIGNED_LONG) return "MPI_UNSIGNED_LONG";
-     if (datatype == MPI_FLOAT) return "MPI_FLOAT";
-     if (datatype == MPI_DOUBLE) return "MPI_DOUBLE";
-     if (datatype == MPI_LONG_DOUBLE) return "MPI_LONG_DOUBLE";
-     if (datatype == MPI_LONG_LONG) return "MPI_LONG_LONG";
-     if (datatype == MPI_LONG_INT) return "MPI_LONG_INT";
-     if (datatype == MPI_SHORT_INT) return "MPI_SHORT_INT";
-     if (datatype == MPI_FLOAT_INT) return "MPI_FLOAT_INT";
-     if (datatype == MPI_DOUBLE_INT) return "MPI_DOUBLE_INT";
-     if (datatype == MPI_2INT) return "MPI_2INT";
-     if (datatype == MPI_2DOUBLE_PRECISION) return "MPI_2DOUBLE_PRECISION";
-     if (datatype == MPI_COMPLEX) return "MPI_COMPLEX";
-     if (datatype == MPI_DOUBLE_COMPLEX) return "MPI_DOUBLE_COMPLEX";
-     if (datatype == MPI_LOGICAL) return "MPI_LOGICAL";
-     if (datatype == MPI_REAL) return "MPI_REAL";
-     if (datatype == MPI_REAL4) return "MPI_REAL4";
-     if (datatype == MPI_REAL8) return "MPI_REAL8";
-     if (datatype == MPI_DOUBLE_PRECISION) return "MPI_DOUBLE_PRECISION";
-     if (datatype == MPI_INTEGER) return "MPI_INTEGER";
-     if (datatype == MPI_INTEGER4) return "MPI_INTEGER4";
-     if (datatype == MPI_PACKED) return "MPI_PACKED";
-     return "User defined MPI Datatype";
-}
-
 int starpu_mpi_datatype_register(starpu_data_handle_t handle, starpu_mpi_datatype_allocate_func_t allocate_datatype_func, starpu_mpi_datatype_free_func_t free_datatype_func)
 {
 	enum starpu_data_interface_id id = starpu_data_get_interface_id(handle);

+ 6 - 5
nmad/src/starpu_mpi_datatype.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2011, 2017  Université de Bordeaux
- * Copyright (C) 2010, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2011  Université de Bordeaux
+ * Copyright (C) 2010, 2012, 2013, 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -19,6 +19,7 @@
 #define __STARPU_MPI_DATATYPE_H__
 
 #include <starpu_mpi.h>
+#include <starpu_mpi_private.h>
 
 #ifdef __cplusplus
 extern "C"
@@ -27,9 +28,9 @@ extern "C"
 
 void _starpu_mpi_datatype_init(void);
 void _starpu_mpi_datatype_shutdown(void);
-void _starpu_mpi_handle_allocate_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype, int *user_datatype);
-void _starpu_mpi_handle_free_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype);
-char *_starpu_mpi_datatype(MPI_Datatype datatype);
+
+void _starpu_mpi_datatype_allocate(starpu_data_handle_t data_handle, struct _starpu_mpi_req *req);
+void _starpu_mpi_datatype_free(starpu_data_handle_t data_handle, MPI_Datatype *datatype);
 
 #ifdef __cplusplus
 }

+ 73 - 44
nmad/src/starpu_mpi_fxt.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2017  Université de Bordeaux
- * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -55,69 +55,98 @@ extern "C"
 #define _STARPU_MPI_FUT_TEST_END			0x5220
 
 #ifdef STARPU_USE_FXT
-#define TRACE_MPI_START(rank, worldsize)	\
+#define _STARPU_MPI_TRACE_START(rank, worldsize)	\
 	FUT_DO_PROBE3(_STARPU_MPI_FUT_START, (rank), (worldsize), _starpu_gettid());
-#define TRACE_MPI_STOP(rank, worldsize)	\
+#define _STARPU_MPI_TRACE_STOP(rank, worldsize)	\
 	FUT_DO_PROBE3(_STARPU_MPI_FUT_STOP, (rank), (worldsize), _starpu_gettid());
-#define TRACE_MPI_BARRIER(rank, worldsize, key)	\
+#define _STARPU_MPI_TRACE_BARRIER(rank, worldsize, key)	\
 	FUT_DO_PROBE4(_STARPU_MPI_FUT_BARRIER, (rank), (worldsize), (key), _starpu_gettid());
-#define TRACE_MPI_ISEND_SUBMIT_BEGIN(dest, mpi_tag, size)	\
+#define _STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(dest, mpi_tag, size)	\
 	FUT_DO_PROBE4(_STARPU_MPI_FUT_ISEND_SUBMIT_BEGIN, (dest), (mpi_tag), (size), _starpu_gettid());
-#define TRACE_MPI_ISEND_SUBMIT_END(dest, mpi_tag, size)	\
-	FUT_DO_PROBE4(_STARPU_MPI_FUT_ISEND_SUBMIT_END, (dest), (mpi_tag), (size), _starpu_gettid());
-#define TRACE_MPI_IRECV_SUBMIT_BEGIN(src, mpi_tag)	\
+#define _STARPU_MPI_TRACE_ISEND_SUBMIT_END(dest, mpi_tag, size, jobid)	\
+	FUT_DO_PROBE5(_STARPU_MPI_FUT_ISEND_SUBMIT_END, (dest), (mpi_tag), (size), (jobid), _starpu_gettid());
+#define _STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(src, mpi_tag)	\
 	FUT_DO_PROBE3(_STARPU_MPI_FUT_IRECV_SUBMIT_BEGIN, (src), (mpi_tag), _starpu_gettid());
-#define TRACE_MPI_IRECV_SUBMIT_END(src, mpi_tag)	\
+#define _STARPU_MPI_TRACE_IRECV_SUBMIT_END(src, mpi_tag)	\
 	FUT_DO_PROBE3(_STARPU_MPI_FUT_IRECV_SUBMIT_END, (src), (mpi_tag), _starpu_gettid());
-#define TRACE_MPI_ISEND_COMPLETE_BEGIN(dest, mpi_tag, size)	\
+#define _STARPU_MPI_TRACE_ISEND_COMPLETE_BEGIN(dest, mpi_tag, size)	\
 	FUT_DO_PROBE4(_STARPU_MPI_FUT_ISEND_COMPLETE_BEGIN, (dest), (mpi_tag), (size), _starpu_gettid());
-#define TRACE_MPI_COMPLETE_BEGIN(type, rank, mpi_tag)		\
-	if (type == RECV_REQ) { TRACE_MPI_IRECV_COMPLETE_BEGIN((rank), (mpi_tag)); } else if (type == SEND_REQ) { TRACE_MPI_ISEND_COMPLETE_BEGIN((rank), (mpi_tag), 0); }
-#define TRACE_MPI_ISEND_COMPLETE_END(dest, mpi_tag, size)	\
+#define _STARPU_MPI_TRACE_COMPLETE_BEGIN(type, rank, mpi_tag)		\
+	if (type == RECV_REQ) { _STARPU_MPI_TRACE_IRECV_COMPLETE_BEGIN((rank), (mpi_tag)); } else if (type == SEND_REQ) { _STARPU_MPI_TRACE_ISEND_COMPLETE_BEGIN((rank), (mpi_tag), 0); }
+#define _STARPU_MPI_TRACE_ISEND_COMPLETE_END(dest, mpi_tag, size)	\
 	FUT_DO_PROBE4(_STARPU_MPI_FUT_ISEND_COMPLETE_END, (dest), (mpi_tag), (size), _starpu_gettid());
-#define TRACE_MPI_IRECV_COMPLETE_BEGIN(src, mpi_tag)	\
+#define _STARPU_MPI_TRACE_IRECV_COMPLETE_BEGIN(src, mpi_tag)	\
 	FUT_DO_PROBE3(_STARPU_MPI_FUT_IRECV_COMPLETE_BEGIN, (src), (mpi_tag), _starpu_gettid());
-#define TRACE_MPI_IRECV_COMPLETE_END(src, mpi_tag)	\
+#define _STARPU_MPI_TRACE_IRECV_COMPLETE_END(src, mpi_tag)	\
 	FUT_DO_PROBE3(_STARPU_MPI_FUT_IRECV_COMPLETE_END, (src), (mpi_tag), _starpu_gettid());
-#define TRACE_MPI_COMPLETE_END(type, rank, mpi_tag)		\
-	if (type == RECV_REQ) { TRACE_MPI_IRECV_COMPLETE_END((rank), (mpi_tag)); } else if (type == SEND_REQ) { TRACE_MPI_ISEND_COMPLETE_END((rank), (mpi_tag), 0); }
-#define TRACE_MPI_SLEEP_BEGIN()	\
+#define _STARPU_MPI_TRACE_COMPLETE_END(type, rank, mpi_tag)		\
+	if (type == RECV_REQ) { _STARPU_MPI_TRACE_IRECV_COMPLETE_END((rank), (mpi_tag)); } else if (type == SEND_REQ) { _STARPU_MPI_TRACE_ISEND_COMPLETE_END((rank), (mpi_tag), 0); }
+#define _STARPU_MPI_TRACE_TERMINATED(req, rank, mpi_tag)		\
+	if ((req)->request_type == RECV_REQ) FUT_DO_PROBE4(_STARPU_MPI_FUT_IRECV_TERMINATED, (rank), (mpi_tag), (req)->post_sync_jobid, _starpu_gettid()); else \
+	if ((req)->request_type == SEND_REQ) FUT_DO_PROBE3(_STARPU_MPI_FUT_ISEND_TERMINATED, (rank), (mpi_tag), _starpu_gettid());
+#define _STARPU_MPI_TRACE_SLEEP_BEGIN()	\
 	FUT_DO_PROBE1(_STARPU_MPI_FUT_SLEEP_BEGIN, _starpu_gettid());
-#define TRACE_MPI_SLEEP_END()	\
+#define _STARPU_MPI_TRACE_SLEEP_END()	\
 	FUT_DO_PROBE1(_STARPU_MPI_FUT_SLEEP_END, _starpu_gettid());
-#define TRACE_MPI_DTESTING_BEGIN()	\
+#define _STARPU_MPI_TRACE_DTESTING_BEGIN()	\
 	FUT_DO_PROBE1(_STARPU_MPI_FUT_DTESTING_BEGIN,  _starpu_gettid());
-#define TRACE_MPI_DTESTING_END()	\
+#define _STARPU_MPI_TRACE_DTESTING_END()	\
 	FUT_DO_PROBE1(_STARPU_MPI_FUT_DTESTING_END, _starpu_gettid());
-#define TRACE_MPI_UTESTING_BEGIN(src, mpi_tag)	\
+#define _STARPU_MPI_TRACE_UTESTING_BEGIN(src, mpi_tag)	\
 	FUT_DO_PROBE3(_STARPU_MPI_FUT_UTESTING_BEGIN, (src), (mpi_tag),  _starpu_gettid());
-#define TRACE_MPI_UTESTING_END(src, mpi_tag)	\
+#define _STARPU_MPI_TRACE_UTESTING_END(src, mpi_tag)	\
 	FUT_DO_PROBE3(_STARPU_MPI_FUT_UTESTING_END, (src), (mpi_tag), _starpu_gettid());
-#define TRACE_MPI_UWAIT_BEGIN(src, mpi_tag)	\
+#define _STARPU_MPI_TRACE_UWAIT_BEGIN(src, mpi_tag)	\
 	FUT_DO_PROBE3(_STARPU_MPI_FUT_UWAIT_BEGIN, (src), (mpi_tag),  _starpu_gettid());
-#define TRACE_MPI_UWAIT_END(src, mpi_tag)	\
+#define _STARPU_MPI_TRACE_UWAIT_END(src, mpi_tag)	\
 	FUT_DO_PROBE3(_STARPU_MPI_FUT_UWAIT_END, (src), (mpi_tag), _starpu_gettid());
+#define _STARPU_MPI_TRACE_DATA_SET_RANK(handle, rank)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_DATA_SET_RANK, (handle), (rank), _starpu_gettid());
+#if 0
+/* This is very expensive in the trace, only enable for debugging */
+#define _STARPU_MPI_TRACE_TESTING_DETACHED_BEGIN()	\
+	FUT_DO_PROBE1(_STARPU_MPI_FUT_TESTING_DETACHED_BEGIN, _starpu_gettid());
+#define _STARPU_MPI_TRACE_TESTING_DETACHED_END()	\
+	FUT_DO_PROBE1(_STARPU_MPI_FUT_TESTING_DETACHED_END, _starpu_gettid());
+#define _STARPU_MPI_TRACE_TEST_BEGIN(peer, mpi_tag)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_TEST_BEGIN, (peer), (mpi_tag), _starpu_gettid());
+#define _STARPU_MPI_TRACE_TEST_END(peer, mpi_tag)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_TEST_END, (peer), (mpi_tag), _starpu_gettid());
+#else
+#define _STARPU_MPI_TRACE_TESTING_DETACHED_BEGIN()		do {} while(0)
+#define _STARPU_MPI_TRACE_TESTING_DETACHED_END()		do {} while(0)
+#define _STARPU_MPI_TRACE_TEST_BEGIN(peer, mpi_tag)		do {} while(0)
+#define _STARPU_MPI_TRACE_TEST_END(peer, mpi_tag)		do {} while(0)
+#endif
 #define TRACE
 #else
-#define TRACE_MPI_START(a, b)				do {} while(0);
-#define TRACE_MPI_STOP(a, b)				do {} while(0);
-#define TRACE_MPI_BARRIER(a, b, c)			do {} while(0);
-#define TRACE_MPI_ISEND_SUBMIT_BEGIN(a, b, c)		do {} while(0);
-#define TRACE_MPI_ISEND_SUBMIT_END(a, b, c)		do {} while(0);
-#define TRACE_MPI_IRECV_SUBMIT_BEGIN(a, b)		do {} while(0);
-#define TRACE_MPI_IRECV_SUBMIT_END(a, b)		do {} while(0);
-#define TRACE_MPI_ISEND_COMPLETE_BEGIN(a, b, c)		do {} while(0);
-#define TRACE_MPI_ISEND_COMPLETE_END(a, b, c)		do {} while(0);
-#define TRACE_MPI_IRECV_COMPLETE_BEGIN(a, b)		do {} while(0);
-#define TRACE_MPI_IRECV_COMPLETE_END(a, b)		do {} while(0);
-#define TRACE_MPI_SLEEP_BEGIN()				do {} while(0);
-#define TRACE_MPI_SLEEP_END()				do {} while(0);
-#define TRACE_MPI_DTESTING_BEGIN()			do {} while(0);
-#define TRACE_MPI_DTESTING_END()			do {} while(0);
-#define TRACE_MPI_UTESTING_BEGIN(a, b)			do {} while(0);
-#define TRACE_MPI_UTESTING_END(a, b)			do {} while(0);
-#define TRACE_MPI_UWAIT_BEGIN(a, b)			do {} while(0);
-#define TRACE_MPI_UWAIT_END(a, b)			do {} while(0);
+#define _STARPU_MPI_TRACE_START(a, b)				do {} while(0);
+#define _STARPU_MPI_TRACE_STOP(a, b)				do {} while(0);
+#define _STARPU_MPI_TRACE_BARRIER(a, b, c)			do {} while(0);
+#define _STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(a, b, c)		do {} while(0);
+#define _STARPU_MPI_TRACE_ISEND_SUBMIT_END(a, b, c, d)		do {} while(0);
+#define _STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(a, b)		do {} while(0);
+#define _STARPU_MPI_TRACE_IRECV_SUBMIT_END(a, b)		do {} while(0);
+#define _STARPU_MPI_TRACE_ISEND_COMPLETE_BEGIN(a, b, c)		do {} while(0);
+#define _STARPU_MPI_TRACE_COMPLETE_BEGIN(a, b, c)		do {} while(0);
+#define _STARPU_MPI_TRACE_COMPLETE_END(a, b, c)			do {} while(0);
+#define _STARPU_MPI_TRACE_TERMINATED(a, b, c)			do {} while(0);
+#define _STARPU_MPI_TRACE_ISEND_COMPLETE_END(a, b, c)		do {} while(0);
+#define _STARPU_MPI_TRACE_IRECV_COMPLETE_BEGIN(a, b)		do {} while(0);
+#define _STARPU_MPI_TRACE_IRECV_COMPLETE_END(a, b)		do {} while(0);
+#define _STARPU_MPI_TRACE_SLEEP_BEGIN()				do {} while(0);
+#define _STARPU_MPI_TRACE_SLEEP_END()				do {} while(0);
+#define _STARPU_MPI_TRACE_DTESTING_BEGIN()			do {} while(0);
+#define _STARPU_MPI_TRACE_DTESTING_END()			do {} while(0);
+#define _STARPU_MPI_TRACE_UTESTING_BEGIN(a, b)			do {} while(0);
+#define _STARPU_MPI_TRACE_UTESTING_END(a, b)			do {} while(0);
+#define _STARPU_MPI_TRACE_UWAIT_BEGIN(a, b)			do {} while(0);
+#define _STARPU_MPI_TRACE_UWAIT_END(a, b)			do {} while(0);
+#define _STARPU_MPI_TRACE_DATA_SET_RANK(a, b)			do {} while(0);
+#define _STARPU_MPI_TRACE_TESTING_DETACHED_BEGIN()		do {} while(0)
+#define _STARPU_MPI_TRACE_TESTING_DETACHED_END()		do {} while(0)
+#define _STARPU_MPI_TRACE_TEST_BEGIN(peer, mpi_tag)		do {} while(0)
+#define _STARPU_MPI_TRACE_TEST_END(peer, mpi_tag)		do {} while(0)
 #endif
 
 #ifdef __cplusplus

+ 2 - 2
nmad/src/starpu_mpi_helper.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2017  Université de Bordeaux
- * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2015, 2017  Université de Bordeaux
+ * Copyright (C) 2010, 2012, 2014, 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 233 - 0
nmad/src/starpu_mpi_init.c

@@ -0,0 +1,233 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010-2016  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
+ * Copyright (C) 2016  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdlib.h>
+#include <starpu_mpi.h>
+#include <starpu_mpi_datatype.h>
+#include <starpu_mpi_private.h>
+#include <starpu_mpi_cache.h>
+#include <starpu_profiling.h>
+#include <starpu_mpi_stats.h>
+#include <starpu_mpi_cache.h>
+#include <starpu_mpi_sync_data.h>
+#include <starpu_mpi_early_data.h>
+#include <starpu_mpi_early_request.h>
+#include <starpu_mpi_select_node.h>
+#include <starpu_mpi_tag.h>
+#include <common/config.h>
+#include <common/thread.h>
+#include <datawizard/interfaces/data_interface.h>
+#include <datawizard/coherency.h>
+#include <core/simgrid.h>
+#include <core/task.h>
+
+#ifdef STARPU_SIMGRID
+static int _mpi_world_size;
+static int _mpi_world_rank;
+#endif
+
+static void _starpu_mpi_print_thread_level_support(int thread_level, char *msg)
+{
+	switch (thread_level)
+	{
+		case MPI_THREAD_SERIALIZED:
+		{
+			_STARPU_DEBUG("MPI%s MPI_THREAD_SERIALIZED; Multiple threads may make MPI calls, but only one at a time.\n", msg);
+			break;
+		}
+		case MPI_THREAD_FUNNELED:
+		{
+			_STARPU_DISP("MPI%s MPI_THREAD_FUNNELED; The application can safely make calls to StarPU-MPI functions, but should not call directly MPI communication functions.\n", msg);
+			break;
+		}
+		case MPI_THREAD_SINGLE:
+		{
+			_STARPU_DISP("MPI%s MPI_THREAD_SINGLE; MPI does not have multi-thread support, this might cause problems. The application can make calls to StarPU-MPI functions, but not call directly MPI Communication functions.\n", msg);
+			break;
+		}
+	}
+}
+
/* Initialize (or query, if the application already did it) the underlying MPI
 * library, and record the rank and world size of argc_argv->comm into
 * argc_argv. Call order matters: the error handler and the rank/size queries
 * must happen after MPI is up. */
void _starpu_mpi_do_initialize(struct _starpu_mpi_argc_argv *argc_argv)
{
	if (argc_argv->initialize_mpi)
	{
		int thread_support;
		_STARPU_DEBUG("Calling MPI_Init_thread\n");
		/* StarPU-MPI serializes its own MPI calls, so
		 * MPI_THREAD_SERIALIZED is the level requested here. */
		if (MPI_Init_thread(argc_argv->argc, argc_argv->argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS)
		{
			_STARPU_ERROR("MPI_Init_thread failed\n");
		}
		_starpu_mpi_print_thread_level_support(thread_support, "_Init_thread level =");
	}
	else
	{
		/* MPI was initialized by the application: only check which
		 * thread-support level it provides and warn if insufficient. */
		int provided;
		MPI_Query_thread(&provided);
		_starpu_mpi_print_thread_level_support(provided, " has been initialized with");
	}

	MPI_Comm_rank(argc_argv->comm, &argc_argv->rank);
	MPI_Comm_size(argc_argv->comm, &argc_argv->world_size);
	/* Make MPI calls return error codes instead of aborting the job. */
	MPI_Comm_set_errhandler(argc_argv->comm, MPI_ERRORS_RETURN);

#ifdef STARPU_SIMGRID
	/* Cache rank/size for starpu_mpi_comm_rank()/starpu_mpi_comm_size()
	 * in simgrid mode. */
	_mpi_world_size = argc_argv->world_size;
	_mpi_world_rank = argc_argv->rank;
#endif
}
+
/* Common initialization path: allocate the argc/argv structure handed over to
 * the progression thread and start it. Returns the status of
 * _starpu_mpi_progress_init(). Ownership of argc_argv is transferred to the
 * progression machinery. */
static
int _starpu_mpi_initialize(int *argc, char ***argv, int initialize_mpi, MPI_Comm comm)
{
	struct _starpu_mpi_argc_argv *argc_argv;
	_STARPU_MALLOC(argc_argv, sizeof(struct _starpu_mpi_argc_argv));
	argc_argv->initialize_mpi = initialize_mpi;
	argc_argv->argc = argc;
	argc_argv->argv = argv;
	argc_argv->comm = comm;

#ifdef STARPU_SIMGRID
	/* Call MPI_Init_thread as early as possible, to initialize simgrid
	 * before working with mutexes etc. */
	_starpu_mpi_do_initialize(argc_argv);
#endif

	return _starpu_mpi_progress_init(argc_argv);
}
+
#ifdef STARPU_SIMGRID
/* This is called before application's main, to initialize SMPI before we can
 * create MSG processes to run application's main.
 * Forces initialize_mpi=1 on MPI_COMM_WORLD. */
int _starpu_mpi_simgrid_init(int argc, char *argv[])
{
	return _starpu_mpi_initialize(&argc, &argv, 1, MPI_COMM_WORLD);
}
#endif
+
/* Public initialization entry point with an explicit communicator.
 * In simgrid mode, MPI has already been initialized before main() (see
 * _starpu_mpi_simgrid_init), so we only wait for that initialization. */
int starpu_mpi_init_comm(int *argc STARPU_ATTRIBUTE_UNUSED, char ***argv STARPU_ATTRIBUTE_UNUSED, int initialize_mpi STARPU_ATTRIBUTE_UNUSED, MPI_Comm comm STARPU_ATTRIBUTE_UNUSED)
{
#ifdef STARPU_SIMGRID
	_starpu_mpi_wait_for_initialization();
	return 0;
#else
	return _starpu_mpi_initialize(argc, argv, initialize_mpi, comm);
#endif
}
+
/* Public initialization entry point; same as starpu_mpi_init_comm() on
 * MPI_COMM_WORLD. */
int starpu_mpi_init(int *argc, char ***argv, int initialize_mpi)
{
	return starpu_mpi_init_comm(argc, argv, initialize_mpi, MPI_COMM_WORLD);
}
+
/* Older initialization entry point: MPI is assumed to be already initialized
 * by the application (initialize_mpi=0). No-op in simgrid mode. */
int starpu_mpi_initialize(void)
{
#ifdef STARPU_SIMGRID
	return 0;
#else
	return _starpu_mpi_initialize(NULL, NULL, 0, MPI_COMM_WORLD);
#endif
}
+
/* Older initialization entry point: initialize MPI ourselves
 * (initialize_mpi=1) and return the rank and size of MPI_COMM_WORLD through
 * the output parameters. Returns the initialization status (0 on success). */
int starpu_mpi_initialize_extended(int *rank, int *world_size)
{
#ifdef STARPU_SIMGRID
	/* Values cached by _starpu_mpi_do_initialize() in simgrid mode. */
	*world_size = _mpi_world_size;
	*rank = _mpi_world_rank;
	return 0;
#else
	int ret;

	ret = _starpu_mpi_initialize(NULL, NULL, 1, MPI_COMM_WORLD);
	if (ret == 0)
	{
		_STARPU_DEBUG("Calling MPI_Comm_rank\n");
		MPI_Comm_rank(MPI_COMM_WORLD, rank);
		MPI_Comm_size(MPI_COMM_WORLD, world_size);
	}
	return ret;
#endif
}
+
/* Shut StarPU-MPI down: stop the progression thread, dump communication
 * statistics and release the cache. Always returns 0. */
int starpu_mpi_shutdown(void)
{
	int value;
	int rank, world_size;

	/* We need to get the rank before calling MPI_Finalize to pass to _starpu_mpi_comm_amounts_display() */
	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
	starpu_mpi_comm_size(MPI_COMM_WORLD, &world_size);

	/* kill the progression thread */
	_starpu_mpi_progress_shutdown(&value);

	_STARPU_MPI_TRACE_STOP(rank, world_size);

	/* Display and free the communication statistics, then the cache. */
	_starpu_mpi_comm_amounts_display(stderr, rank);
	_starpu_mpi_comm_amounts_shutdown();
	_starpu_mpi_cache_shutdown(world_size);

	return 0;
}
+
/* Store in *size the number of nodes in communicator comm.
 * When a fake world size has been set (_starpu_mpi_fake_world_size != -1,
 * presumably for testing without real communications — see its uses in this
 * file), return it instead of querying MPI.
 * Returns 0 on success, or the MPI_Comm_size() error code otherwise. */
int starpu_mpi_comm_size(MPI_Comm comm, int *size)
{
	if (_starpu_mpi_fake_world_size != -1)
	{
		*size = _starpu_mpi_fake_world_size;
		return 0;
	}
#ifdef STARPU_SIMGRID
	STARPU_MPI_ASSERT_MSG(comm == MPI_COMM_WORLD, "StarPU-SMPI only works with MPI_COMM_WORLD for now");
	*size = _mpi_world_size;
	return 0;
#else
	return MPI_Comm_size(comm, size);
#endif
}
+
/* Store in *rank the rank of the calling node in communicator comm.
 * When a fake rank has been set (_starpu_mpi_fake_world_rank != -1), return
 * it instead of querying MPI.
 * Returns 0 on success, or the MPI_Comm_rank() error code otherwise. */
int starpu_mpi_comm_rank(MPI_Comm comm, int *rank)
{
	if (_starpu_mpi_fake_world_rank != -1)
	{
		*rank = _starpu_mpi_fake_world_rank;
		return 0;
	}
#ifdef STARPU_SIMGRID
	STARPU_MPI_ASSERT_MSG(comm == MPI_COMM_WORLD, "StarPU-SMPI only works with MPI_COMM_WORLD for now");
	*rank = _mpi_world_rank;
	return 0;
#else
	return MPI_Comm_rank(comm, rank);
#endif
}
+
+int starpu_mpi_world_size(void)
+{
+	int size;
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
+	return size;
+}
+
+int starpu_mpi_world_rank(void)
+{
+	int rank;
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+	return rank;
+}
+

+ 35 - 0
nmad/src/starpu_mpi_init.h

@@ -0,0 +1,35 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2012-2015  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
#ifndef __STARPU_MPI_INIT_H__
#define __STARPU_MPI_INIT_H__

#include <starpu.h>
#include <starpu_mpi.h>

#ifdef __cplusplus
extern "C"
{
#endif

/* Initialize (or query) the underlying MPI library and record the rank and
 * world size of argc_argv->comm into argc_argv. */
void _starpu_mpi_do_initialize(struct _starpu_mpi_argc_argv *argc_argv);

#ifdef __cplusplus
}
#endif

#endif // __STARPU_MPI_INIT_H__

+ 283 - 200
nmad/src/starpu_mpi.c

@@ -16,6 +16,7 @@
  */
 
 #include <stdlib.h>
+#include <limits.h>
 #include <starpu_mpi.h>
 #include <starpu_mpi_datatype.h>
 #include <starpu_mpi_private.h>
@@ -24,11 +25,16 @@
 #include <starpu_mpi_stats.h>
 #include <starpu_mpi_cache.h>
 #include <starpu_mpi_select_node.h>
+#include <starpu_mpi_tag.h>
+#include <starpu_mpi_comm.h>
+#include <starpu_mpi_init.h>
 #include <common/config.h>
 #include <common/thread.h>
 #include <datawizard/coherency.h>
 #include <nm_sendrecv_interface.h>
 #include <nm_mpi_nmad.h>
+#include <core/task.h>
+#include <core/topology.h>
 
 static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req,nm_sr_event_t event);
 #ifdef STARPU_VERBOSE
@@ -43,9 +49,16 @@ static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t dat
 static void _starpu_mpi_handle_new_request(void *arg);
 
 static void _starpu_mpi_handle_pending_request(struct _starpu_mpi_req *req);
+static void _starpu_mpi_add_sync_point_in_fxt(void);
+
+static int mpi_thread_cpuid = -1;
+int _starpu_mpi_fake_world_size = -1;
+int _starpu_mpi_fake_world_rank = -1;
 
 /* Condition to wake up waiting for all current MPI requests to finish */
 static starpu_pthread_t progress_thread;
+static starpu_pthread_cond_t progress_cond;
+static starpu_pthread_mutex_t progress_mutex;
 static volatile int running = 0;
 
 /* Count requests posted by the application and not yet submitted to MPI, i.e pushed into the new_requests list */
@@ -54,13 +67,73 @@ static volatile int pending_request = 0;
 
 #define REQ_FINALIZED 0x1
 
-
-
 PUK_LFSTACK_TYPE(callback,	struct _starpu_mpi_req *req;);
 static callback_lfstack_t callback_stack = NULL;
 
 static starpu_sem_t callback_sem;
 
+static void _starpu_mpi_request_init(struct _starpu_mpi_req **req)
+{
+	_STARPU_MPI_CALLOC(*req, 1, sizeof(struct _starpu_mpi_req));
+
+	/* Initialize the request structure */
+	(*req)->data_handle = NULL;
+	(*req)->prio = 0;
+	(*req)->completed = 0;
+
+	(*req)->datatype = 0;
+	(*req)->datatype_name = NULL;
+	(*req)->ptr = NULL;
+	(*req)->count = -1;
+	(*req)->registered_datatype = -1;
+
+	(*req)->node_tag.rank = -1;
+	(*req)->node_tag.data_tag = -1;
+	(*req)->node_tag.comm = 0;
+
+	(*req)->func = NULL;
+
+	(*req)->status = NULL;
+	//	(*req)->data_request = 0;
+	(*req)->flag = NULL;
+
+	(*req)->ret = -1;
+	piom_cond_init(&((*req)->req_cond), 0);
+	//STARPU_PTHREAD_MUTEX_INIT(&((*req)->req_mutex), NULL);
+	STARPU_PTHREAD_COND_INIT(&((*req)->req_cond), NULL);
+	//	STARPU_PTHREAD_MUTEX_INIT(&((*req)->posted_mutex), NULL);
+	//STARPU_PTHREAD_COND_INIT(&((*req)->posted_cond), NULL);
+
+	(*req)->request_type = UNKNOWN_REQ;
+
+	(*req)->submitted = 0;
+	(*req)->completed = 0;
+	(*req)->posted = 0;
+
+	//(*req)->other_request = NULL;
+
+	(*req)->sync = 0;
+	(*req)->detached = -1;
+	(*req)->callback = NULL;
+	(*req)->callback_arg = NULL;
+
+	//	(*req)->size_req = 0;
+	//(*req)->internal_req = NULL;
+	//(*req)->is_internal_req = 0;
+	//(*req)->to_destroy = 1;
+	//(*req)->early_data_handle = NULL;
+	//(*req)->envelope = NULL;
+	(*req)->sequential_consistency = 1;
+	(*req)->pre_sync_jobid = -1;
+	(*req)->post_sync_jobid = -1;
+
+#ifdef STARPU_SIMGRID
+	starpu_pthread_queue_init(&((*req)->queue));
+	starpu_pthread_queue_register(&wait, &((*req)->queue));
+	(*req)->done = 0;
+#endif
+}
+
 static void _starpu_mpi_request_destroy(struct _starpu_mpi_req *req)
 {
 	piom_cond_destroy(&(req->req_cond));
@@ -73,6 +146,11 @@ static void _starpu_mpi_request_destroy(struct _starpu_mpi_req *req)
 /*                                                      */
 /********************************************************/
 
/* Acquire callback used when communications are skipped (fake world size):
 * immediately release the data handle passed as argument. */
static void nop_acquire_cb(void *arg)
{
	starpu_data_release(arg);
}
+
 static struct _starpu_mpi_req *_starpu_mpi_isend_irecv_common(starpu_data_handle_t data_handle,
 							      int srcdst, int data_tag, MPI_Comm comm,
 							      unsigned detached, unsigned sync, int prio, void (*callback)(void *), void *arg,
@@ -81,32 +159,34 @@ static struct _starpu_mpi_req *_starpu_mpi_isend_irecv_common(starpu_data_handle
 							      int sequential_consistency)
 {
 
-	_STARPU_MPI_LOG_IN();
-	struct _starpu_mpi_req *req = calloc(1, sizeof(struct _starpu_mpi_req));
-	STARPU_ASSERT_MSG(req, "Invalid request");
+	struct _starpu_mpi_req *req;
+
+	if (_starpu_mpi_fake_world_size != -1)
+	{
+		/* Don't actually do the communication */
+		starpu_data_acquire_on_node_cb_sequential_consistency(data_handle, STARPU_MAIN_RAM, mode, nop_acquire_cb, data_handle, sequential_consistency);
+		return NULL;
+	}
 
+	_STARPU_MPI_LOG_IN();
 	STARPU_ATOMIC_ADD( &pending_request, 1);
 
 	/* Initialize the request structure */
-	req->completed = 0;
-	piom_cond_init(&req->req_cond, 0);
-
+	_starpu_mpi_request_init(&req);
 	req->request_type = request_type;
+	/* prio_list is sorted by increasing values */
 	req->prio = prio;
-	req->user_datatype = -1;
-	req->count = -1;
 	req->data_handle = data_handle;
-	req->srcdst = srcdst;
-	req->mpi_tag = data_tag;
-	req->comm = comm;
-	nm_mpi_nmad_dest(&req->session, &req->gate, comm, req->srcdst);
-
+	req->node_tag.rank = srcdst;
+	req->node_tag.data_tag = data_tag;
+	req->node_tag.comm = comm;
 	req->detached = detached;
 	req->sync = sync;
 	req->callback = callback;
 	req->callback_arg = arg;
-
 	req->func = func;
+	req->sequential_consistency = sequential_consistency;
+	nm_mpi_nmad_dest(&req->session, &req->gate, comm, req->node_tag.rank);
 
 	/* Asynchronously request StarPU to fetch the data in main memory: when
 	 * it is available in main memory, _starpu_mpi_submit_new_mpi_request(req) is called and
@@ -127,11 +207,11 @@ static void _starpu_mpi_isend_data_func(struct _starpu_mpi_req *req)
 {
 	_STARPU_MPI_LOG_IN();
 
-	_STARPU_MPI_DEBUG(2, "post MPI isend request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
+	_STARPU_MPI_DEBUG(30, "post MPI isend request %p type %s tag %d src %d data %p datasize %ld ptr %p datatype '%s' count %d registered_datatype %d sync %d\n", req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.rank, req->data_handle, starpu_data_get_size(req->data_handle), req->ptr, req->datatype_name, (int)req->count, req->registered_datatype, req->sync);
 
-	_starpu_mpi_comm_amounts_inc(req->comm, req->srcdst, req->datatype, req->count);
+	_starpu_mpi_comm_amounts_inc(req->node_tag.comm, req->node_tag.rank, req->datatype, req->count);
 
-	TRACE_MPI_ISEND_SUBMIT_BEGIN(req->srcdst, req->mpi_tag, 0);
+	_STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(req->node_tag.rank, req->node_tag.data_tag, 0);
 
 	struct nm_data_s data;
 	nm_mpi_nmad_data(&data, (void*)req->ptr, req->datatype, req->count);
@@ -141,17 +221,16 @@ static void _starpu_mpi_isend_data_func(struct _starpu_mpi_req *req)
 
 	if (req->sync == 0)
 	{
-		req->ret = nm_sr_send_isend(req->session, &(req->request), req->gate, req->mpi_tag);
-
+		req->ret = nm_sr_send_isend(req->session, &(req->request), req->gate, req->node_tag.data_tag);
 		STARPU_ASSERT_MSG(req->ret == NM_ESUCCESS, "MPI_Isend returning %d", req->ret);
 	}
 	else
 	{
-		req->ret = nm_sr_send_issend(req->session, &(req->request), req->gate, req->mpi_tag);
+		req->ret = nm_sr_send_issend(req->session, &(req->request), req->gate, req->node_tag.data_tag);
 		STARPU_ASSERT_MSG(req->ret == NM_ESUCCESS, "MPI_Issend returning %d", req->ret);
 	}
 
-	TRACE_MPI_ISEND_SUBMIT_END(req->srcdst, req->mpi_tag, starpu_data_get_size(req->data_handle));
+	_STARPU_MPI_TRACE_ISEND_SUBMIT_END(req->node_tag.rank, req->node_tag.data_tag, starpu_data_get_size(req->data_handle), req->pre_sync_jobid);
 
 	_starpu_mpi_handle_pending_request(req);
 
@@ -160,8 +239,9 @@ static void _starpu_mpi_isend_data_func(struct _starpu_mpi_req *req)
 
 static void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req)
 {
-	_starpu_mpi_handle_allocate_datatype(req->data_handle, &req->datatype, &req->user_datatype);
-	if (req->user_datatype == 0)
+	_starpu_mpi_datatype_allocate(req->data_handle, req);
+
+	if (req->registered_datatype == 1)
 	{
 		req->waited = 1;
 		req->count = 1;
@@ -179,12 +259,12 @@ static void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req)
 		if (psize != -1)
 		{
 			// We already know the size of the data, let's send it to overlap with the packing of the data
-			_STARPU_MPI_DEBUG(1, "Sending size %ld (%ld %s) with tag %d to node %d (first call to pack)\n", psize, sizeof(req->count), _starpu_mpi_datatype(MPI_BYTE), req->mpi_tag, req->srcdst);			req->count = psize;
+			_STARPU_MPI_DEBUG(20, "Sending size %ld (%ld %s) to node %d (first call to pack)\n", psize, sizeof(req->count), "MPI_BYTE", req->node_tag.rank);
+			req->count = psize;
 			//ret = nm_sr_isend(nm_mpi_communicator_get_session(p_req->p_comm),nm_mpi_communicator_get_gate(p_comm,req->srcdst), req->mpi_tag,&req->count, sizeof(req->count), &req->size_req);
-			ret = nm_sr_isend(req->session,req->gate, req->mpi_tag,&req->count, sizeof(req->count), &req->size_req);
+			ret = nm_sr_isend(req->session,req->gate, req->node_tag.data_tag,&req->count, sizeof(req->count), &req->size_req);
 
-
-		//	ret = MPI_Isend(&req->count, sizeof(req->count), MPI_BYTE, req->srcdst, req->mpi_tag, req->comm, &req->size_req);
+			//	ret = MPI_Isend(&req->count, sizeof(req->count), MPI_BYTE, req->srcdst, req->mpi_tag, req->comm, &req->size_req);
 			STARPU_ASSERT_MSG(ret == NM_ESUCCESS, "when sending size, nm_sr_isend returning %d", ret);
 		}
 
@@ -193,8 +273,8 @@ static void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req)
 		if (psize == -1)
 		{
 			// We know the size now, let's send it
-			_STARPU_MPI_DEBUG(1, "Sending size %ld (%ld %s) with tag %d to node %d (second call to pack)\n", req->count, sizeof(req->count), _starpu_mpi_datatype(MPI_BYTE), req->mpi_tag, req->srcdst);
-			ret = nm_sr_isend(req->session,req->gate, req->mpi_tag,&req->count, sizeof(req->count), &req->size_req);
+			_STARPU_MPI_DEBUG(1, "Sending size %ld (%ld %s) with tag %d to node %d (second call to pack)\n", req->count, sizeof(req->count), "MPI_BYTE", req->node_tag.data_tag, req->node_tag.rank);
+			ret = nm_sr_isend(req->session,req->gate, req->node_tag.data_tag,&req->count, sizeof(req->count), &req->size_req);
 			STARPU_ASSERT_MSG(ret == NM_ESUCCESS, "when sending size, nm_sr_isend returning %d", ret);
 		}
 		else
@@ -221,9 +301,9 @@ int starpu_mpi_isend_prio(starpu_data_handle_t data_handle, starpu_mpi_req *publ
 	STARPU_MPI_ASSERT_MSG(public_req, "starpu_mpi_isend needs a valid starpu_mpi_req");
 
 	struct _starpu_mpi_req *req;
-	TRACE_MPI_ISEND_COMPLETE_BEGIN(dest, data_tag, 0);
+	_STARPU_MPI_TRACE_ISEND_COMPLETE_BEGIN(dest, data_tag, 0);
 	req = _starpu_mpi_isend_common(data_handle, dest, data_tag, comm, 0, 0, prio, NULL, NULL);
-	TRACE_MPI_ISEND_COMPLETE_END(dest, data_tag, 0);
+	_STARPU_MPI_TRACE_ISEND_COMPLETE_END(dest, data_tag, 0);
 
 	STARPU_MPI_ASSERT_MSG(req, "Invalid return for _starpu_mpi_isend_common");
 	*public_req = req;
@@ -315,18 +395,18 @@ static void _starpu_mpi_irecv_data_func(struct _starpu_mpi_req *req)
 {
 	_STARPU_MPI_LOG_IN();
 
-	_STARPU_MPI_DEBUG(2, "post MPI irecv request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
+	_STARPU_MPI_DEBUG(20, "post MPI irecv request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.rank, req->data_handle, req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);
 
-	TRACE_MPI_IRECV_SUBMIT_BEGIN(req->srcdst, req->mpi_tag);
+	_STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(req->node_tag.rank, req->node_tag.data_tag);
 
 	//req->ret = MPI_Irecv(req->ptr, req->count, req->datatype, req->srcdst, req->mpi_tag, req->comm, &req->request);
 	struct nm_data_s data;
 	nm_mpi_nmad_data(&data, (void*)req->ptr, req->datatype, req->count);
 	nm_sr_recv_init(req->session, &(req->request));
 	nm_sr_recv_unpack_data(req->session, &(req->request), &data);
-	nm_sr_recv_irecv(req->session, &(req->request), req->gate, req->mpi_tag,NM_TAG_MASK_FULL);
+	nm_sr_recv_irecv(req->session, &(req->request), req->gate, req->node_tag.data_tag, NM_TAG_MASK_FULL);
 
-	TRACE_MPI_IRECV_SUBMIT_END(req->srcdst, req->mpi_tag);
+	_STARPU_MPI_TRACE_IRECV_SUBMIT_END(req->node_tag.rank, req->node_tag.data_tag);
 
 	_starpu_mpi_handle_pending_request(req);
 
@@ -354,8 +434,8 @@ static void _starpu_mpi_irecv_size_func(struct _starpu_mpi_req *req)
 {
 	_STARPU_MPI_LOG_IN();
 
-	_starpu_mpi_handle_allocate_datatype(req->data_handle, &req->datatype, &req->user_datatype);
-	if (req->user_datatype == 0)
+	_starpu_mpi_datatype_allocate(req->data_handle, req);
+	if (req->registered_datatype == 1)
 	{
 		req->count = 1;
 		req->ptr = starpu_data_get_local_ptr(req->data_handle);
@@ -366,8 +446,8 @@ static void _starpu_mpi_irecv_size_func(struct _starpu_mpi_req *req)
 		struct _starpu_mpi_irecv_size_callback *callback = malloc(sizeof(struct _starpu_mpi_irecv_size_callback));
 		callback->req = req;
 		starpu_variable_data_register(&callback->handle, 0, (uintptr_t)&(callback->req->count), sizeof(callback->req->count));
-		_STARPU_MPI_DEBUG(4, "Receiving size with tag %d from node %d\n", req->mpi_tag, req->srcdst);
-		_starpu_mpi_irecv_common(callback->handle, req->srcdst, req->mpi_tag, req->comm, 1, 0, _starpu_mpi_irecv_size_callback, callback,1);
+		_STARPU_MPI_DEBUG(4, "Receiving size with tag %d from node %d\n", req->node_tag.data_tag, req->node_tag.rank);
+		_starpu_mpi_irecv_common(callback->handle, req->node_tag.rank, req->node_tag.data_tag, req->node_tag.comm, 1, 0, _starpu_mpi_irecv_size_callback, callback,1);
 	}
 
 }
@@ -383,9 +463,9 @@ int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_re
 	STARPU_ASSERT_MSG(public_req, "starpu_mpi_irecv needs a valid starpu_mpi_req");
 
 	struct _starpu_mpi_req *req;
-	TRACE_MPI_IRECV_COMPLETE_BEGIN(source, mpi_tag);
+	_STARPU_MPI_TRACE_IRECV_COMPLETE_BEGIN(source, mpi_tag);
 	req = _starpu_mpi_irecv_common(data_handle, source, mpi_tag, comm, 0, 0, NULL, NULL,1);
-	TRACE_MPI_IRECV_COMPLETE_END(source, mpi_tag);
+	_STARPU_MPI_TRACE_IRECV_COMPLETE_END(source, mpi_tag);
 
 	STARPU_ASSERT_MSG(req, "Invalid return for _starpu_mpi_irecv_common");
 	*public_req = req;
@@ -444,10 +524,8 @@ int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 	struct _starpu_mpi_req *req = *public_req;
 	STARPU_MPI_ASSERT_MSG(!req->detached, "MPI_Wait cannot be called on a detached request");
 
-
-/* we must do a test_locked to avoid race condition :
- * without req_cond could still be used and couldn't be freed)*/
-
+	/* we must do a test_locked to avoid race condition :
+	 * without req_cond could still be used and couldn't be freed)*/
 	while (!req->completed || ! piom_cond_test_locked(&(req->req_cond),REQ_FINALIZED))
 	{
 		piom_cond_wait(&(req->req_cond),REQ_FINALIZED);
@@ -468,26 +546,25 @@ int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 /*                                                      */
 /********************************************************/
 
-
 int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 {
-
 	_STARPU_MPI_LOG_IN();
 	STARPU_MPI_ASSERT_MSG(public_req, "starpu_mpi_test needs a valid starpu_mpi_req");
 	struct _starpu_mpi_req *req = *public_req;
 	STARPU_MPI_ASSERT_MSG(!req->detached, "MPI_Test cannot be called on a detached request");
-	_STARPU_MPI_DEBUG(2, "Test request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n",
-			  req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
+	_STARPU_MPI_DEBUG(2, "Test request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
+			  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.rank, req->data_handle, req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);
 
-	TRACE_MPI_UTESTING_BEGIN(req->srcdst, req->mpi_tag);
-
-/* we must do a test_locked to avoid race condition :
- * without req_cond could still be used and couldn't be freed)*/
+	_STARPU_MPI_TRACE_UTESTING_BEGIN(req->node_tag.rank, req->node_tag.data_tag);
 
+	/* we must do a test_locked to avoid race condition :
+	 * without req_cond could still be used and couldn't be freed)*/
 	*flag = req->completed && piom_cond_test_locked(&(req->req_cond),REQ_FINALIZED);
 	if (*flag && status!=MPI_STATUS_IGNORE)
 		_starpu_mpi_req_status(req,status);
-	TRACE_MPI_UTESTING_END(req->srcdst, req->mpi_tag);
+
+	_STARPU_MPI_TRACE_UTESTING_END(req->node_tag.rank, req->node_tag.data_tag);
+
 	if(*flag)
 	{
 		_starpu_mpi_request_destroy(req);
@@ -507,7 +584,7 @@ int starpu_mpi_barrier(MPI_Comm comm)
 {
 	_STARPU_MPI_LOG_IN();
 	int ret;
-//	STARPU_ASSERT_MSG(!barrier_running, "Concurrent starpu_mpi_barrier is not implemented, even on different communicators");
+	//	STARPU_ASSERT_MSG(!barrier_running, "Concurrent starpu_mpi_barrier is not implemented, even on different communicators");
 	ret = MPI_Barrier(comm);
 
 	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Barrier returning %d", ret);
@@ -539,15 +616,14 @@ static char *_starpu_mpi_request_type(enum _starpu_mpi_request_type request_type
 
 static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req,nm_sr_event_t event)
 {
-
 	_STARPU_MPI_LOG_IN();
 
-	_STARPU_MPI_DEBUG(2, "complete MPI request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n",
-			  req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
+	_STARPU_MPI_DEBUG(2, "complete MPI request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
+			  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.rank, req->data_handle, req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);
 
 	if (req->request_type == RECV_REQ || req->request_type == SEND_REQ)
 	{
-		if (req->user_datatype == 1)
+		if (req->registered_datatype == 0)
 		{
 			if (req->request_type == SEND_REQ)
 			{
@@ -569,7 +645,7 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req,n
 		}
 		else
 		{
-			_starpu_mpi_handle_free_datatype(req->data_handle, &req->datatype);
+			_starpu_mpi_datatype_free(req->data_handle, &req->datatype);
 		}
 		starpu_data_release(req->data_handle);
 	}
@@ -614,11 +690,9 @@ void _starpu_mpi_handle_request_termination_callback(nm_sr_event_t event, const
 
 static void _starpu_mpi_handle_pending_request(struct _starpu_mpi_req *req)
 {
-
 	if(req->request_type == SEND_REQ && req->waited>1)
 	{
 		nm_sr_request_set_ref(&(req->size_req), req);
-
 		nm_sr_request_monitor(req->session, &(req->size_req), NM_SR_EVENT_FINALIZED,_starpu_mpi_handle_request_termination_callback);
 	}
 	/* the if must be before, because the first callback can directly free
@@ -635,20 +709,13 @@ static void _starpu_mpi_handle_new_request(void *arg)
 	STARPU_ASSERT_MSG(req, "Invalid request");
 
 	/* submit the request to MPI */
-	_STARPU_MPI_DEBUG(2, "Handling new request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n",
-			  req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
+	_STARPU_MPI_DEBUG(2, "Handling new request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
+			  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.rank, req->data_handle, req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);
 	req->func(req);
 
 	_STARPU_MPI_LOG_OUT();
 }
 
-struct _starpu_mpi_argc_argv
-{
-	int initialize_mpi;
-	int *argc;
-	char ***argv;
-};
-
 static void _starpu_mpi_print_thread_level_support(int thread_level, char *msg)
 {
 	switch (thread_level)
@@ -675,22 +742,61 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 {
 	struct _starpu_mpi_argc_argv *argc_argv = (struct _starpu_mpi_argc_argv *) arg;
 
+	starpu_pthread_setname("MPI");
+
+#ifndef STARPU_SIMGRID
+	if (mpi_thread_cpuid >= 0)
+		_starpu_bind_thread_on_cpu(mpi_thread_cpuid, STARPU_NOWORKERID);
+	_starpu_mpi_do_initialize(argc_argv);
+	if (mpi_thread_cpuid >= 0)
+		/* In case MPI changed the binding */
+		_starpu_bind_thread_on_cpu(mpi_thread_cpuid, STARPU_NOWORKERID);
+#endif
+
+	_starpu_mpi_fake_world_size = starpu_get_env_number("STARPU_MPI_FAKE_SIZE");
+	_starpu_mpi_fake_world_rank = starpu_get_env_number("STARPU_MPI_FAKE_RANK");
+
+#ifdef STARPU_SIMGRID
+	/* Now that MPI is set up, let the rest of simgrid get initialized */
+	char **argv_cpy;
+	_STARPU_MPI_MALLOC(argv_cpy, *(argc_argv->argc) * sizeof(char*));
+	int i;
+	for (i = 0; i < *(argc_argv->argc); i++)
+		argv_cpy[i] = strdup((*(argc_argv->argv))[i]);
+	MSG_process_create_with_arguments("main", smpi_simulated_main_, NULL, _starpu_simgrid_get_host_by_name("MAIN"), *(argc_argv->argc), argv_cpy);
+	/* And set TSD for us */
+	void **tsd;
+	_STARPU_CALLOC(tsd, MAX_TSD + 1, sizeof(void*));
+	if (!smpi_process_set_user_data)
 	{
-		int provided;
-		MPI_Query_thread(&provided);
-		_starpu_mpi_print_thread_level_support(provided, " has been initialized with");
+		_STARPU_ERROR("Your version of simgrid does not provide smpi_process_set_user_data, we can not continue without it\n");
 	}
+	smpi_process_set_user_data(tsd);
+#endif
+
+#ifdef STARPU_USE_FXT
+	_starpu_fxt_wait_initialisation();
+#endif //STARPU_USE_FXT
 
 	{
-	     int rank, worldsize;
-	     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-	     MPI_Comm_size(MPI_COMM_WORLD, &worldsize);
-	     TRACE_MPI_START(rank, worldsize);
+		_STARPU_MPI_TRACE_START(argc_argv->rank, argc_argv->world_size);
 #ifdef STARPU_USE_FXT
-	     starpu_profiling_set_id(rank);
+		starpu_profiling_set_id(argc_argv->rank);
 #endif //STARPU_USE_FXT
 	}
 
+	_starpu_mpi_add_sync_point_in_fxt();
+	_starpu_mpi_comm_amounts_init(argc_argv->comm);
+	_starpu_mpi_cache_init(argc_argv->comm);
+	_starpu_mpi_select_node_init();
+	_starpu_mpi_datatype_init();
+
+	/* notify the main thread that the progression thread is ready */
+	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
+	running = 1;
+	STARPU_PTHREAD_COND_SIGNAL(&progress_cond);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
+
 	while (1)
 	{
 		struct callback_lfstack_cell_s* c = callback_lfstack_pop(&callback_stack);
@@ -712,7 +818,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 				}
 				else
 				{
-					if  (pending_request==0)
+					if (pending_request==0)
 						break;
 				}
 				continue;
@@ -733,12 +839,11 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 		STARPU_ATOMIC_ADD( &pending_request, -1);
 		/* we signal that the request is completed.*/
 
-
 		free(c);
 
 	}
-		STARPU_ASSERT_MSG(callback_lfstack_pop(&callback_stack)==NULL, "List of callback not empty.");
-		STARPU_ASSERT_MSG(pending_request==0, "Request still pending.");
+	STARPU_ASSERT_MSG(callback_lfstack_pop(&callback_stack)==NULL, "List of callback not empty.");
+	STARPU_ASSERT_MSG(pending_request==0, "Request still pending.");
 
 	if (argc_argv->initialize_mpi)
 	{
@@ -768,11 +873,11 @@ static void _starpu_mpi_add_sync_point_in_fxt(void)
 	int worldsize;
 	int ret;
 
-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-	MPI_Comm_size(MPI_COMM_WORLD, &worldsize);
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &worldsize);
 
 	ret = MPI_Barrier(MPI_COMM_WORLD);
-	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Barrier returning %d", ret);
+	STARPU_MPI_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Barrier returning %s", _starpu_mpi_get_mpi_error_code(ret));
 
 	/* We generate a "unique" key so that we can make sure that different
 	 * FxT traces come from the same MPI run. */
@@ -787,102 +892,52 @@ static void _starpu_mpi_add_sync_point_in_fxt(void)
 	}
 
 	ret = MPI_Bcast(&random_number, 1, MPI_INT, 0, MPI_COMM_WORLD);
-	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Bcast returning %d", ret);
+	STARPU_MPI_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Bcast returning %s", _starpu_mpi_get_mpi_error_code(ret));
 
-	TRACE_MPI_BARRIER(rank, worldsize, random_number);
+	_STARPU_MPI_TRACE_BARRIER(rank, worldsize, random_number);
 
 	_STARPU_MPI_DEBUG(3, "unique key %x\n", random_number);
 #endif
 }
 
-static
-int _starpu_mpi_initialize(int *argc, char ***argv, int initialize_mpi)
+int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
 {
-
-	struct _starpu_mpi_argc_argv *argc_argv = malloc(sizeof(struct _starpu_mpi_argc_argv));
-	argc_argv->initialize_mpi = initialize_mpi;
-	argc_argv->argc = argc;
-	argc_argv->argv = argv;
-
-
-	if (initialize_mpi)
-	{
-		int thread_support;
-		_STARPU_DEBUG("Calling MPI_Init_thread\n");
-		if (MPI_Init_thread(argc_argv->argc, argc_argv->argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS)
-		{
-			_STARPU_ERROR("MPI_Init_thread failed\n");
-		}
-		_starpu_mpi_print_thread_level_support(thread_support, "_Init_thread level =");
-	}
+        STARPU_PTHREAD_MUTEX_INIT(&progress_mutex, NULL);
+        STARPU_PTHREAD_COND_INIT(&progress_cond, NULL);
 
 	starpu_sem_init(&callback_sem, 0, 0);
-	running = 1;
+	running = 0;
+	mpi_thread_cpuid = starpu_get_env_number_default("STARPU_MPI_THREAD_CPUID", -1);
 
 	STARPU_PTHREAD_CREATE(&progress_thread, NULL, _starpu_mpi_progress_thread_func, argc_argv);
 
-	_starpu_mpi_add_sync_point_in_fxt();
-	_starpu_mpi_comm_amounts_init(MPI_COMM_WORLD);
-	_starpu_mpi_cache_init(MPI_COMM_WORLD);
-	_starpu_mpi_select_node_init();
-	_starpu_mpi_datatype_init();
-	return 0;
-}
-
-int starpu_mpi_init(int *argc, char ***argv, int initialize_mpi)
-{
-	return _starpu_mpi_initialize(argc, argv, initialize_mpi);
-}
-
-int starpu_mpi_initialize(void)
-{
-	return _starpu_mpi_initialize(NULL, NULL, 0);
-}
-
-int starpu_mpi_initialize_extended(int *rank, int *world_size)
-{
-	int ret;
+        STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
+        while (!running)
+                STARPU_PTHREAD_COND_WAIT(&progress_cond, &progress_mutex);
+        STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
 
-	ret = _starpu_mpi_initialize(NULL, NULL, 1);
-	if (ret == 0)
-	{
-		_STARPU_DEBUG("Calling MPI_Comm_rank\n");
-		MPI_Comm_rank(MPI_COMM_WORLD, rank);
-		MPI_Comm_size(MPI_COMM_WORLD, world_size);
-	}
-	return ret;
+        return 0;
 }
 
-int starpu_mpi_shutdown(void)
+void _starpu_mpi_progress_shutdown(int *value)
 {
-	void *value;
-	int rank, world_size;
-
-	/* We need to get the rank before calling MPI_Finalize to pass to _starpu_mpi_comm_amounts_display() */
-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-	MPI_Comm_size(MPI_COMM_WORLD, &world_size);
-
 	/* kill the progression thread */
-	running = 0;
-	starpu_sem_post(&callback_sem);
+        STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
+        running = 0;
+        STARPU_PTHREAD_COND_BROADCAST(&progress_cond);
+        STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
 
+	starpu_sem_post(&callback_sem);
 
 	starpu_pthread_join(progress_thread, &value);
 
-	TRACE_MPI_STOP(rank, world_size);
-
-
-	_starpu_mpi_comm_amounts_display(rank);
-	_starpu_mpi_comm_amounts_free();
-	_starpu_mpi_cache_free(world_size);
-	_starpu_mpi_datatype_shutdown();
-	return 0;
+        STARPU_PTHREAD_MUTEX_DESTROY(&progress_mutex);
+        STARPU_PTHREAD_COND_DESTROY(&progress_cond);
 }
 
-void _starpu_mpi_clear_cache(starpu_data_handle_t data_handle)
+void _starpu_mpi_data_clear(starpu_data_handle_t data_handle)
 {
-//	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
-	_starpu_mpi_cache_flush(data_handle);
+	_starpu_mpi_cache_data_clear(data_handle);
 	free(data_handle->mpi_data);
 }
 
@@ -895,19 +950,25 @@ void starpu_mpi_data_register_comm(starpu_data_handle_t data_handle, int tag, in
 	}
 	else
 	{
-		mpi_data = malloc(sizeof(struct _starpu_mpi_data));
+		_STARPU_CALLOC(mpi_data, 1, sizeof(struct _starpu_mpi_data));
+		mpi_data->magic = 42;
+		mpi_data->node_tag.data_tag = -1;
+		mpi_data->node_tag.rank = -1;
+		mpi_data->node_tag.comm = MPI_COMM_WORLD;
 		data_handle->mpi_data = mpi_data;
-		_starpu_data_set_unregister_hook(data_handle, _starpu_mpi_clear_cache);
+		_starpu_mpi_cache_data_init(data_handle);
+		_starpu_data_set_unregister_hook(data_handle, _starpu_mpi_data_clear);
 	}
 
 	if (tag != -1)
 	{
-		mpi_data->tag = tag;
+		mpi_data->node_tag.data_tag = tag;
 	}
 	if (rank != -1)
 	{
-		mpi_data->rank = rank;
-		mpi_data->comm = comm;
+		_STARPU_MPI_TRACE_DATA_SET_RANK(data_handle, rank);
+		mpi_data->node_tag.rank = rank;
+		mpi_data->node_tag.comm = comm;
 	}
 }
 
@@ -924,36 +985,13 @@ void starpu_mpi_data_set_tag(starpu_data_handle_t handle, int tag)
 int starpu_mpi_data_get_rank(starpu_data_handle_t data)
 {
 	STARPU_ASSERT_MSG(data->mpi_data, "starpu_mpi_data_register MUST be called for data %p\n", data);
-	return ((struct _starpu_mpi_data *)(data->mpi_data))->rank;
+	return ((struct _starpu_mpi_data *)(data->mpi_data))->node_tag.rank;
 }
 
 int starpu_mpi_data_get_tag(starpu_data_handle_t data)
 {
 	STARPU_ASSERT_MSG(data->mpi_data, "starpu_mpi_data_register MUST be called for data %p\n", data);
-	return ((struct _starpu_mpi_data *)(data->mpi_data))->tag;
-}
-
-
-int starpu_mpi_comm_size(MPI_Comm comm, int *size)
-{
-#ifdef STARPU_SIMGRID
-	STARPU_MPI_ASSERT_MSG(comm == MPI_COMM_WORLD, "StarPU-SMPI only works with MPI_COMM_WORLD for now");
-	*size = _mpi_world_size;
-	return 0;
-#else
-	return MPI_Comm_size(comm, size);
-#endif
-}
-
-int starpu_mpi_comm_rank(MPI_Comm comm, int *rank)
-{
-#ifdef STARPU_SIMGRID
-	STARPU_MPI_ASSERT_MSG(comm == MPI_COMM_WORLD, "StarPU-SMPI only works with MPI_COMM_WORLD for now");
-	*rank = _mpi_world_rank;
-	return 0;
-#else
-	return MPI_Comm_rank(comm, rank);
-#endif
+	return ((struct _starpu_mpi_data *)(data->mpi_data))->node_tag.data_tag;
 }
 
 void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg)
@@ -967,7 +1005,8 @@ void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t da
 	}
 
 	starpu_mpi_comm_rank(comm, &me);
-	if (node == rank) return;
+	if (node == rank)
+		return;
 
 	tag = starpu_mpi_data_get_tag(data_handle);
 	if (tag == -1)
@@ -978,8 +1017,8 @@ void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t da
 	if (me == node)
 	{
 		_STARPU_MPI_DEBUG(1, "Migrating data %p from %d to %d\n", data_handle, rank, node);
-		void* already_received = _starpu_mpi_cache_received_data_set(data_handle);
-		if (already_received == NULL)
+		int already_received = _starpu_mpi_cache_received_data_set(data_handle);
+		if (already_received == 0)
 		{
 			_STARPU_MPI_DEBUG(1, "Receiving data %p from %d\n", data_handle, rank);
 			starpu_mpi_irecv_detached(data_handle, rank, tag, comm, callback, arg);
@@ -988,8 +1027,8 @@ void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t da
 	else if (me == rank)
 	{
 		_STARPU_MPI_DEBUG(1, "Migrating data %p from %d to %d\n", data_handle, rank, node);
-		void* already_sent = _starpu_mpi_cache_sent_data_set(data_handle, node);
-		if (already_sent == NULL)
+		int already_sent = _starpu_mpi_cache_sent_data_set(data_handle, node);
+		if (already_sent == 0)
 		{
 			_STARPU_MPI_DEBUG(1, "Sending data %p to %d\n", data_handle, node);
 			starpu_mpi_isend_detached(data_handle, node, tag, comm, NULL, NULL);
@@ -1008,7 +1047,8 @@ void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle
 	}
 
 	starpu_mpi_comm_rank(comm, &me);
-	if (node == rank) return;
+	if (node == rank)
+		return;
 
 	tag = starpu_mpi_data_get_tag(data_handle);
 	if (tag == -1)
@@ -1020,8 +1060,8 @@ void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle
 	{
 		MPI_Status status;
 		_STARPU_MPI_DEBUG(1, "Migrating data %p from %d to %d\n", data_handle, rank, node);
-		void* already_received = _starpu_mpi_cache_received_data_set(data_handle);
-		if (already_received == NULL)
+		int already_received = _starpu_mpi_cache_received_data_set(data_handle);
+		if (already_received == 0)
 		{
 			_STARPU_MPI_DEBUG(1, "Receiving data %p from %d\n", data_handle, rank);
 			starpu_mpi_recv(data_handle, rank, tag, comm, &status);
@@ -1030,11 +1070,54 @@ void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle
 	else if (me == rank)
 	{
 		_STARPU_MPI_DEBUG(1, "Migrating data %p from %d to %d\n", data_handle, rank, node);
-		void* already_sent = _starpu_mpi_cache_sent_data_set(data_handle, node);
-		if (already_sent == NULL)
+		int already_sent = _starpu_mpi_cache_sent_data_set(data_handle, node);
+		if (already_sent == 0)
 		{
 			_STARPU_MPI_DEBUG(1, "Sending data %p to %d\n", data_handle, node);
 			starpu_mpi_send(data_handle, node, tag, comm);
 		}
 	}
 }
+
+void starpu_mpi_get_data_on_all_nodes_detached(MPI_Comm comm, starpu_data_handle_t data_handle)
+{
+	int size, i;
+	starpu_mpi_comm_size(comm, &size);
+#ifdef STARPU_DEVEL
+#warning TODO: use binary communication tree to optimize broadcast
+#endif
+	for (i = 0; i < size; i++)
+		starpu_mpi_get_data_on_node_detached(comm, data_handle, i, NULL, NULL);
+}
+
+void starpu_mpi_data_migrate(MPI_Comm comm, starpu_data_handle_t data, int new_rank)
+{
+	int old_rank = starpu_mpi_data_get_rank(data);
+	if (new_rank == old_rank)
+		/* Already there */
+		return;
+
+	/* First submit data migration if it's not already on destination */
+	starpu_mpi_get_data_on_node_detached(comm, data, new_rank, NULL, NULL);
+
+	/* And note new owner */
+	starpu_mpi_data_set_rank_comm(data, new_rank, comm);
+
+	/* Flush cache in all other nodes */
+	/* TODO: Ideally we'd transmit the knowledge of who owns it */
+	starpu_mpi_cache_flush(comm, data);
+	return;
+}
+
+int starpu_mpi_wait_for_all(MPI_Comm comm)
+{
+	int mpi = 1;
+	int task = 1;
+	while (task || mpi)
+	{
+		task = _starpu_task_wait_for_all_and_return_nb_waited_tasks();
+		mpi = starpu_mpi_barrier(comm);
+	}
+	return 0;
+}
+

+ 32 - 5
nmad/src/starpu_mpi_private.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012, 2014  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012, 2014-2016  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -15,11 +15,38 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include <starpu_mpi_private.h>
+
 int _starpu_debug_rank=-1;
-int _starpu_debug_level=0;
+int _starpu_debug_level_min=0;
+int _starpu_debug_level_max=0;
+int _starpu_mpi_tag = 42;
+int _starpu_mpi_comm_debug;
+
+void _starpu_mpi_set_debug_level_min(int level)
+{
+	_starpu_debug_level_min = level;
+}
+
+void _starpu_mpi_set_debug_level_max(int level)
+{
+	_starpu_debug_level_max = level;
+}
+
+int starpu_mpi_get_communication_tag(void)
+{
+	return _starpu_mpi_tag;
+}
 
-void _starpu_mpi_set_debug_level(int level)
+void starpu_mpi_set_communication_tag(int tag)
 {
-	_starpu_debug_level = level;
+	_starpu_mpi_tag = tag;
 }
 
+char *_starpu_mpi_get_mpi_error_code(int code)
+{
+	static char str[MPI_MAX_OBJECT_NAME];
+	int len;
+	MPI_Error_string(code, str, &len);
+	return str;
+}

+ 115 - 35
nmad/src/starpu_mpi_private.h

@@ -20,9 +20,12 @@
 
 #include <starpu.h>
 #include <common/config.h>
-#include "starpu_mpi.h"
-#include "starpu_mpi_fxt.h"
+#include <common/uthash.h>
+#include <starpu_mpi.h>
+#include <starpu_mpi_fxt.h>
 #include <common/list.h>
+#include <common/prio_list.h>
+#include <core/simgrid.h>
 #include <pioman.h>
 #include <nm_sendrecv_interface.h>
 #include <nm_session_interface.h>
@@ -32,12 +35,34 @@ extern "C"
 {
 #endif
 
+#ifdef STARPU_SIMGRID
+starpu_pthread_wait_t wait;
+starpu_pthread_queue_t dontsleep;
+
+struct _starpu_simgrid_mpi_req
+{
+	MPI_Request *request;
+	MPI_Status *status;
+	starpu_pthread_queue_t *queue;
+	unsigned *done;
+};
+
+int _starpu_mpi_simgrid_mpi_test(unsigned *done, int *flag);
+void _starpu_mpi_simgrid_wait_req(MPI_Request *request, 	MPI_Status *status, starpu_pthread_queue_t *queue, unsigned *done);
+#endif
+
 extern int _starpu_debug_rank;
+char *_starpu_mpi_get_mpi_error_code(int code);
+extern int _starpu_mpi_comm_debug;
 
 #ifdef STARPU_VERBOSE
-extern int _starpu_debug_level;
-void _starpu_mpi_set_debug_level(int level);
+extern int _starpu_debug_level_min;
+extern int _starpu_debug_level_max;
+void _starpu_mpi_set_debug_level_min(int level);
+void _starpu_mpi_set_debug_level_max(int level);
 #endif
+extern int _starpu_mpi_fake_world_size;
+extern int _starpu_mpi_fake_world_rank;
 
 #ifdef STARPU_NO_ASSERT
 #  define STARPU_MPI_ASSERT_MSG(x, msg, ...)	do { if (0) { (void) (x); }} while(0)
@@ -67,42 +92,63 @@ int _starpu_debug_rank;
 
 #  endif
 #endif
-	
+
 #define _STARPU_MPI_MALLOC(ptr, size) do { ptr = malloc(size); STARPU_MPI_ASSERT_MSG(ptr != NULL, "Cannot allocate %ld bytes\n", (long) size); } while (0)
 #define _STARPU_MPI_CALLOC(ptr, nmemb, size) do { ptr = calloc(nmemb, size); STARPU_MPI_ASSERT_MSG(ptr != NULL, "Cannot allocate %ld bytes\n", (long) (nmemb*size)); } while (0)
-#define _STARPU_MPI_REALLOC(ptr, size) do { ptr = realloc(ptr, size); STARPU_MPI_ASSERT_MSG(ptr != NULL, "Cannot reallocate %ld bytes\n", (long) size); } while (0)
+#define _STARPU_MPI_REALLOC(ptr, size) do { void *_new_ptr = realloc(ptr, size); STARPU_MPI_ASSERT_MSG(_new_ptr != NULL, "Cannot reallocate %ld bytes\n", (long) size); ptr = _new_ptr; } while (0)
 
 #ifdef STARPU_VERBOSE
+#  define _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, node, tag, utag, comm, way) \
+	do								\
+	{							\
+	     	if (_starpu_mpi_comm_debug)			\
+		{					\
+     			int __size;			\
+			char _comm_name[128];		\
+			int _comm_name_len;		\
+			int _rank;			    \
+			starpu_mpi_comm_rank(comm, &_rank); \
+			MPI_Type_size(datatype, &__size);		\
+			MPI_Comm_get_name(comm, _comm_name, &_comm_name_len); \
+			fprintf(stderr, "[%d][starpu_mpi] :%d:%s:%d:%d:%d:%s:%p:%ld:%d:%s:%d\n", _rank, _rank, way, node, tag, utag, _comm_name, ptr, count, __size, __starpu_func__ , __LINE__); \
+			fflush(stderr);					\
+		}							\
+	} while(0);
+#  define _STARPU_MPI_COMM_TO_DEBUG(ptr, count, datatype, dest, tag, utag, comm) 	    _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, dest, tag, utag, comm, "-->")
+#  define _STARPU_MPI_COMM_FROM_DEBUG(ptr, count, datatype, source, tag, utag, comm)  _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, source, tag, utag, comm, "<--")
 #  define _STARPU_MPI_DEBUG(level, fmt, ...) \
 	do \
 	{								\
-		if (!_starpu_silent && level <= _starpu_debug_level)	\
+		if (!_starpu_silent && _starpu_debug_level_min <= level && level <= _starpu_debug_level_max)	\
 		{							\
-			if (_starpu_debug_rank == -1) MPI_Comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
-			fprintf(stderr, "%*s[%d][starpu_mpi][%s] " fmt , (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__ ,## __VA_ARGS__); \
+			if (_starpu_debug_rank == -1) starpu_mpi_comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
+			fprintf(stderr, "%*s[%d][starpu_mpi][%s:%d] " fmt , (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__ , __LINE__,## __VA_ARGS__); \
 			fflush(stderr); \
 		}			\
 	} while(0);
 #else
-#  define _STARPU_MPI_DEBUG(level, fmt, ...)
+#  define _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, node, tag, utag, comm, way)  do { } while(0)
+#  define _STARPU_MPI_COMM_TO_DEBUG(ptr, count, datatype, dest, tag, utag, comm)     do { } while(0)
+#  define _STARPU_MPI_COMM_FROM_DEBUG(ptr, count, datatype, source, tag, utag, comm) do { } while(0)
+#  define _STARPU_MPI_DEBUG(level, fmt, ...)		do { } while(0)
 #endif
 
 #define _STARPU_MPI_DISP(fmt, ...) do { if (!_starpu_silent) { \
-	       				     if (_starpu_debug_rank == -1) MPI_Comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
-                                             fprintf(stderr, "%*s[%d][starpu_mpi][%s] " fmt , (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__ ,## __VA_ARGS__); \
+	       				     if (_starpu_debug_rank == -1) starpu_mpi_comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
+                                             fprintf(stderr, "%*s[%d][starpu_mpi][%s:%d] " fmt , (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__ , __LINE__ ,## __VA_ARGS__); \
                                              fflush(stderr); }} while(0);
-#define _STARPU_MPI_MSG(fmt, ...) do { if (_starpu_debug_rank == -1) MPI_Comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
+#define _STARPU_MPI_MSG(fmt, ...) do { if (_starpu_debug_rank == -1) starpu_mpi_comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
                                              fprintf(stderr, "[%d][starpu_mpi][%s:%d] " fmt , _starpu_debug_rank, __starpu_func__ , __LINE__ ,## __VA_ARGS__); \
                                              fflush(stderr); } while(0);
 
-#ifdef STARPU_VERBOSE0
+#ifdef STARPU_VERBOSE
 #  define _STARPU_MPI_LOG_IN()             do { if (!_starpu_silent) { \
-                                               if (_starpu_debug_rank == -1) MPI_Comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank);                        \
-                                               fprintf(stderr, "%*s[%d][starpu_mpi][%s] -->\n", (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__ ); \
+                                               if (_starpu_debug_rank == -1) starpu_mpi_comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank);                        \
+                                               fprintf(stderr, "%*s[%d][starpu_mpi][%s:%d] -->\n", (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__ , __LINE__); \
                                                fflush(stderr); }} while(0)
 #  define _STARPU_MPI_LOG_OUT()            do { if (!_starpu_silent) { \
-                                               if (_starpu_debug_rank == -1) MPI_Comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank);                        \
-                                               fprintf(stderr, "%*s[%d][starpu_mpi][%s] <--\n", (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__ ); \
+                                               if (_starpu_debug_rank == -1) starpu_mpi_comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank);                        \
+                                               fprintf(stderr, "%*s[%d][starpu_mpi][%s:%d] <--\n", (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__, __LINE__ ); \
                                                fflush(stderr); }} while(0)
 #else
 #  define _STARPU_MPI_LOG_IN()
@@ -116,7 +162,23 @@ enum _starpu_mpi_request_type
 	WAIT_REQ=2,
 	TEST_REQ=3,
 	BARRIER_REQ=4,
-	PROBE_REQ=5
+	PROBE_REQ=5,
+	UNKNOWN_REQ=6,
+};
+
+struct _starpu_mpi_node_tag
+{
+	MPI_Comm comm;
+	int rank;
+	int data_tag;
+};
+
+struct _starpu_mpi_data
+{
+	int magic;
+	struct _starpu_mpi_node_tag node_tag;
+	int *cache_sent;
+	int cache_received;
 };
 
 LIST_TYPE(_starpu_mpi_req,
@@ -127,15 +189,14 @@ LIST_TYPE(_starpu_mpi_req,
 
 	/* description of the data to be sent/received */
 	MPI_Datatype datatype;
+	char *datatype_name;
 	void *ptr;
 	starpu_ssize_t count;
-	int user_datatype;
+	int registered_datatype;
 
 	/* who are we talking to ? */
+	struct _starpu_mpi_node_tag node_tag;
 	nm_gate_t gate;
-	MPI_Comm comm;
-	int mpi_tag;
-	int srcdst;
 	nm_session_t session;
 
 	void (*func)(struct _starpu_mpi_req *);
@@ -152,36 +213,55 @@ LIST_TYPE(_starpu_mpi_req,
 
 	unsigned submitted;
 	unsigned completed;
-
+	unsigned posted;
 
 	/* in the case of detached requests */
-	unsigned detached;
+	int detached;
 	void *callback_arg;
 	void (*callback)(void *);
 
         /* in the case of user-defined datatypes, we need to send the size of the data */
 	nm_sr_request_t size_req;
 
+	int sequential_consistency;
+
 	long pre_sync_jobid;
 	long post_sync_jobid;
 
-
 	int waited;
+
+#ifdef STARPU_SIMGRID
+        MPI_Status status_store;
+	starpu_pthread_queue_t queue;
+	unsigned done;
+#endif
 );
+PRIO_LIST_TYPE(_starpu_mpi_req, prio)
 
-struct _starpu_mpi_data
+struct _starpu_mpi_argc_argv
 {
-	int tag;
-	int rank;
+	int initialize_mpi;
+	int *argc;
+	char ***argv;
 	MPI_Comm comm;
+	int fargc;	// Fortran argc
+	char **fargv;	// Fortran argv
+	int rank;
+	int world_size;
 };
 
-#define _starpu_mpi_req_status(PUBLIC_REQ,STATUS) do {\
-  STATUS->MPI_SOURCE=PUBLIC_REQ->srcdst; /**< field name mandatory by spec */\
-  STATUS->MPI_TAG=PUBLIC_REQ->mpi_tag;    /**< field name mandatory by spec */\
-  STATUS->MPI_ERROR=PUBLIC_REQ->ret;  /**< field name mandatory by spec */\
-  STATUS->size=PUBLIC_REQ->count;       /**< size of data received */\
-  STATUS->cancelled=0;  /**< whether request was cancelled */\
+void _starpu_mpi_progress_shutdown(int *value);
+int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv);
+#ifdef STARPU_SIMGRID
+void _starpu_mpi_wait_for_initialization();
+#endif
+
+#define _starpu_mpi_req_status(PUBLIC_REQ,STATUS) do {			\
+	STATUS->MPI_SOURCE=PUBLIC_REQ->node_tag.rank; /**< field name mandatory by spec */ \
+	STATUS->MPI_TAG=PUBLIC_REQ->node_tag.data_tag;    /**< field name mandatory by spec */ \
+	STATUS->MPI_ERROR=PUBLIC_REQ->ret;  /**< field name mandatory by spec */ \
+	STATUS->size=PUBLIC_REQ->count;       /**< size of data received */ \
+	STATUS->cancelled=0;  /**< whether request was cancelled */	\
 } while(0)
 
 #ifdef __cplusplus

+ 19 - 14
nmad/src/starpu_mpi_stats.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2012, 2013, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -32,19 +32,21 @@ void _starpu_mpi_comm_amounts_init(MPI_Comm comm)
 		stats_enabled = 0;
 	}
 
-	if (stats_enabled == 0) return;
+	if (stats_enabled == 0)
+		return;
 
-	if (!_starpu_silent) fprintf(stderr,"Warning: StarPU is executed with STARPU_COMM_STATS=1, which slows down a bit\n");
+	_STARPU_DISP("Warning: StarPU is executed with STARPU_COMM_STATS=1, which slows down a bit\n");
 
-	MPI_Comm_size(comm, &world_size);
+	starpu_mpi_comm_size(comm, &world_size);
 	_STARPU_MPI_DEBUG(1, "allocating for %d nodes\n", world_size);
 
-	comm_amount = (size_t *) calloc(world_size, sizeof(size_t));
+	_STARPU_MPI_CALLOC(comm_amount, world_size, sizeof(size_t));
 }
 
-void _starpu_mpi_comm_amounts_free()
+void _starpu_mpi_comm_amounts_shutdown()
 {
-	if (stats_enabled == 0) return;
+	if (stats_enabled == 0)
+		return;
 	free(comm_amount);
 }
 
@@ -52,9 +54,10 @@ void _starpu_mpi_comm_amounts_inc(MPI_Comm comm, unsigned dst, MPI_Datatype data
 {
 	int src, size;
 
-	if (stats_enabled == 0) return;
+	if (stats_enabled == 0)
+		return;
 
-	MPI_Comm_rank(comm, &src);
+	starpu_mpi_comm_rank(comm, &src);
 	MPI_Type_size(datatype, &size);
 
 	_STARPU_MPI_DEBUG(1, "[%d] adding %d to %d\n", src, count*size, dst);
@@ -64,29 +67,31 @@ void _starpu_mpi_comm_amounts_inc(MPI_Comm comm, unsigned dst, MPI_Datatype data
 
 void starpu_mpi_comm_amounts_retrieve(size_t *comm_amounts)
 {
-	if (stats_enabled == 0) return;
+	if (stats_enabled == 0)
+		return;
 	memcpy(comm_amounts, comm_amount, world_size * sizeof(size_t));
 }
 
-void _starpu_mpi_comm_amounts_display(int node)
+void _starpu_mpi_comm_amounts_display(FILE *stream, int node)
 {
 	int dst;
 	size_t sum = 0;
 
-	if (stats_enabled == 0) return;
+	if (stats_enabled == 0)
+		return;
 
 	for (dst = 0; dst < world_size; dst++)
 	{
 		sum += comm_amount[dst];
 	}
 
-	fprintf(stderr, "\n[starpu_comm_stats][%d] TOTAL:\t%f B\t%f MB\n", node, (float)sum, (float)sum/1024/1024);
+	fprintf(stream, "\n[starpu_comm_stats][%d] TOTAL:\t%f B\t%f MB\n", node, (float)sum, (float)sum/1024/1024);
 
 	for (dst = 0; dst < world_size; dst++)
 	{
 		if (comm_amount[dst])
 		{
-			fprintf(stderr, "[starpu_comm_stats][%d->%d]\t%f B\t%f MB\n",
+			fprintf(stream, "[starpu_comm_stats][%d->%d]\t%f B\t%f MB\n",
 				node, dst, (float)comm_amount[dst], ((float)comm_amount[dst])/(1024*1024));
 		}
 	}

+ 6 - 4
nmad/src/starpu_mpi_stats.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2012, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -17,17 +17,19 @@
 #ifndef __STARPU_MPI_STATS_H__
 #define __STARPU_MPI_STATS_H__
 
+#include <stdio.h>
 #include <stdlib.h>
 #include <mpi.h>
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
 void _starpu_mpi_comm_amounts_init(MPI_Comm comm);
-void _starpu_mpi_comm_amounts_free();
+void _starpu_mpi_comm_amounts_shutdown();
 void _starpu_mpi_comm_amounts_inc(MPI_Comm comm, unsigned dst, MPI_Datatype datatype, int count);
-void _starpu_mpi_comm_amounts_display(int node);
+void _starpu_mpi_comm_amounts_display(FILE *stream, int node);
 
 #ifdef __cplusplus
 }

+ 2 - 2
nmad/src/starpu_mpi_task_insert.c

@@ -34,9 +34,9 @@
 
 #define _SEND_DATA(data, mode, dest, data_tag, prio, comm, callback, arg)     \
 	do {									\
-	if (mode & STARPU_SSEND)					\
+		if (mode & STARPU_SSEND)					\
 			starpu_mpi_issend_detached_prio(data, dest, data_tag, prio, comm, callback, arg); 	\
-	else								\
+		else												\
 			starpu_mpi_isend_detached_prio(data, dest, data_tag, prio, comm, callback, arg);	\
 	} while (0)
 

+ 463 - 0
nmad/src/starpu_mpi_task_insert_fortran.c

@@ -0,0 +1,463 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2016, 2017  CNRS
+ * Copyright (C) 2016 Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdlib.h>
+#include <starpu_mpi.h>
+#include <common/config.h>
+#include <starpu_mpi_private.h>
+#include <starpu_mpi_task_insert.h>
+#include <starpu_mpi_select_node.h>
+#include <util/starpu_task_insert_utils.h>
+
+#ifdef HAVE_MPI_COMM_F2C
+static
+int _fstarpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nodes, int *xrank, int *do_execute, struct starpu_data_descr **descrs_p, int *nb_data_p, int *prio_p, void **arglist)
+{
+	int arg_i = 0;
+	int inconsistent_execute = 0;
+	int node_selected = 0;
+	int nb_allocated_data = 16;
+	struct starpu_data_descr *descrs;
+	int nb_data;
+	int prio = 0;
+	int select_node_policy = STARPU_MPI_NODE_SELECTION_CURRENT_POLICY;
+
+	_STARPU_TRACE_TASK_MPI_DECODE_START();
+
+	_STARPU_MPI_MALLOC(descrs, nb_allocated_data * sizeof(struct starpu_data_descr));
+	nb_data = 0;
+	*do_execute = -1;
+	*xrank = -1;
+
+	while (arglist[arg_i] != NULL)
+	{
+		int arg_type = (int)(intptr_t)arglist[arg_i];
+		int arg_type_nocommute = arg_type & ~STARPU_COMMUTE;
+
+		if (arg_type==STARPU_EXECUTE_ON_NODE)
+		{
+			arg_i++;
+			*xrank = *(int *)arglist[arg_i];
+			if (node_selected == 0)
+			{
+				_STARPU_MPI_DEBUG(100, "Executing on node %d\n", *xrank);
+				*do_execute = 1;
+				node_selected = 1;
+				inconsistent_execute = 0;
+			}
+		}
+		else if (arg_type==STARPU_EXECUTE_ON_DATA)
+		{
+			arg_i++;
+			starpu_data_handle_t data = arglist[arg_i];
+			if (node_selected == 0)
+			{
+				*xrank = starpu_mpi_data_get_rank(data);
+				STARPU_ASSERT_MSG(*xrank != -1, "Rank of the data must be set using starpu_mpi_data_register() or starpu_data_set_rank()");
+				_STARPU_MPI_DEBUG(100, "Executing on data node %d\n", *xrank);
+				STARPU_ASSERT_MSG(*xrank <= nb_nodes, "Node %d to execute codelet is not a valid node (%d)", *xrank, nb_nodes);
+				*do_execute = 1;
+				node_selected = 1;
+				inconsistent_execute = 0;
+			}
+		}
+		else if (arg_type_nocommute & STARPU_R || arg_type_nocommute & STARPU_W || arg_type_nocommute & STARPU_RW || arg_type & STARPU_SCRATCH || arg_type & STARPU_REDUX)
+		{
+			arg_i++;
+			starpu_data_handle_t data = arglist[arg_i];
+			enum starpu_data_access_mode mode = (enum starpu_data_access_mode) arg_type;
+			if (node_selected == 0)
+			{
+				int ret = _starpu_mpi_find_executee_node(data, mode, me, do_execute, &inconsistent_execute, xrank);
+				if (ret == -EINVAL)
+				{
+					free(descrs);
+					_STARPU_TRACE_TASK_MPI_DECODE_END();
+					return ret;
+				}
+			}
+			if (nb_data >= nb_allocated_data)
+			{
+				nb_allocated_data *= 2;
+				_STARPU_MPI_REALLOC(descrs, nb_allocated_data * sizeof(struct starpu_data_descr));
+			}
+			descrs[nb_data].handle = data;
+			descrs[nb_data].mode = mode;
+			nb_data ++;
+		}
+		else if (arg_type == STARPU_DATA_ARRAY)
+		{
+			arg_i++;
+			starpu_data_handle_t *datas = arglist[arg_i];
+			arg_i++;
+			int nb_handles = *(int *)arglist[arg_i];
+			int i;
+
+			for(i=0 ; i<nb_handles ; i++)
+			{
+				STARPU_ASSERT_MSG(codelet->nbuffers == STARPU_VARIABLE_NBUFFERS || nb_data < codelet->nbuffers, "Too many data passed to starpu_mpi_task_insert");
+				enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(codelet, nb_data);
+				if (node_selected == 0)
+				{
+					int ret = _starpu_mpi_find_executee_node(datas[i], mode, me, do_execute, &inconsistent_execute, xrank);
+					if (ret == -EINVAL)
+					{
+						free(descrs);
+						_STARPU_TRACE_TASK_MPI_DECODE_END();
+						return ret;
+					}
+				}
+				if (nb_data >= nb_allocated_data)
+				{
+					nb_allocated_data *= 2;
+					_STARPU_MPI_REALLOC(descrs, nb_allocated_data * sizeof(struct starpu_data_descr));
+				}
+				descrs[nb_data].handle = datas[i];
+				descrs[nb_data].mode = mode;
+				nb_data ++;
+			}
+		}
+		else if (arg_type == STARPU_DATA_MODE_ARRAY)
+		{
+			arg_i++;
+			struct starpu_data_descr *_descrs = arglist[arg_i];
+			arg_i++;
+			int nb_handles = *(int *)arglist[arg_i];
+			int i;
+
+			for(i=0 ; i<nb_handles ; i++)
+			{
+				enum starpu_data_access_mode mode = _descrs[i].mode;
+				if (node_selected == 0)
+				{
+					int ret = _starpu_mpi_find_executee_node(_descrs[i].handle, mode, me, do_execute, &inconsistent_execute, xrank);
+					if (ret == -EINVAL)
+					{
+						free(descrs);
+						_STARPU_TRACE_TASK_MPI_DECODE_END();
+						return ret;
+					}
+				}
+				if (nb_data >= nb_allocated_data)
+				{
+					nb_allocated_data *= 2;
+					_STARPU_MPI_REALLOC(descrs, nb_allocated_data * sizeof(struct starpu_data_descr));
+				}
+				descrs[nb_data].handle = _descrs[i].handle;
+				descrs[nb_data].mode = mode;
+				nb_data ++;
+			}
+		}
+		else if (arg_type==STARPU_VALUE)
+		{
+			arg_i++;
+			/* void* */
+			arg_i++;
+			/* size_t */
+		}
+		else if (arg_type==STARPU_CL_ARGS)
+		{
+			arg_i++;
+			/* void* */
+			arg_i++;
+			/* size_t */
+		}
+		else if (arg_type==STARPU_CALLBACK)
+		{
+			arg_i++;
+			/* _starpu_callback_func_t */
+		}
+		else if (arg_type==STARPU_CALLBACK_WITH_ARG)
+		{
+			arg_i++;
+			/* _starpu_callback_func_t */
+			arg_i++;
+			/* void* */
+		}
+		else if (arg_type==STARPU_CALLBACK_ARG)
+		{
+			arg_i++;
+			/* void* */
+		}
+		else if (arg_type==STARPU_PRIORITY)
+		{
+			prio = *(int *)arglist[arg_i];
+			arg_i++;
+			/* int* */
+		}
+		/* STARPU_EXECUTE_ON_NODE handled above */
+		/* STARPU_EXECUTE_ON_DATA handled above */
+		/* STARPU_DATA_ARRAY handled above */
+		/* STARPU_DATA_MODE_ARRAY handled above */
+		else if (arg_type==STARPU_TAG)
+		{
+			arg_i++;
+			/* starpu_tag_t* */
+		}
+		else if (arg_type==STARPU_HYPERVISOR_TAG)
+		{
+			arg_i++;
+			/* int* */
+		}
+		else if (arg_type==STARPU_FLOPS)
+		{
+			arg_i++;
+			/* double* */
+		}
+		else if (arg_type==STARPU_SCHED_CTX)
+		{
+			arg_i++;
+			/* unsigned* */
+		}
+		else if (arg_type==STARPU_PROLOGUE_CALLBACK)
+                {
+			arg_i++;
+			/* _starpu_callback_func_t */
+		}
+                else if (arg_type==STARPU_PROLOGUE_CALLBACK_ARG)
+                {
+			arg_i++;
+			/* void* */
+                }
+                else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP)
+                {
+			arg_i++;
+			/* _starpu_callback_func_t */
+                }
+                else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP_ARG)
+                {
+			arg_i++;
+			/* void* */
+		}
+		else if (arg_type==STARPU_EXECUTE_WHERE)
+		{
+			arg_i++;
+			/* int* */
+		}
+		else if (arg_type==STARPU_EXECUTE_ON_WORKER)
+		{
+			arg_i++;
+			/* int* */
+		}
+		else if (arg_type==STARPU_TAG_ONLY)
+		{
+			arg_i++;
+			/* starpu_tag_t* */
+		}
+		else if (arg_type==STARPU_NAME)
+		{
+			arg_i++;
+			/* char* */
+		}
+		else if (arg_type==STARPU_POSSIBLY_PARALLEL)
+		{
+			arg_i++;
+			/* unsigned* */
+		}
+		else if (arg_type==STARPU_WORKER_ORDER)
+		{
+			arg_i++;
+			/* unsigned* */
+		}
+		else if (arg_type==STARPU_NODE_SELECTION_POLICY)
+		{
+			arg_i++;
+			/* int* */
+		}
+		else
+		{
+			STARPU_ABORT_MSG("Unrecognized argument %d, did you perhaps forget to end arguments with 0?\n", arg_type);
+		}
+
+		arg_i++;
+	}
+
+	if (inconsistent_execute == 1 || *xrank == -1)
+	{
+		// We need to find out which node is going to execute the codelet.
+		_STARPU_MPI_DISP("Different nodes are owning W data. The node to execute the codelet is going to be selected with the current selection node policy. See starpu_mpi_node_selection_set_current_policy() to change the policy, or use STARPU_EXECUTE_ON_NODE or STARPU_EXECUTE_ON_DATA to specify the node\n");
+		*xrank = _starpu_mpi_select_node(me, nb_nodes, descrs, nb_data, select_node_policy);
+		*do_execute = *xrank == STARPU_MPI_PER_NODE || (me == *xrank);
+	}
+	else
+	{
+		_STARPU_MPI_DEBUG(100, "Inconsistent=%d - xrank=%d\n", inconsistent_execute, *xrank);
+		*do_execute = *xrank == STARPU_MPI_PER_NODE || (me == *xrank);
+	}
+	_STARPU_MPI_DEBUG(100, "do_execute=%d\n", *do_execute);
+
+	*descrs_p = descrs;
+	*nb_data_p = nb_data;
+	*prio_p = prio;
+
+	_STARPU_TRACE_TASK_MPI_DECODE_END();
+	return 0;
+}
+
+static
+int _fstarpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, struct starpu_task **task, int *xrank_p, struct starpu_data_descr **descrs_p, int *nb_data_p, int *prio_p, void **arglist)
+{
+	int me, do_execute, xrank, nb_nodes;
+	int ret;
+	int i;
+	struct starpu_data_descr *descrs;
+	int nb_data;
+	int prio;
+
+	_STARPU_MPI_LOG_IN();
+
+	starpu_mpi_comm_rank(comm, &me);
+	starpu_mpi_comm_size(comm, &nb_nodes);
+
+	/* Find out whether we are to execute the data because we own the data to be written to. */
+	ret = _fstarpu_mpi_task_decode_v(codelet, me, nb_nodes, &xrank, &do_execute, &descrs, &nb_data, &prio, arglist);
+	if (ret < 0)
+		return ret;
+
+	_STARPU_TRACE_TASK_MPI_PRE_START();
+	/* Send and receive data as requested */
+	for(i=0 ; i<nb_data ; i++)
+	{
+		_starpu_mpi_exchange_data_before_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, prio, comm);
+	}
+
+	if (xrank_p)
+		*xrank_p = xrank;
+	if (nb_data_p)
+		*nb_data_p = nb_data;
+	if (prio_p)
+		*prio_p = prio;
+
+	if (descrs_p)
+		*descrs_p = descrs;
+	else
+		free(descrs);
+	_STARPU_TRACE_TASK_MPI_PRE_END();
+
+	if (do_execute == 0)
+	{
+		return 1;
+	}
+	else
+	{
+		_STARPU_MPI_DEBUG(100, "Execution of the codelet %p (%s)\n", codelet, codelet?codelet->name:NULL);
+
+		*task = starpu_task_create();
+		(*task)->cl_arg_free = 1;
+
+		_fstarpu_task_insert_create(codelet, task, arglist);
+		return 0;
+	}
+}
+
+static
+int _fstarpu_mpi_task_insert_v(MPI_Comm comm, struct starpu_codelet *codelet, void **arglist)
+{
+	struct starpu_task *task;
+	int ret;
+	int xrank;
+	int do_execute = 0;
+	struct starpu_data_descr *descrs;
+	int nb_data;
+	int prio;
+
+	ret = _fstarpu_mpi_task_build_v(comm, codelet, &task, &xrank, &descrs, &nb_data, &prio, arglist);
+	if (ret < 0)
+		return ret;
+
+	if (ret == 0)
+	{
+		do_execute = 1;
+		ret = starpu_task_submit(task);
+
+		if (STARPU_UNLIKELY(ret == -ENODEV))
+		{
+			_STARPU_MSG("submission of task %p wih codelet %p failed (symbol `%s') (err: ENODEV)\n",
+				    task, task->cl,
+				    (codelet == NULL) ? "none" :
+				    task->cl->name ? task->cl->name :
+				    (task->cl->model && task->cl->model->symbol)?task->cl->model->symbol:"none");
+
+			task->destroy = 0;
+			starpu_task_destroy(task);
+		}
+	}
+	return _starpu_mpi_task_postbuild_v(comm, xrank, do_execute, descrs, nb_data, prio);
+}
+
+int fstarpu_mpi_task_insert(MPI_Fint comm, void ***_arglist)
+{
+	void **arglist = *_arglist;
+	struct starpu_codelet *codelet = arglist[0];
+	if (codelet == NULL)
+	{
+		STARPU_ABORT_MSG("task without codelet");
+	}
+	int ret;
+
+	ret = _fstarpu_mpi_task_insert_v(MPI_Comm_f2c(comm), codelet, arglist+1);
+	return ret;
+}
+
+/* fstarpu_mpi_insert_task: aliased to fstarpu_mpi_task_insert in fstarpu_mpi_mod.f90 */
+
+struct starpu_task *fstarpu_mpi_task_build(MPI_Fint comm, void ***_arglist)
+{
+	void **arglist = *_arglist;
+	struct starpu_codelet *codelet = arglist[0];
+	if (codelet == NULL)
+	{
+		STARPU_ABORT_MSG("task without codelet");
+	}
+	struct starpu_task *task;
+	int ret;
+
+	ret = _fstarpu_mpi_task_build_v(MPI_Comm_f2c(comm), codelet, &task, NULL, NULL, NULL, NULL, arglist+1);
+	STARPU_ASSERT(ret >= 0);
+	return (ret > 0) ? NULL : task;
+}
+
+int fstarpu_mpi_task_post_build(MPI_Fint _comm, void ***_arglist)
+{
+	void **arglist = *_arglist;
+	struct starpu_codelet *codelet = arglist[0];
+	if (codelet == NULL)
+	{
+		STARPU_ABORT_MSG("task without codelet");
+	}
+	MPI_Comm comm = MPI_Comm_f2c(_comm);
+	int xrank, do_execute;
+	int ret, me, nb_nodes;
+	struct starpu_data_descr *descrs;
+	int nb_data;
+	int prio;
+
+	starpu_mpi_comm_rank(comm, &me);
+	starpu_mpi_comm_size(comm, &nb_nodes);
+
+	/* Find out whether we are to execute the data because we own the data to be written to. */
+	ret = _fstarpu_mpi_task_decode_v(codelet, me, nb_nodes, &xrank, &do_execute, &descrs, &nb_data, &prio, arglist);
+	if (ret < 0)
+		return ret;
+
+	return _starpu_mpi_task_postbuild_v(comm, xrank, do_execute, descrs, nb_data, prio);
+}
+
+#endif /* HAVE_MPI_COMM_F2C */
+
+
+