Browse Source

Merge remote-tracking branch 'gitlab/master' into ft_checkpoint

# Conflicts:
#	src/debug/traces/starpu_fxt.c
Romain LION 5 years ago
parent
commit
769d5b98a8

+ 1 - 0
ChangeLog

@@ -52,6 +52,7 @@ StarPU 1.3.5 (git revision xxx)
 
 
 Small changes:
 Small changes:
   * Move MPI cache functions into the public API
   * Move MPI cache functions into the public API
+  * Add STARPU_MPI_NOBIND environment variable.
 
 
 StarPU 1.3.4 (git revision c37a5d024cd997596da41f765557c58099baf896)
 StarPU 1.3.4 (git revision c37a5d024cd997596da41f765557c58099baf896)
 ====================================================================
 ====================================================================

+ 13 - 2
doc/doxygen/chapters/470_simgrid.doxy

@@ -167,8 +167,12 @@ theory results), see the \ref STARPU_SIMGRID_TRANSFER_COST, \ref STARPU_SIMGRID_
 
 
 \section SimulationMPIApplications MPI Applications
 \section SimulationMPIApplications MPI Applications
 
 
-StarPU-MPI applications can also be run in SimGrid mode. It needs to be compiled
-with \c smpicc, and run using the <c>starpu_smpirun</c> script, for instance:
+StarPU-MPI applications can also be run in SimGrid mode. smpi currently requires
+that StarPU be built statically only, so <c>--disable-shared</c> needs to be
+passed to <c>./configure</c>.
+
+The application needs to be compiled with \c smpicc, and run using the
+<c>starpu_smpirun</c> script, for instance:
 
 
 \verbatim
 \verbatim
 $ STARPU_SCHED=dmda starpu_smpirun -platform cluster.xml -hostfile hostfile ./mpi/tests/pingpong
 $ STARPU_SCHED=dmda starpu_smpirun -platform cluster.xml -hostfile hostfile ./mpi/tests/pingpong
@@ -182,6 +186,13 @@ in case of a heterogeneous platform, it is possible to use the
 option <c>-hostfile-platform</c> in <c>starpu_smpirun</c>, that will define
 option <c>-hostfile-platform</c> in <c>starpu_smpirun</c>, that will define
 \ref STARPU_MPI_HOSTNAMES with the hostnames of your hostfile.
 \ref STARPU_MPI_HOSTNAMES with the hostnames of your hostfile.
 
 
+To use FxT traces, libfxt also needs to be built statically, <b>and</b>
+with dynamic linking flags, i.e. with
+
+\verbatim
+CFLAGS=-fPIC ./configure --enable-static
+\endverbatim
+
 \section SimulationDebuggingApplications Debugging Applications
 \section SimulationDebuggingApplications Debugging Applications
 
 
 By default, SimGrid uses its own implementation of threads, which prevents \c gdb
 By default, SimGrid uses its own implementation of threads, which prevents \c gdb

+ 8 - 0
doc/doxygen/chapters/501_environment_variables.doxy

@@ -255,6 +255,14 @@ it to -1 (the default value) will use a reserved CPU, subtracted from the CPU
 workers.
 workers.
 </dd>
 </dd>
 
 
+<dt>STARPU_MPI_NOBIND</dt>
+<dd>
+\anchor STARPU_MPI_NOBIND
+\addindex __env__STARPU_MPI_NOBIND
+Setting it to non-zero will prevent StarPU from binding the MPI thread to
+a separate core. This is for instance useful when running the testsuite on a single system.
+</dd>
+
 <dt>STARPU_WORKERS_CUDAID</dt>
 <dt>STARPU_WORKERS_CUDAID</dt>
 <dd>
 <dd>
 \anchor STARPU_WORKERS_CUDAID
 \anchor STARPU_WORKERS_CUDAID

+ 1 - 0
examples/api/block_data_interface.c

@@ -19,6 +19,7 @@
 
 
 #define starpu_interface_block_ops my_starpu_interface_block_ops
 #define starpu_interface_block_ops my_starpu_interface_block_ops
 #define starpu_block_data_register my_starpu_block_data_register
 #define starpu_block_data_register my_starpu_block_data_register
+#define starpu_block_ptr_register my_starpu_block_ptr_register
 #define starpu_block_get_nx my_starpu_block_get_nx
 #define starpu_block_get_nx my_starpu_block_get_nx
 #define starpu_block_get_ny my_starpu_block_get_ny
 #define starpu_block_get_ny my_starpu_block_get_ny
 #define starpu_block_get_nz my_starpu_block_get_nz
 #define starpu_block_get_nz my_starpu_block_get_nz

+ 1 - 0
examples/api/csr_data_interface.c

@@ -22,6 +22,7 @@
 #define starpu_csr_get_nnz my_starpu_csr_get_nnz
 #define starpu_csr_get_nnz my_starpu_csr_get_nnz
 #define starpu_csr_get_nrow my_starpu_csr_get_nrow
 #define starpu_csr_get_nrow my_starpu_csr_get_nrow
 #define starpu_csr_get_firstentry my_starpu_csr_get_firstentry
 #define starpu_csr_get_firstentry my_starpu_csr_get_firstentry
+#define starpu_csr_get_elemsize my_starpu_csr_get_elemsize
 #define starpu_csr_get_local_nzval my_starpu_csr_get_local_nzval
 #define starpu_csr_get_local_nzval my_starpu_csr_get_local_nzval
 #define starpu_csr_get_local_colind my_starpu_csr_get_local_colind
 #define starpu_csr_get_local_colind my_starpu_csr_get_local_colind
 #define starpu_csr_get_local_rowptr my_starpu_csr_get_local_rowptr
 #define starpu_csr_get_local_rowptr my_starpu_csr_get_local_rowptr

+ 10 - 16
include/starpu_fxt.h

@@ -69,42 +69,36 @@ struct starpu_fxt_options
 	char *number_events_path;
 	char *number_events_path;
 	char *anim_path;
 	char *anim_path;
 	char *states_path;
 	char *states_path;
+	char worker_names[STARPU_NMAXWORKERS][256];
+	int nworkers;
+	struct starpu_perfmodel_arch worker_archtypes[STARPU_NMAXWORKERS];
 
 
 	/**
 	/**
 	   In case we are going to gather multiple traces (e.g in the case of
 	   In case we are going to gather multiple traces (e.g in the case of
 	   MPI processes), we may need to prefix the name of the containers.
 	   MPI processes), we may need to prefix the name of the containers.
 	*/
 	*/
 	char *file_prefix;
 	char *file_prefix;
+
 	/**
 	/**
 	   In case we are going to gather multiple traces (e.g in the case of
 	   In case we are going to gather multiple traces (e.g in the case of
-	   MPI processes), we may need to prefix the name of the containers.
+	   MPI processes), this variable stores the time offset relative to rank 0.
 	*/
 	*/
 	uint64_t file_offset;
 	uint64_t file_offset;
+
 	/**
 	/**
 	   In case we are going to gather multiple traces (e.g in the case of
 	   In case we are going to gather multiple traces (e.g in the case of
-	   MPI processes), we may need to prefix the name of the containers.
+	   MPI processes), this variable stores the MPI rank of the trace file.
 	*/
 	*/
 	int file_rank;
 	int file_rank;
 
 
 	/**
 	/**
-	   Output parameters
-	*/
-	char worker_names[STARPU_NMAXWORKERS][256];
-	/**
-	   Output parameters
-	*/
-	struct starpu_perfmodel_arch worker_archtypes[STARPU_NMAXWORKERS];
-	/**
-	   Output parameters
-	*/
-	int nworkers;
-
-	/**
 	   In case we want to dump the list of codelets to an external tool
 	   In case we want to dump the list of codelets to an external tool
 	*/
 	*/
 	struct starpu_fxt_codelet_event **dumped_codelets;
 	struct starpu_fxt_codelet_event **dumped_codelets;
+
 	/**
 	/**
-	   In case we want to dump the list of codelets to an external tool
+	   In case we want to dump the list of codelets to an external tool, number
+	   of dumped codelets.
 	*/
 	*/
 	long dumped_codelets_count;
 	long dumped_codelets_count;
 };
 };

+ 2 - 2
mpi/examples/Makefile.am

@@ -47,10 +47,10 @@ endif
 endif
 endif
 
 
 if STARPU_HAVE_AM111
 if STARPU_HAVE_AM111
-TESTS_ENVIRONMENT	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
+TESTS_ENVIRONMENT	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=3 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
 LOG_COMPILER	 	=	$(MPI) $(LOADER_BIN)
 LOG_COMPILER	 	=	$(MPI) $(LOADER_BIN)
 else
 else
-TESTS_ENVIRONMENT 	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
+TESTS_ENVIRONMENT 	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=3 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
 endif
 endif
 
 
 if STARPU_MPI_CHECK
 if STARPU_MPI_CHECK

+ 2 - 0
mpi/examples/benchs/abstract_sendrecv_bench.c

@@ -25,6 +25,7 @@ void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
 
 
 	if (mpi_rank >= 2)
 	if (mpi_rank >= 2)
 	{
 	{
+		starpu_pause();
 		if (thread_barrier != NULL)
 		if (thread_barrier != NULL)
 		{
 		{
 			STARPU_PTHREAD_BARRIER_WAIT(thread_barrier);
 			STARPU_PTHREAD_BARRIER_WAIT(thread_barrier);
@@ -41,6 +42,7 @@ void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
 				starpu_mpi_barrier(MPI_COMM_WORLD);
 				starpu_mpi_barrier(MPI_COMM_WORLD);
 			}
 			}
 		}
 		}
+		starpu_resume();
 
 
 		return;
 		return;
 	}
 	}

+ 12 - 3
mpi/examples/mpi_lu/plu_example.c

@@ -37,8 +37,8 @@
 static unsigned long size = 4096;
 static unsigned long size = 4096;
 static unsigned nblocks = 16;
 static unsigned nblocks = 16;
 static unsigned check = 0;
 static unsigned check = 0;
-static int p = 1;
-static int q = 1;
+static int p = -1;
+static int q = -1;
 static unsigned display = 0;
 static unsigned display = 0;
 static unsigned no_prio = 0;
 static unsigned no_prio = 0;
 
 
@@ -463,7 +463,14 @@ int main(int argc, char **argv)
 	/* We disable sequential consistency in this example */
 	/* We disable sequential consistency in this example */
 	starpu_data_set_default_sequential_consistency_flag(0);
 	starpu_data_set_default_sequential_consistency_flag(0);
 
 
-	STARPU_ASSERT(p*q == world_size);
+	if (p == -1 && q==-1)
+	{
+		fprintf(stderr, "Setting default values for p and q\n");
+		p = (q % 2 == 0) ? 2 : 1;
+		q = world_size / p;
+
+	}
+	STARPU_ASSERT_MSG(p*q == world_size, "p=%d, q=%d, world_size=%d\n", p, q, world_size);
 
 
 	starpu_cublas_init();
 	starpu_cublas_init();
 
 
@@ -605,6 +612,8 @@ int main(int argc, char **argv)
 				starpu_free(blockptr);
 				starpu_free(blockptr);
 		}
 		}
 	}
 	}
+	free(dataA_handles);
+	free(dataA);
 
 
 	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
 	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
 	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
 	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);

+ 2 - 0
mpi/examples/mpi_lu/plu_implicit_example.c

@@ -387,6 +387,8 @@ int main(int argc, char **argv)
 				starpu_free(blockptr);
 				starpu_free(blockptr);
 		}
 		}
 	}
 	}
+	free(dataA_handles);
+	free(dataA);
 
 
 	starpu_cublas_shutdown();
 	starpu_cublas_shutdown();
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();

+ 26 - 11
mpi/examples/mpi_lu/plu_outofcore_example.c

@@ -39,9 +39,10 @@
 
 
 static unsigned long size = 4096;
 static unsigned long size = 4096;
 static unsigned nblocks = 16;
 static unsigned nblocks = 16;
+static size_t blocksize;
 static unsigned check = 0;
 static unsigned check = 0;
-static int p = 1;
-static int q = 1;
+static int p = -1;
+static int q = -1;
 static unsigned display = 0;
 static unsigned display = 0;
 static unsigned no_prio = 0;
 static unsigned no_prio = 0;
 static char *path = "./starpu-ooc-files";
 static char *path = "./starpu-ooc-files";
@@ -53,6 +54,9 @@ static unsigned numa = 0;
 static size_t allocated_memory = 0;
 static size_t allocated_memory = 0;
 
 
 static starpu_data_handle_t *dataA_handles;
 static starpu_data_handle_t *dataA_handles;
+static void **disk_objs;
+
+static int disk_node;
 
 
 int get_block_rank(unsigned i, unsigned j);
 int get_block_rank(unsigned i, unsigned j);
 
 
@@ -142,7 +146,6 @@ static void fill_block_with_random(TYPE *blockptr, unsigned psize, unsigned pnbl
 
 
 static void create_matrix()
 static void create_matrix()
 {
 {
-	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
 	TYPE *blockptr = malloc(blocksize);
 	TYPE *blockptr = malloc(blocksize);
 	int fd;
 	int fd;
 	char *filename;
 	char *filename;
@@ -195,10 +198,9 @@ static void init_matrix(int rank)
 {
 {
 	/* Allocate a grid of data handles, not all of them have to be allocated later on */
 	/* Allocate a grid of data handles, not all of them have to be allocated later on */
 	dataA_handles = calloc(nblocks*nblocks, sizeof(starpu_data_handle_t));
 	dataA_handles = calloc(nblocks*nblocks, sizeof(starpu_data_handle_t));
+	disk_objs = calloc(nblocks*nblocks, sizeof(*disk_objs));
 
 
-	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
-
-	int disk_node = starpu_disk_register(&starpu_disk_unistd_ops, path, STARPU_MAX(1024*1024, size*size*sizeof(TYPE)));
+	disk_node = starpu_disk_register(&starpu_disk_unistd_ops, path, STARPU_MAX(16*1024*1024, size*size*sizeof(TYPE)));
 	assert(disk_node >= 0);
 	assert(disk_node >= 0);
 
 
 	char filename[sizeof(nblocks)*3 + 1 + sizeof(nblocks)*3 + 1];
 	char filename[sizeof(nblocks)*3 + 1 + sizeof(nblocks)*3 + 1];
@@ -215,21 +217,21 @@ static void init_matrix(int rank)
 
 
 			if (block_rank == rank)
 			if (block_rank == rank)
 			{
 			{
-				void *disk_obj;
 				snprintf(filename, sizeof(filename), "%u,%u", i, j);
 				snprintf(filename, sizeof(filename), "%u,%u", i, j);
 				/* Register it to StarPU */
 				/* Register it to StarPU */
-				disk_obj = starpu_disk_open(disk_node, filename, blocksize);
-				if (!disk_obj)
+				disk_objs[j+nblocks*i] = starpu_disk_open(disk_node, filename, blocksize);
+				if (!disk_objs[j+nblocks*i])
 				{
 				{
 					fprintf(stderr,"could not open %s\n", filename);
 					fprintf(stderr,"could not open %s\n", filename);
 					exit(1);
 					exit(1);
 				}
 				}
 				starpu_matrix_data_register(handleptr, disk_node,
 				starpu_matrix_data_register(handleptr, disk_node,
-					(uintptr_t) disk_obj, size/nblocks,
+					(uintptr_t) disk_objs[j+nblocks*i], size/nblocks,
 					size/nblocks, size/nblocks, sizeof(TYPE));
 					size/nblocks, size/nblocks, sizeof(TYPE));
 			}
 			}
 			else
 			else
 			{
 			{
+				disk_objs[j+nblocks*i] = NULL;
 				starpu_matrix_data_register(handleptr, -1,
 				starpu_matrix_data_register(handleptr, -1,
 					0, size/nblocks,
 					0, size/nblocks,
 					size/nblocks, size/nblocks, sizeof(TYPE));
 					size/nblocks, size/nblocks, sizeof(TYPE));
@@ -273,6 +275,8 @@ int main(int argc, char **argv)
 
 
 	parse_args(argc, argv);
 	parse_args(argc, argv);
 
 
+	blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
+
 	ret = mkdir(path, 0777);
 	ret = mkdir(path, 0777);
 	if (ret != 0 && errno != EEXIST)
 	if (ret != 0 && errno != EEXIST)
 	{
 	{
@@ -286,7 +290,14 @@ int main(int argc, char **argv)
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
 	starpu_mpi_comm_size(MPI_COMM_WORLD, &world_size);
 	starpu_mpi_comm_size(MPI_COMM_WORLD, &world_size);
 
 
-	STARPU_ASSERT(p*q == world_size);
+	if (p == -1 && q==-1)
+	{
+		fprintf(stderr, "Setting default values for p and q\n");
+		p = (q % 2 == 0) ? 2 : 1;
+		q = world_size / p;
+
+	}
+	STARPU_ASSERT_MSG(p*q == world_size, "p=%d, q=%d, world_size=%d\n", p, q, world_size);
 
 
 	starpu_cublas_init();
 	starpu_cublas_init();
 
 
@@ -401,8 +412,12 @@ int main(int argc, char **argv)
 		for (i = 0; i < nblocks; i++)
 		for (i = 0; i < nblocks; i++)
 		{
 		{
 			starpu_data_unregister(dataA_handles[j+nblocks*i]);
 			starpu_data_unregister(dataA_handles[j+nblocks*i]);
+			if (disk_objs[j+nblocks*i])
+				starpu_disk_close(disk_node, disk_objs[j+nblocks*i], blocksize);
 		}
 		}
 	}
 	}
+	free(dataA_handles);
+	free(disk_objs);
 
 
 	starpu_cublas_shutdown();
 	starpu_cublas_shutdown();
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();

+ 16 - 10
mpi/src/mpi/starpu_mpi_mpi.c

@@ -206,7 +206,7 @@ void _starpu_mpi_submit_ready_request(void *arg)
 			else
 			else
 			{
 			{
 				STARPU_ASSERT(req->count);
 				STARPU_ASSERT(req->count);
-				_STARPU_MPI_MALLOC(req->ptr, req->count);
+				req->ptr = (void *)starpu_malloc_on_node_flags(STARPU_MAIN_RAM, req->count, 0);
 			}
 			}
 
 
 			_STARPU_MPI_DEBUG(3, "Pushing internal starpu_mpi_irecv request %p type %s tag %"PRIi64" src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
 			_STARPU_MPI_DEBUG(3, "Pushing internal starpu_mpi_irecv request %p type %s tag %"PRIi64" src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
@@ -228,12 +228,12 @@ void _starpu_mpi_submit_ready_request(void *arg)
 			/* test whether some data with the given tag and source have already been received by StarPU-MPI*/
 			/* test whether some data with the given tag and source have already been received by StarPU-MPI*/
 			struct _starpu_mpi_early_data_handle *early_data_handle = _starpu_mpi_early_data_find(&req->node_tag);
 			struct _starpu_mpi_early_data_handle *early_data_handle = _starpu_mpi_early_data_find(&req->node_tag);
 
 
-			/* Case: a receive request for a data with the given tag and source has already been
-			 * posted by StarPU. Asynchronously requests a Read permission over the temporary handle ,
-			 * so as when the internal receive is completed, the _starpu_mpi_early_data_cb function
-			 * will be called to bring the data back to the original data handle associated to the request.*/
 			if (early_data_handle)
 			if (early_data_handle)
 			{
 			{
+				/* Case: a receive request for a data with the given tag and source has already been
+				 * posted to MPI by StarPU. Asynchronously requests a Read permission over the temporary handle,
+				 * so that when the internal receive is completed, the _starpu_mpi_early_data_cb function
+				 * will be called to bring the data back to the original data handle associated to the request.*/
 				STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
 				STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
 				STARPU_PTHREAD_MUTEX_LOCK(&(early_data_handle->req_mutex));
 				STARPU_PTHREAD_MUTEX_LOCK(&(early_data_handle->req_mutex));
 				while (!(early_data_handle->req_ready))
 				while (!(early_data_handle->req_ready))
@@ -260,13 +260,13 @@ void _starpu_mpi_submit_ready_request(void *arg)
 				starpu_data_acquire_on_node_cb(early_data_handle->handle,STARPU_MAIN_RAM,STARPU_R,_starpu_mpi_early_data_cb,(void*) cb_args);
 				starpu_data_acquire_on_node_cb(early_data_handle->handle,STARPU_MAIN_RAM,STARPU_R,_starpu_mpi_early_data_cb,(void*) cb_args);
 				STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 				STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 			}
 			}
-			/* Case: no matching data has been received. Store the receive request as an early_request. */
 			else
 			else
 			{
 			{
 				struct _starpu_mpi_req *sync_req = _starpu_mpi_sync_data_find(req->node_tag.data_tag, req->node_tag.node.rank, req->node_tag.node.comm);
 				struct _starpu_mpi_req *sync_req = _starpu_mpi_sync_data_find(req->node_tag.data_tag, req->node_tag.node.rank, req->node_tag.node.comm);
 				_STARPU_MPI_DEBUG(3, "----------> Looking for sync data for tag %"PRIi64" and src %d = %p\n", req->node_tag.data_tag, req->node_tag.node.rank, sync_req);
 				_STARPU_MPI_DEBUG(3, "----------> Looking for sync data for tag %"PRIi64" and src %d = %p\n", req->node_tag.data_tag, req->node_tag.node.rank, sync_req);
 				if (sync_req)
 				if (sync_req)
 				{
 				{
+					/* Case: we already received the send envelope, we can proceed with the receive */
 					req->sync = 1;
 					req->sync = 1;
 					_starpu_mpi_datatype_allocate(req->data_handle, req);
 					_starpu_mpi_datatype_allocate(req->data_handle, req);
 					if (req->registered_datatype == 1)
 					if (req->registered_datatype == 1)
@@ -278,14 +278,16 @@ void _starpu_mpi_submit_ready_request(void *arg)
 					{
 					{
 						req->count = sync_req->count;
 						req->count = sync_req->count;
 						STARPU_ASSERT(req->count);
 						STARPU_ASSERT(req->count);
-						_STARPU_MPI_MALLOC(req->ptr, req->count);
+						req->ptr = (void *)starpu_malloc_on_node_flags(STARPU_MAIN_RAM, req->count, 0);
 					}
 					}
 					_starpu_mpi_req_list_push_front(&ready_recv_requests, req);
 					_starpu_mpi_req_list_push_front(&ready_recv_requests, req);
 					_STARPU_MPI_INC_READY_REQUESTS(+1);
 					_STARPU_MPI_INC_READY_REQUESTS(+1);
+					/* Throw away the dumb request that was only used to know that we got the envelope */
 					_starpu_mpi_request_destroy(sync_req);
 					_starpu_mpi_request_destroy(sync_req);
 				}
 				}
 				else
 				else
 				{
 				{
+					/* Case: no matching data has been received. Store the receive request as an early_request. */
 					_STARPU_MPI_DEBUG(3, "Adding the pending receive request %p (srcdst %d tag %"PRIi64") into the request hashmap\n", req, req->node_tag.node.rank, req->node_tag.data_tag);
 					_STARPU_MPI_DEBUG(3, "Adding the pending receive request %p (srcdst %d tag %"PRIi64") into the request hashmap\n", req, req->node_tag.node.rank, req->node_tag.data_tag);
 					_starpu_mpi_early_request_enqueue(req);
 					_starpu_mpi_early_request_enqueue(req);
 				}
 				}
@@ -687,6 +689,8 @@ int _starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 
 
 	STARPU_MPI_ASSERT_MSG(!req->detached, "MPI_Test cannot be called on a detached request");
 	STARPU_MPI_ASSERT_MSG(!req->detached, "MPI_Test cannot be called on a detached request");
 
 
+	STARPU_VALGRIND_YIELD();
+
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
 	ret = req->ret = _starpu_mpi_simgrid_mpi_test(&req->done, flag);
 	ret = req->ret = _starpu_mpi_simgrid_mpi_test(&req->done, flag);
 	if (*flag)
 	if (*flag)
@@ -911,6 +915,8 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 	_STARPU_MPI_LOG_OUT();
 	_STARPU_MPI_LOG_OUT();
 }
 }
 
 
+/* This is called when the data is now received in the early data handle, we can
+ * now copy it over to the real handle. */
 static void _starpu_mpi_early_data_cb(void* arg)
 static void _starpu_mpi_early_data_cb(void* arg)
 {
 {
 	struct _starpu_mpi_early_data_cb_args *args = arg;
 	struct _starpu_mpi_early_data_cb_args *args = arg;
@@ -1205,14 +1211,14 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 		_starpu_mpi_thread_cpuid = starpu_get_next_bindid(STARPU_THREAD_ACTIVE, NULL, 0);
 		_starpu_mpi_thread_cpuid = starpu_get_next_bindid(STARPU_THREAD_ACTIVE, NULL, 0);
 	}
 	}
 
 
-	if (starpu_bind_thread_on(_starpu_mpi_thread_cpuid, STARPU_THREAD_ACTIVE, "MPI") < 0)
+	if (!_starpu_mpi_nobind && starpu_bind_thread_on(_starpu_mpi_thread_cpuid, STARPU_THREAD_ACTIVE, "MPI") < 0)
 	{
 	{
 		char hostname[65];
 		char hostname[65];
 		gethostname(hostname, sizeof(hostname));
 		gethostname(hostname, sizeof(hostname));
 		_STARPU_DISP("[%s] No core was available for the MPI thread. You should use STARPU_RESERVE_NCPU to leave one core available for MPI, or specify one core less in STARPU_NCPU\n", hostname);
 		_STARPU_DISP("[%s] No core was available for the MPI thread. You should use STARPU_RESERVE_NCPU to leave one core available for MPI, or specify one core less in STARPU_NCPU\n", hostname);
 	}
 	}
 	_starpu_mpi_do_initialize(argc_argv);
 	_starpu_mpi_do_initialize(argc_argv);
-	if (_starpu_mpi_thread_cpuid >= 0)
+	if (!_starpu_mpi_nobind && _starpu_mpi_thread_cpuid >= 0)
 		/* In case MPI changed the binding */
 		/* In case MPI changed the binding */
 		starpu_bind_thread_on(_starpu_mpi_thread_cpuid, STARPU_THREAD_ACTIVE, "MPI");
 		starpu_bind_thread_on(_starpu_mpi_thread_cpuid, STARPU_THREAD_ACTIVE, "MPI");
 #else
 #else
@@ -1456,7 +1462,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 						else
 						else
 						{
 						{
 							early_request->count = envelope->size;
 							early_request->count = envelope->size;
-							_STARPU_MPI_MALLOC(early_request->ptr, early_request->count);
+							early_request->ptr = (void *)starpu_malloc_on_node_flags(STARPU_MAIN_RAM, early_request->count, 0);
 							starpu_memory_allocate(STARPU_MAIN_RAM, early_request->count, STARPU_MEMORY_OVERFLOW);
 							starpu_memory_allocate(STARPU_MAIN_RAM, early_request->count, STARPU_MEMORY_OVERFLOW);
 
 
 							STARPU_MPI_ASSERT_MSG(early_request->ptr, "cannot allocate message of size %ld\n", early_request->count);
 							STARPU_MPI_ASSERT_MSG(early_request->ptr, "cannot allocate message of size %ld\n", early_request->count);

+ 7 - 4
mpi/src/nmad/starpu_mpi_nmad.c

@@ -245,6 +245,8 @@ int _starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 	_STARPU_MPI_DEBUG(2, "Test request %p type %s tag %ld src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
 	_STARPU_MPI_DEBUG(2, "Test request %p type %s tag %ld src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
 			  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.node.rank, req->data_handle, req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);
 			  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.node.rank, req->data_handle, req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);
 
 
+	STARPU_VALGRIND_YIELD();
+
 	_STARPU_MPI_TRACE_UTESTING_BEGIN(req->node_tag.node.rank, req->node_tag.data_tag);
 	_STARPU_MPI_TRACE_UTESTING_BEGIN(req->node_tag.node.rank, req->node_tag.data_tag);
 
 
 	/* we must do a test_locked to avoid race condition :
 	/* we must do a test_locked to avoid race condition :
@@ -344,7 +346,7 @@ void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req,nm_sr_ev
 				// req->ptr is freed by starpu_data_unpack
 				// req->ptr is freed by starpu_data_unpack
 				starpu_data_unpack(req->data_handle, req->ptr, req->count);
 				starpu_data_unpack(req->data_handle, req->ptr, req->count);
 			else
 			else
-				free(req->ptr);
+				starpu_free_on_node_flags(STARPU_MAIN_RAM, (uintptr_t) req->ptr, req->count, 0);
 		}
 		}
 		else
 		else
 		{
 		{
@@ -451,7 +453,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	struct _starpu_mpi_argc_argv *argc_argv = (struct _starpu_mpi_argc_argv *) arg;
 	struct _starpu_mpi_argc_argv *argc_argv = (struct _starpu_mpi_argc_argv *) arg;
 
 
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
-	if (starpu_bind_thread_on(_starpu_mpi_thread_cpuid, 0, "MPI") < 0)
+	if (!_starpu_mpi_nobind && starpu_bind_thread_on(_starpu_mpi_thread_cpuid, 0, "MPI") < 0)
 	{
 	{
 		char hostname[65];
 		char hostname[65];
 		gethostname(hostname, sizeof(hostname));
 		gethostname(hostname, sizeof(hostname));
@@ -623,7 +625,7 @@ int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
 	 * required for piom_ltask_set_bound_thread_indexes() */
 	 * required for piom_ltask_set_bound_thread_indexes() */
 	_starpu_mpi_do_initialize(argc_argv);
 	_starpu_mpi_do_initialize(argc_argv);
 
 
-	if (_starpu_mpi_thread_cpuid < 0)
+	if (!_starpu_mpi_nobind && _starpu_mpi_thread_cpuid < 0)
 	{
 	{
 		_starpu_mpi_thread_cpuid = starpu_get_next_bindid(STARPU_THREAD_ACTIVE, NULL, 0);
 		_starpu_mpi_thread_cpuid = starpu_get_next_bindid(STARPU_THREAD_ACTIVE, NULL, 0);
 	}
 	}
@@ -633,7 +635,8 @@ int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
 	/* Tell pioman to use a bound thread for communication progression:
 	/* Tell pioman to use a bound thread for communication progression:
 	 * share the same core as StarPU's MPI thread, the MPI thread has very low activity with NMAD backend */
 	 * share the same core as StarPU's MPI thread, the MPI thread has very low activity with NMAD backend */
 	int indexes[1] = { _starpu_mpi_thread_cpuid };
 	int indexes[1] = { _starpu_mpi_thread_cpuid };
-	piom_ltask_set_bound_thread_indexes(HWLOC_OBJ_PU, indexes, 1);
+	if (!_starpu_mpi_nobind)
+		piom_ltask_set_bound_thread_indexes(HWLOC_OBJ_PU, indexes, 1);
 
 
 	/* Register some hooks for communication progress if needed */
 	/* Register some hooks for communication progress if needed */
 	int polling_point_prog, polling_point_idle;
 	int polling_point_prog, polling_point_idle;

+ 1 - 1
mpi/src/nmad/starpu_mpi_nmad_unknown_datatype.c

@@ -130,7 +130,7 @@ static void _starpu_mpi_unknown_datatype_recv_callback(nm_sr_event_t event, cons
 		int ret = nm_sr_recv_peek(req->backend->session, &(req->backend->data_request), &(req->backend->unknown_datatype_size));
 		int ret = nm_sr_recv_peek(req->backend->session, &(req->backend->data_request), &(req->backend->unknown_datatype_size));
 		STARPU_ASSERT_MSG(ret == NM_ESUCCESS, "nm_sr_recv_peek returned %d", ret);
 		STARPU_ASSERT_MSG(ret == NM_ESUCCESS, "nm_sr_recv_peek returned %d", ret);
 
 
-		req->ptr = malloc(req->count);
+		req->ptr = (void *)starpu_malloc_on_node_flags(STARPU_MAIN_RAM, req->count, 0);
 		STARPU_ASSERT_MSG(req->ptr, "cannot allocate message of size %ld", req->count);
 		STARPU_ASSERT_MSG(req->ptr, "cannot allocate message of size %ld", req->count);
 
 
 		nm_mpi_nmad_data_get(&(req->backend->unknown_datatype_body), (void*) req->ptr, req->datatype, req->count);
 		nm_mpi_nmad_data_get(&(req->backend->unknown_datatype_body), (void*) req->ptr, req->datatype, req->count);

+ 2 - 0
mpi/src/starpu_mpi_private.c

@@ -22,6 +22,7 @@ int _starpu_debug_level_max=0;
 int _starpu_mpi_tag = 42;
 int _starpu_mpi_tag = 42;
 int _starpu_mpi_comm_debug;
 int _starpu_mpi_comm_debug;
 
 
+int _starpu_mpi_nobind = -1;
 int _starpu_mpi_thread_cpuid = -1;
 int _starpu_mpi_thread_cpuid = -1;
 int _starpu_mpi_use_prio = 1;
 int _starpu_mpi_use_prio = 1;
 int _starpu_mpi_fake_world_size = -1;
 int _starpu_mpi_fake_world_size = -1;
@@ -62,6 +63,7 @@ void _starpu_mpi_env_init(void)
         _starpu_mpi_comm_debug = starpu_getenv("STARPU_MPI_COMM") != NULL;
         _starpu_mpi_comm_debug = starpu_getenv("STARPU_MPI_COMM") != NULL;
 	_starpu_mpi_fake_world_size = starpu_get_env_number("STARPU_MPI_FAKE_SIZE");
 	_starpu_mpi_fake_world_size = starpu_get_env_number("STARPU_MPI_FAKE_SIZE");
 	_starpu_mpi_fake_world_rank = starpu_get_env_number("STARPU_MPI_FAKE_RANK");
 	_starpu_mpi_fake_world_rank = starpu_get_env_number("STARPU_MPI_FAKE_RANK");
+	_starpu_mpi_nobind = starpu_get_env_number_default("STARPU_MPI_NOBIND", 0);
 	_starpu_mpi_thread_cpuid = starpu_get_env_number_default("STARPU_MPI_THREAD_CPUID", -1);
 	_starpu_mpi_thread_cpuid = starpu_get_env_number_default("STARPU_MPI_THREAD_CPUID", -1);
 	_starpu_mpi_use_prio = starpu_get_env_number_default("STARPU_MPI_PRIORITIES", 1);
 	_starpu_mpi_use_prio = starpu_get_env_number_default("STARPU_MPI_PRIORITIES", 1);
 	_starpu_mpi_use_coop_sends = starpu_get_env_number_default("STARPU_MPI_COOP_SENDS", 1);
 	_starpu_mpi_use_coop_sends = starpu_get_env_number_default("STARPU_MPI_COOP_SENDS", 1);

+ 1 - 0
mpi/src/starpu_mpi_private.h

@@ -61,6 +61,7 @@ void _starpu_mpi_set_debug_level_max(int level);
 extern int _starpu_mpi_fake_world_size;
 extern int _starpu_mpi_fake_world_size;
 extern int _starpu_mpi_fake_world_rank;
 extern int _starpu_mpi_fake_world_rank;
 extern int _starpu_mpi_use_prio;
 extern int _starpu_mpi_use_prio;
+extern int _starpu_mpi_nobind;
 extern int _starpu_mpi_thread_cpuid;
 extern int _starpu_mpi_thread_cpuid;
 extern int _starpu_mpi_use_coop_sends;
 extern int _starpu_mpi_use_coop_sends;
 extern int _starpu_mpi_mem_throttle;
 extern int _starpu_mpi_mem_throttle;

+ 2 - 2
mpi/tests/Makefile.am

@@ -45,10 +45,10 @@ endif
 endif
 endif
 
 
 if STARPU_HAVE_AM111
 if STARPU_HAVE_AM111
-TESTS_ENVIRONMENT	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
+TESTS_ENVIRONMENT	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=3 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
 LOG_COMPILER	 	=	$(MPI) $(LOADER_BIN)
 LOG_COMPILER	 	=	$(MPI) $(LOADER_BIN)
 else
 else
-TESTS_ENVIRONMENT 	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
+TESTS_ENVIRONMENT 	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=3 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
 endif
 endif
 
 
 if STARPU_MPI_CHECK
 if STARPU_MPI_CHECK

+ 18 - 2
mpi/tests/early_request.c

@@ -109,6 +109,18 @@ void submitted_order_fun(void *buffers[], void *cl_arg)
 	(void)cl_arg;
 	(void)cl_arg;
 }
 }
 
 
+static struct starpu_codelet submitted_order_rw =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {submitted_order_fun, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_RW},
+#ifdef STARPU_SIMGRID
+	.model = &starpu_perfmodel_nop,
+#endif
+	.name = "submitted_order_enforcer"
+};
+
 static struct starpu_codelet submitted_order =
 static struct starpu_codelet submitted_order =
 {
 {
 	.where = STARPU_CPU,
 	.where = STARPU_CPU,
@@ -156,11 +168,15 @@ void insert_work_for_one_element(struct element *el)
 			   STARPU_W,tmp_send,
 			   STARPU_W,tmp_send,
 			   0);
 			   0);
 	//Send operation
 	//Send operation
-	starpu_insert_task(&submitted_order,
+	starpu_insert_task(&submitted_order_rw,
 			   STARPU_RW,el->ensure_submitted_order_send,
 			   STARPU_RW,el->ensure_submitted_order_send,
-			   STARPU_W,tmp_send,
+			   STARPU_RW,tmp_send,
 			   0);
 			   0);
 	starpu_mpi_isend_detached(tmp_send,el->foreign_domain,el->tag, MPI_COMM_WORLD, NULL, NULL);
 	starpu_mpi_isend_detached(tmp_send,el->foreign_domain,el->tag, MPI_COMM_WORLD, NULL, NULL);
+	starpu_insert_task(&submitted_order_rw,
+			   STARPU_RW,el->ensure_submitted_order_send,
+			   STARPU_RW,tmp_send,
+			   0);
 
 
 	//Recv operation for current element
 	//Recv operation for current element
 	starpu_insert_task(&submitted_order,
 	starpu_insert_task(&submitted_order,

+ 2 - 0
src/common/utils.h

@@ -85,8 +85,10 @@
 #define _STARPU_UYIELD() ((void)0)
 #define _STARPU_UYIELD() ((void)0)
 #endif
 #endif
 #if defined(STARPU_HAVE_SCHED_YIELD) && defined(STARPU_HAVE_HELGRIND_H)
 #if defined(STARPU_HAVE_SCHED_YIELD) && defined(STARPU_HAVE_HELGRIND_H)
+#define STARPU_VALGRIND_YIELD() do { if (STARPU_RUNNING_ON_VALGRIND) sched_yield(); } while (0)
 #define STARPU_UYIELD() do { if (STARPU_RUNNING_ON_VALGRIND) sched_yield(); else _STARPU_UYIELD(); } while (0)
 #define STARPU_UYIELD() do { if (STARPU_RUNNING_ON_VALGRIND) sched_yield(); else _STARPU_UYIELD(); } while (0)
 #else
 #else
+#define STARPU_VALGRIND_YIELD() do { } while (0)
 #define STARPU_UYIELD() _STARPU_UYIELD()
 #define STARPU_UYIELD() _STARPU_UYIELD()
 #endif
 #endif
 
 

+ 3 - 0
src/datawizard/user_interactions.c

@@ -686,6 +686,9 @@ static void _starpu_data_wont_use(void *data)
 
 
 void starpu_data_wont_use(starpu_data_handle_t handle)
 void starpu_data_wont_use(starpu_data_handle_t handle)
 {
 {
+	if (!handle->initialized)
+		/* The handle is not initialized, so it holds no value at the moment */
+		return;
 	_STARPU_TRACE_DATA_WONT_USE(handle);
 	_STARPU_TRACE_DATA_WONT_USE(handle);
 	starpu_data_acquire_on_node_cb_sequential_consistency_quick(handle, STARPU_ACQUIRE_NO_NODE_LOCK_ALL, STARPU_R, _starpu_data_wont_use, handle, 1, 1);
 	starpu_data_acquire_on_node_cb_sequential_consistency_quick(handle, STARPU_ACQUIRE_NO_NODE_LOCK_ALL, STARPU_R, _starpu_data_wont_use, handle, 1, 1);
 }
 }

+ 7 - 5
src/debug/traces/starpu_fxt.c

@@ -2671,11 +2671,12 @@ static void handle_job_push(struct fxt_ev_64 *ev, struct starpu_fxt_options *opt
                char paje_value[STARPU_POTI_STR_LEN];
                char paje_value[STARPU_POTI_STR_LEN];
                snprintf(paje_value, sizeof(paje_value), "%u", task);
                snprintf(paje_value, sizeof(paje_value), "%u", task);
                snprintf(container, sizeof(container), "%sp", options->file_prefix);
                snprintf(container, sizeof(container), "%sp", options->file_prefix);
-               poti_NewEvent(get_event_time_stamp(ev, options), container, "pu", paje_value);
+		if (!options->no_events)
+			poti_NewEvent(get_event_time_stamp(ev, options), container, "pu", paje_value);
 #else
 #else
-	       fprintf(out_paje_file, "13	%.9f	%ssched	nready	%f\n", current_timestamp, options->file_prefix, (float)curq_size);
-	       if (!options->no_events)
-	       	    fprintf(out_paje_file, "9       %.9f    %s      %sp     %u\n", get_event_time_stamp(ev, options), "pu", options->file_prefix, task);
+		fprintf(out_paje_file, "13	%.9f	%ssched	nready	%f\n", current_timestamp, options->file_prefix, (float)curq_size);
+		if (!options->no_events)
+			fprintf(out_paje_file, "9       %.9f    %s      %sp     %u\n", get_event_time_stamp(ev, options), "pu", options->file_prefix, task);
 #endif
 #endif
 	}
 	}
 
 
@@ -2717,7 +2718,8 @@ static void handle_job_pop(struct fxt_ev_64 *ev, struct starpu_fxt_options *opti
 		char paje_value[STARPU_POTI_STR_LEN];
 		char paje_value[STARPU_POTI_STR_LEN];
 		snprintf(paje_value, sizeof(paje_value), "%u", task);
 		snprintf(paje_value, sizeof(paje_value), "%u", task);
 		snprintf(container, sizeof(container), "%sp", options->file_prefix);
 		snprintf(container, sizeof(container), "%sp", options->file_prefix);
-		poti_NewEvent(get_event_time_stamp(ev, options), container, "po", paje_value);
+		if (!options->no_events)
+			poti_NewEvent(get_event_time_stamp(ev, options), container, "po", paje_value);
 #else
 #else
 		fprintf(out_paje_file, "13	%.9f	%ssched	nready	%f\n", current_timestamp, options->file_prefix, (float)curq_size);
 		fprintf(out_paje_file, "13	%.9f	%ssched	nready	%f\n", current_timestamp, options->file_prefix, (float)curq_size);
 		fprintf(out_paje_file, "13	%.9f	%ssched	nsubmitted	%f\n", current_timestamp, options->file_prefix, (float)nsubmitted);
 		fprintf(out_paje_file, "13	%.9f	%ssched	nsubmitted	%f\n", current_timestamp, options->file_prefix, (float)nsubmitted);

+ 1 - 1
tests/datawizard/acquire_release_to.c

@@ -70,7 +70,7 @@ static struct starpu_codelet check_cl =
 {
 {
 	.modes = { STARPU_R },
 	.modes = { STARPU_R },
 	.cpu_funcs = {check_cpu},
 	.cpu_funcs = {check_cpu},
-	.cpu_funcs_name = {"increment_cpu"},
+	.cpu_funcs_name = {"check_cpu"},
 	.nbuffers = 1
 	.nbuffers = 1
 };
 };
 
 

+ 1 - 2
tools/dev/valgrind/libc.suppr

@@ -263,8 +263,7 @@
    Memcheck:Leak
    Memcheck:Leak
    match-leak-kinds: reachable
    match-leak-kinds: reachable
    fun:malloc
    fun:malloc
-   fun:_dl_close_worker
-   fun:_dl_close_worker
+   ...
    fun:_dl_close
    fun:_dl_close
    fun:_dl_catch_exception
    fun:_dl_catch_exception
    fun:_dl_catch_error
    fun:_dl_catch_error

+ 1 - 1
tools/dev/valgrind/padico.suppr

@@ -110,7 +110,7 @@
    Memcheck:Leak
    Memcheck:Leak
    match-leak-kinds: reachable
    match-leak-kinds: reachable
    fun:malloc
    fun:malloc
-   fun:_dl_close_worker
+   ...
    fun:_dl_close
    fun:_dl_close
    fun:_dl_catch_error
    fun:_dl_catch_error
    fun:dlerror_run
    fun:dlerror_run