|
@@ -68,6 +68,281 @@ static struct starpu_codelet cl22 =
|
|
|
.color = 0x00ff00,
|
|
|
};
|
|
|
|
|
|
/* Submit the distributed Cholesky task graph in the classical
 * right-looking order: for each panel k, factorize the diagonal block
 * (cl11), solve the sub-diagonal blocks of column k (cl21), then update
 * the trailing submatrix (cl22).
 *
 * data_handles: nblocks x nblocks array of registered tile handles.
 * rank, nodes:  this process's MPI rank and the node count, used with
 *               my_distrib() to decide whether a flushed tile is owned
 *               locally and can also be marked wont_use.
 */
static void run_cholesky(starpu_data_handle_t **data_handles, int rank, int nodes)
{
	unsigned k, m, n;
	/* When STARPU_MIN/MAX_PRIO span the whole int range the scheduler
	 * accepts unbounded priorities, so each task gets a computed
	 * priority; otherwise only MAX/DEFAULT are used. */
	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;

	for (k = 0; k < nblocks; k++)
	{
		starpu_iteration_push(k);

		/* Factorize the diagonal block (k,k). */
		starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
				STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
				STARPU_RW, data_handles[k][k],
				0);

		for (m = k+1; m<nblocks; m++)
		{
			/* Solve block (m,k) against the factorized diagonal
			 * block; the first row below the diagonal gets top
			 * priority since it unlocks the next panel. */
			starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
					STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
					STARPU_R, data_handles[k][k],
					STARPU_RW, data_handles[m][k],
					0);

			/* Drop cached remote replicates of the diagonal block
			 * and, if this rank owns it, hint that it will not be
			 * used again.  NOTE(review): this is submitted once
			 * per m iteration; repeats after the first one are
			 * presumably no-ops — confirm against the StarPU MPI
			 * cache semantics. */
			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][k]);
			if (my_distrib(k, k, nodes) == rank)
				starpu_data_wont_use(data_handles[k][k]);

			/* Update the trailing blocks of row m, lower triangle
			 * only (n <= m). */
			for (n = k+1; n<nblocks; n++)
			{
				if (n <= m)
				{
					starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
							STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
							STARPU_R, data_handles[n][k],
							STARPU_R, data_handles[m][k],
							/* COMMUTE: accumulations into (m,n) from
							 * different panels may run in any order. */
							STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
							0);
				}
			}

			/* Same flush/wont_use hints for block (m,k) once its
			 * updates for this panel have been submitted. */
			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[m][k]);
			if (my_distrib(m, k, nodes) == rank)
				starpu_data_wont_use(data_handles[m][k]);
		}
		starpu_iteration_pop();
	}
}
|
|
|
+
|
|
|
+/* TODO: generate from compiler polyhedral analysis of classical algorithm */
|
|
|
+static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, int nodes)
|
|
|
+{
|
|
|
+ unsigned k, m, n;
|
|
|
+ unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
|
|
|
+
|
|
|
+ /* Column */
|
|
|
+ for (n = 0; n<nblocks; n++)
|
|
|
+ {
|
|
|
+ starpu_iteration_push(n);
|
|
|
+
|
|
|
+ /* Row */
|
|
|
+ for (m = n; m<nblocks; m++)
|
|
|
+ {
|
|
|
+ for (k = 0; k < n; k++)
|
|
|
+ {
|
|
|
+ /* Accumulate updates from TRSMs */
|
|
|
+ starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
|
|
|
+ STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
|
|
|
+ STARPU_R, data_handles[n][k],
|
|
|
+ STARPU_R, data_handles[m][k],
|
|
|
+ STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
|
|
|
+ 0);
|
|
|
+ }
|
|
|
+ k = n;
|
|
|
+ if (m > n)
|
|
|
+ {
|
|
|
+ /* non-diagonal block, solve */
|
|
|
+ starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
|
|
|
+ STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
|
|
|
+ STARPU_R, data_handles[k][k],
|
|
|
+ STARPU_RW, data_handles[m][k],
|
|
|
+ 0);
|
|
|
+ }
|
|
|
+ else
|
|
|
+ {
|
|
|
+ /* diagonal block, factorize */
|
|
|
+ starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
|
|
|
+ STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
|
|
|
+ STARPU_RW, data_handles[k][k],
|
|
|
+ 0);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ starpu_iteration_pop();
|
|
|
+ }
|
|
|
+
|
|
|
+ /* Submit flushes, StarPU will fit them according to the progress */
|
|
|
+ starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
|
|
|
+ for (m = 0; m < nblocks; m++)
|
|
|
+ for (n = 0; n < nblocks ; n++)
|
|
|
+ starpu_data_wont_use(data_handles[m][n]);
|
|
|
+}
|
|
|
+
|
|
|
+/* TODO: generate from compiler polyhedral analysis of classical algorithm */
|
|
|
+static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int rank, int nodes)
|
|
|
+{
|
|
|
+ unsigned a, c;
|
|
|
+ unsigned k, m, n;
|
|
|
+ unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
|
|
|
+
|
|
|
+ /* double-antidiagonal number:
|
|
|
+ * - a=0 contains (0,0) plus (1,0)
|
|
|
+ * - a=1 contains (2,0), (1,1) plus (3,0), (2, 1)
|
|
|
+ * - etc.
|
|
|
+ */
|
|
|
+ for (a = 0; a < nblocks; a++)
|
|
|
+ {
|
|
|
+ starpu_iteration_push(a);
|
|
|
+
|
|
|
+ unsigned nfirst;
|
|
|
+ if (2*a < nblocks)
|
|
|
+ nfirst = 0;
|
|
|
+ else
|
|
|
+ nfirst = 2*a - (nblocks-1);
|
|
|
+
|
|
|
+ /* column within first antidiagonal for a */
|
|
|
+ for (n = nfirst; n <= a; n++)
|
|
|
+ {
|
|
|
+ /* row */
|
|
|
+ m = 2*a-n;
|
|
|
+
|
|
|
+ /* Accumulate updates from TRSMs */
|
|
|
+ for (k = 0; k < n; k++)
|
|
|
+ {
|
|
|
+ starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
|
|
|
+ STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
|
|
|
+ STARPU_R, data_handles[n][k],
|
|
|
+ STARPU_R, data_handles[m][k],
|
|
|
+ STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
|
|
|
+ 0);
|
|
|
+ }
|
|
|
+
|
|
|
+ /* k = n */
|
|
|
+ if (n < a)
|
|
|
+ {
|
|
|
+ /* non-diagonal block, solve */
|
|
|
+ starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
|
|
|
+ STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
|
|
|
+ STARPU_R, data_handles[k][k],
|
|
|
+ STARPU_RW, data_handles[m][k],
|
|
|
+ 0);
|
|
|
+ }
|
|
|
+ else
|
|
|
+ {
|
|
|
+ /* diagonal block, factorize */
|
|
|
+ starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
|
|
|
+ STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
|
|
|
+ STARPU_RW, data_handles[k][k],
|
|
|
+ 0);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /* column within second antidiagonal for a */
|
|
|
+ for (n = nfirst; n <= a; n++)
|
|
|
+ {
|
|
|
+ /* row */
|
|
|
+ m = 2*a-n + 1;
|
|
|
+
|
|
|
+ if (m >= nblocks)
|
|
|
+ /* Skip first item when even number of tiles */
|
|
|
+ continue;
|
|
|
+
|
|
|
+ /* Accumulate updates from TRSMs */
|
|
|
+ for (k = 0; k < n; k++)
|
|
|
+ {
|
|
|
+ starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
|
|
|
+ STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
|
|
|
+ STARPU_R, data_handles[n][k],
|
|
|
+ STARPU_R, data_handles[m][k],
|
|
|
+ STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
|
|
|
+ 0);
|
|
|
+ }
|
|
|
+ /* non-diagonal block, solve */
|
|
|
+ k = n;
|
|
|
+ starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
|
|
|
+ STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
|
|
|
+ STARPU_R, data_handles[k][k],
|
|
|
+ STARPU_RW, data_handles[m][k],
|
|
|
+ 0);
|
|
|
+ }
|
|
|
+
|
|
|
+ starpu_iteration_pop();
|
|
|
+ }
|
|
|
+
|
|
|
+ /* Submit flushes, StarPU will fit them according to the progress */
|
|
|
+ starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
|
|
|
+ for (m = 0; m < nblocks; m++)
|
|
|
+ for (n = 0; n < nblocks ; n++)
|
|
|
+ starpu_data_wont_use(data_handles[m][n]);
|
|
|
+}
|
|
|
+
|
|
|
+/* TODO: generate from compiler polyhedral analysis of classical algorithm */
|
|
|
+static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int nodes)
|
|
|
+{
|
|
|
+ unsigned a;
|
|
|
+ int k, m, n;
|
|
|
+ unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * This is basically similar to above, except that we shift k according to the priorities set in the algorithm, so that prio ~ 2*a or 2*a+1
|
|
|
+ * double-antidiagonal number:
|
|
|
+ * - a=0 contains (0,0) plus (1,0)
|
|
|
+ * - a=1 contains (2,0), (1,1) plus (3,0), (2, 1)
|
|
|
+ * - etc.
|
|
|
+ */
|
|
|
+ for (a = 0; a < 4*nblocks; a++)
|
|
|
+ {
|
|
|
+ starpu_iteration_push(a);
|
|
|
+
|
|
|
+ for (k = 0; k < nblocks; k++)
|
|
|
+ {
|
|
|
+ n = k;
|
|
|
+ /* Should be m = a-k-n; for potrf and trsm to respect
|
|
|
+ priorities, but needs to be this for dependencies */
|
|
|
+ m = a-2*k-n;
|
|
|
+
|
|
|
+ if (m < 0 || m >= nblocks)
|
|
|
+ continue;
|
|
|
+
|
|
|
+ if (m == n)
|
|
|
+ {
|
|
|
+ /* diagonal block, factorize */
|
|
|
+ starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
|
|
|
+ STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
|
|
|
+ STARPU_RW, data_handles[k][k],
|
|
|
+ 0);
|
|
|
+ }
|
|
|
+ else
|
|
|
+ {
|
|
|
+ /* non-diagonal block, solve */
|
|
|
+ starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
|
|
|
+ STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
|
|
|
+ STARPU_R, data_handles[k][k],
|
|
|
+ STARPU_RW, data_handles[m][k],
|
|
|
+ 0);
|
|
|
+ }
|
|
|
+
|
|
|
+ /* column within antidiagonal for a */
|
|
|
+ for (n = k + 1; n < nblocks; n++)
|
|
|
+ {
|
|
|
+ /* row */
|
|
|
+ m = a-2*k-n;
|
|
|
+
|
|
|
+ if (m >= n && m < nblocks)
|
|
|
+ {
|
|
|
+ /* Update */
|
|
|
+ starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
|
|
|
+ STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
|
|
|
+ STARPU_R, data_handles[n][k],
|
|
|
+ STARPU_R, data_handles[m][k],
|
|
|
+ STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
|
|
|
+ 0);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+
|
|
|
+ starpu_iteration_pop();
|
|
|
+ }
|
|
|
+
|
|
|
+ /* Submit flushes, StarPU will fit them according to the progress */
|
|
|
+ starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
|
|
|
+ for (m = 0; m < nblocks; m++)
|
|
|
+ for (n = 0; n < nblocks ; n++)
|
|
|
+ starpu_data_wont_use(data_handles[m][n]);
|
|
|
+}
|
|
|
+
|
|
|
/*
|
|
|
* code to bootstrap the factorization
|
|
|
* and construct the DAG
|
|
@@ -79,8 +354,6 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
|
|
|
starpu_data_handle_t **data_handles;
|
|
|
unsigned k, m, n;
|
|
|
|
|
|
- unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
|
|
|
-
|
|
|
/* create all the DAG nodes */
|
|
|
|
|
|
data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
|
|
@@ -91,7 +364,7 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
|
|
|
for(n = 0; n < nblocks ; n++)
|
|
|
{
|
|
|
int mpi_rank = my_distrib(m, n, nodes);
|
|
|
- if (mpi_rank == rank)
|
|
|
+ if (mpi_rank == rank || (check && rank == 0))
|
|
|
{
|
|
|
//fprintf(stderr, "[%d] Owning data[%d][%d]\n", rank, n, m);
|
|
|
starpu_matrix_data_register(&data_handles[m][n], STARPU_MAIN_RAM, (uintptr_t)matA[m][n],
|
|
@@ -119,50 +392,17 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
|
|
|
starpu_mpi_barrier(MPI_COMM_WORLD);
|
|
|
start = starpu_timing_now();
|
|
|
|
|
|
- for (k = 0; k < nblocks; k++)
|
|
|
+ switch (submission)
|
|
|
{
|
|
|
- starpu_iteration_push(k);
|
|
|
-
|
|
|
- starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
|
|
|
- STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
|
|
|
- STARPU_RW, data_handles[k][k],
|
|
|
- 0);
|
|
|
-
|
|
|
- for (m = k+1; m<nblocks; m++)
|
|
|
- {
|
|
|
- starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
|
|
|
- STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
|
|
|
- STARPU_R, data_handles[k][k],
|
|
|
- STARPU_RW, data_handles[m][k],
|
|
|
- 0);
|
|
|
-
|
|
|
- starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][k]);
|
|
|
- if (my_distrib(k, k, nodes) == rank)
|
|
|
- starpu_data_wont_use(data_handles[k][k]);
|
|
|
-
|
|
|
- for (n = k+1; n<nblocks; n++)
|
|
|
- {
|
|
|
- if (n <= m)
|
|
|
- {
|
|
|
- starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
|
|
|
- STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
|
|
|
- STARPU_R, data_handles[n][k],
|
|
|
- STARPU_R, data_handles[m][k],
|
|
|
- STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
|
|
|
- 0);
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[m][k]);
|
|
|
- if (my_distrib(m, k, nodes) == rank)
|
|
|
- starpu_data_wont_use(data_handles[m][k]);
|
|
|
- }
|
|
|
- starpu_iteration_pop();
|
|
|
+ case TRIANGLES: run_cholesky(data_handles, rank, nodes); break;
|
|
|
+ case COLUMNS: run_cholesky_column(data_handles, rank, nodes); break;
|
|
|
+ case ANTIDIAGONALS: run_cholesky_antidiagonal(data_handles, rank, nodes); break;
|
|
|
+ case PRIOS: run_cholesky_prio(data_handles, rank, nodes); break;
|
|
|
+ default: STARPU_ABORT();
|
|
|
}
|
|
|
|
|
|
starpu_mpi_wait_for_all(MPI_COMM_WORLD);
|
|
|
starpu_mpi_barrier(MPI_COMM_WORLD);
|
|
|
-
|
|
|
end = starpu_timing_now();
|
|
|
|
|
|
for (m = 0; m < nblocks; m++)
|
|
@@ -170,7 +410,7 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
|
|
|
for(n = 0; n < nblocks ; n++)
|
|
|
{
|
|
|
/* Get back data on node 0 for the check */
|
|
|
- if (check)
|
|
|
+ if (check && data_handles[m][n])
|
|
|
starpu_mpi_get_data_on_node(MPI_COMM_WORLD, data_handles[m][n], 0);
|
|
|
|
|
|
if (data_handles[m][n])
|
|
@@ -248,24 +488,20 @@ void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *corr
|
|
|
{
|
|
|
for (m = 0; m < nblocks; m++)
|
|
|
{
|
|
|
- int mpi_rank = my_distrib(m, n, nodes);
|
|
|
- if (mpi_rank == rank)
|
|
|
+ for (nn = BLOCKSIZE*n ; nn < BLOCKSIZE*(n+1); nn++)
|
|
|
{
|
|
|
- for (nn = (size/nblocks)*n ; nn < (size/nblocks)*n+(size/nblocks); nn++)
|
|
|
+ for (mm = BLOCKSIZE*m ; mm < BLOCKSIZE*(m+1); mm++)
|
|
|
{
|
|
|
- for (mm = (size/nblocks)*m ; mm < (size/nblocks)*m+(size/nblocks); mm++)
|
|
|
+ if (nn <= mm)
|
|
|
{
|
|
|
- if (nn <= mm)
|
|
|
+ float orig = (1.0f/(1.0f+nn+mm)) + ((nn == mm)?1.0f*size:0.0f);
|
|
|
+ float err = fabsf(test_mat[mm +nn*size] - orig) / orig;
|
|
|
+ if (err > epsilon)
|
|
|
{
|
|
|
- float orig = (1.0f/(1.0f+nn+mm)) + ((nn == mm)?1.0f*size:0.0f);
|
|
|
- float err = fabsf(test_mat[mm +nn*size] - orig) / orig;
|
|
|
- if (err > epsilon)
|
|
|
- {
|
|
|
- FPRINTF(stderr, "[%d] Error[%u, %u] --> %2.20f != %2.20f (err %2.20f)\n", rank, nn, mm, test_mat[mm +nn*size], orig, err);
|
|
|
- *correctness = 0;
|
|
|
- *flops = 0;
|
|
|
- break;
|
|
|
- }
|
|
|
+ FPRINTF(stderr, "[%d] Error[%u, %u] --> %2.20f != %2.20f (err %2.20f)\n", rank, nn, mm, test_mat[mm +nn*size], orig, err);
|
|
|
+ *correctness = 0;
|
|
|
+ *flops = 0;
|
|
|
+ break;
|
|
|
}
|
|
|
}
|
|
|
}
|