浏览代码

Merge remote-tracking branch 'origin/master' into ft_checkpoint

Romain LION 5 年之前
父节点
当前提交
42b2e40b60

+ 2 - 2
ChangeLog

@@ -37,8 +37,8 @@ New features:
     thread.
   * New function starpu_get_pu_os_index() to convert logical index of a PU to
     its OS index.
-  * New function starpu_get_hwloc_topology() to get a copy of the hwloc
-    topology used by StarPU.
+  * New function starpu_get_hwloc_topology() to get the hwloc topology used by
+    StarPU.
   * Add a task prefetch level, to improve retaining data in accelerators so we
     can make prefetch more aggressive.
 

文件差异内容过多而无法显示
+ 69 - 43
doc/doxygen/chapters/410_mpi_support.doxy


+ 1 - 0
examples/Makefile.am

@@ -225,6 +225,7 @@ STARPU_EXAMPLES +=				\
 	cpp/incrementer_cpp			\
 	cpp/add_vectors				\
 	cpp/add_vectors_interface		\
+	filters/fread				\
 	filters/fvector				\
 	filters/fblock				\
 	filters/fmatrix				\

+ 146 - 0
examples/filters/fread.c

@@ -0,0 +1,146 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
+#define NX    20
+#define PARTS 2
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+void display_func(void *buffers[], void *cl_arg)
+{
+        unsigned i;
+
+        /* length of the vector */
+        unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
+        /* local copy of the vector pointer */
+        int *val = (int *)STARPU_VECTOR_GET_PTR(buffers[0]);
+
+	FPRINTF(stderr, "vector with n=%u : ", n);
+        for (i = 0; i < n; i++)
+		FPRINTF(stderr, "%5d ", val[i]);
+	FPRINTF(stderr, "\n");
+}
+
+void cpu_func(void *buffers[], void *cl_arg)
+{
+        unsigned i;
+
+        /* length of the vector */
+        unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
+        /* local copy of the vector pointer */
+        int *val = (int *)STARPU_VECTOR_GET_PTR(buffers[0]);
+
+	FPRINTF(stderr, "computing on vector with n=%u\n", n);
+        for (i = 0; i < n; i++)
+                val[i] *= 2;
+}
+
+int main(void)
+{
+	int i;
+        int vector[NX];
+        starpu_data_handle_t handle;
+	starpu_data_handle_t subhandles[PARTS];
+	int ret;
+
+        struct starpu_codelet cl =
+	{
+                .cpu_funcs = {cpu_func},
+                .cpu_funcs_name = {"cpu_func"},
+                .nbuffers = 1,
+		.modes = {STARPU_RW},
+		.name = "vector_scal"
+        };
+        struct starpu_codelet print_cl =
+	{
+                .cpu_funcs = {display_func},
+                .cpu_funcs_name = {"display_func"},
+                .nbuffers = 1,
+		.modes = {STARPU_R},
+		.name = "vector_display"
+        };
+
+        for(i=0 ; i<NX ; i++) vector[i] = i;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		exit(77);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	/* Declare data to StarPU */
+	starpu_vector_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX, sizeof(vector[0]));
+
+        /* Partition the vector in PARTS sub-vectors */
+	struct starpu_data_filter f =
+	{
+		.filter_func = starpu_vector_filter_block,
+		.nchildren = PARTS
+	};
+	starpu_data_partition_plan(handle, &f, subhandles);
+
+	ret = starpu_task_insert(&print_cl,
+				 STARPU_R, handle,
+				 0);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+        /* Submit a task on each sub-vector */
+	for (i=0; i<PARTS; i++)
+	{
+		ret = starpu_task_insert(&cl,
+					 STARPU_RW, subhandles[i],
+					 0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	/* Submit a read on the whole vector */
+	ret = starpu_task_insert(&print_cl,
+				 STARPU_R, handle,
+				 0);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+        /* Submit a read on each sub-vector */
+	for (i=0; i<PARTS; i++)
+	{
+		ret = starpu_task_insert(&print_cl,
+					 STARPU_R, subhandles[i],
+					 0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	/* Submit a read on the whole vector */
+	ret = starpu_task_insert(&print_cl,
+				 STARPU_R, handle,
+				 0);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	starpu_data_partition_clean(handle, PARTS, subhandles);
+        starpu_data_unregister(handle);
+	starpu_shutdown();
+
+	return 0;
+
+enodev:
+	FPRINTF(stderr, "WARNING: No one can execute this task\n");
+	starpu_shutdown();
+	return 77;
+}

+ 3 - 2
include/starpu_helper.h

@@ -202,9 +202,10 @@ int starpu_get_pu_os_index(unsigned logical_index);
 
 #ifdef STARPU_HAVE_HWLOC
 /**
-   Get a copy of the hwloc topology used by StarPU.
+   Get the hwloc topology used by StarPU. One can use this pointer to get
+   information about topology, but not to change settings related to topology.
 */
-int starpu_get_hwloc_topology(hwloc_topology_t* topology);
+hwloc_topology_t starpu_get_hwloc_topology(void);
 #endif
 /** @} */
 

+ 7 - 1
mpi/examples/Makefile.am

@@ -336,7 +336,8 @@ starpu_mpi_EXAMPLES	+=			\
 
 examplebin_PROGRAMS +=				\
 	user_datatype/user_datatype		\
-	user_datatype/user_datatype2
+	user_datatype/user_datatype2		\
+	user_datatype/user_datatype_early
 
 user_datatype_user_datatype_SOURCES =		\
 	user_datatype/user_datatype.c		\
@@ -346,9 +347,14 @@ user_datatype_user_datatype2_SOURCES =		\
 	user_datatype/user_datatype2.c		\
 	user_datatype/my_interface.c
 
+user_datatype_user_datatype_early_SOURCES =	\
+	user_datatype/user_datatype_early.c	\
+	user_datatype/my_interface.c
+
 if !STARPU_SIMGRID
 starpu_mpi_EXAMPLES	+=			\
 	user_datatype/user_datatype2		\
+	user_datatype/user_datatype_early	\
 	user_datatype/user_datatype
 endif
 

+ 2 - 2
mpi/examples/matrix_decomposition/mpi_cholesky.c

@@ -58,7 +58,7 @@ int main(int argc, char **argv)
 #ifndef STARPU_SIMGRID
 	matrix_display(bmat, rank);
 
-	if (check)
+	if (check && rank == 0)
 		dw_cholesky_check_computation(bmat, rank, nodes, &correctness, &flops, 0.001);
 #endif
 
@@ -68,7 +68,7 @@ int main(int argc, char **argv)
 	starpu_mpi_shutdown();
 
 #ifndef STARPU_SIMGRID
-	if (check)
+	if (check && rank == 0)
 		assert(correctness);
 #endif
 

+ 293 - 57
mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c

@@ -68,6 +68,281 @@ static struct starpu_codelet cl22 =
 	.color = 0x00ff00,
 };
 
+static void run_cholesky(starpu_data_handle_t **data_handles, int rank, int nodes)
+{
+	unsigned k, m, n;
+	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
+
+	for (k = 0; k < nblocks; k++)
+	{
+		starpu_iteration_push(k);
+
+		starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
+				       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
+				       STARPU_RW, data_handles[k][k],
+				       0);
+
+		for (m = k+1; m<nblocks; m++)
+		{
+			starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
+					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+					       STARPU_R, data_handles[k][k],
+					       STARPU_RW, data_handles[m][k],
+					       0);
+
+			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][k]);
+			if (my_distrib(k, k, nodes) == rank)
+				starpu_data_wont_use(data_handles[k][k]);
+
+			for (n = k+1; n<nblocks; n++)
+			{
+				if (n <= m)
+				{
+					starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
+							       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+							       STARPU_R, data_handles[n][k],
+							       STARPU_R, data_handles[m][k],
+							       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+							       0);
+				}
+			}
+
+			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[m][k]);
+			if (my_distrib(m, k, nodes) == rank)
+				starpu_data_wont_use(data_handles[m][k]);
+		}
+		starpu_iteration_pop();
+	}
+}
+
+/* TODO: generate from compiler polyhedral analysis of classical algorithm */
+static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, int nodes)
+{
+	unsigned k, m, n;
+	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
+
+	/* Column */
+	for (n = 0; n<nblocks; n++)
+	{
+		starpu_iteration_push(n);
+
+		/* Row */
+		for (m = n; m<nblocks; m++)
+		{
+			for (k = 0; k < n; k++)
+			{
+				/* Accumulate updates from TRSMs */
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						       STARPU_R, data_handles[n][k],
+						       STARPU_R, data_handles[m][k],
+						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+						       0);
+			}
+			k = n;
+			if (m > n)
+			{
+				/* non-diagonal block, solve */
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						       STARPU_R, data_handles[k][k],
+						       STARPU_RW, data_handles[m][k],
+						       0);
+			}
+			else
+			{
+				/* diagonal block, factorize */
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
+						       STARPU_RW, data_handles[k][k],
+						       0);
+			}
+		}
+
+		starpu_iteration_pop();
+	}
+
+	/* Submit flushes, StarPU will fit them according to the progress */
+	starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
+	for (m = 0; m < nblocks; m++)
+		for (n = 0; n < nblocks ; n++)
+			starpu_data_wont_use(data_handles[m][n]);
+}
+
+/* TODO: generate from compiler polyhedral analysis of classical algorithm */
+static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int rank, int nodes)
+{
+	unsigned a, c;
+	unsigned k, m, n;
+	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
+
+	/* double-antidiagonal number:
+	 * - a=0 contains (0,0) plus (1,0)
+	 * - a=1 contains (2,0), (1,1) plus (3,0), (2, 1)
+	 * - etc.
+	 */
+	for (a = 0; a < nblocks; a++)
+	{
+		starpu_iteration_push(a);
+
+		unsigned nfirst;
+		if (2*a < nblocks)
+			nfirst = 0;
+		else
+			nfirst = 2*a - (nblocks-1);
+
+		/* column within first antidiagonal for a */
+		for (n = nfirst; n <= a; n++)
+		{
+			/* row */
+			m = 2*a-n;
+
+			/* Accumulate updates from TRSMs */
+			for (k = 0; k < n; k++)
+			{
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						       STARPU_R, data_handles[n][k],
+						       STARPU_R, data_handles[m][k],
+						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+						       0);
+			}
+
+			/* k = n */
+			if (n < a)
+			{
+				/* non-diagonal block, solve */
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						       STARPU_R, data_handles[k][k],
+						       STARPU_RW, data_handles[m][k],
+						       0);
+			}
+			else
+			{
+				/* diagonal block, factorize */
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
+						       STARPU_RW, data_handles[k][k],
+						       0);
+			}
+		}
+
+		/* column within second antidiagonal for a */
+		for (n = nfirst; n <= a; n++)
+		{
+			/* row */
+			m = 2*a-n + 1;
+
+			if (m >= nblocks)
+				/* Skip first item when even number of tiles */
+				continue;
+
+			/* Accumulate updates from TRSMs */
+			for (k = 0; k < n; k++)
+			{
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						       STARPU_R, data_handles[n][k],
+						       STARPU_R, data_handles[m][k],
+						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+						       0);
+			}
+			/* non-diagonal block, solve */
+			k = n;
+			starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
+					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+					       STARPU_R, data_handles[k][k],
+					       STARPU_RW, data_handles[m][k],
+					       0);
+		}
+
+		starpu_iteration_pop();
+	}
+
+	/* Submit flushes, StarPU will fit them according to the progress */
+	starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
+	for (m = 0; m < nblocks; m++)
+		for (n = 0; n < nblocks ; n++)
+			starpu_data_wont_use(data_handles[m][n]);
+}
+
+/* TODO: generate from compiler polyhedral analysis of classical algorithm */
+static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int nodes)
+{
+	unsigned a;
+	int k, m, n;
+	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
+
+	/*
+	 * This is basically similar to above, except that we shift k according to the priorities set in the algorithm, so that prio ~ 2*a or 2*a+1
+	 * double-antidiagonal number:
+	 * - a=0 contains (0,0) plus (1,0)
+	 * - a=1 contains (2,0), (1,1) plus (3,0), (2, 1)
+	 * - etc.
+	 */
+	for (a = 0; a < 4*nblocks; a++)
+	{
+		starpu_iteration_push(a);
+
+		for (k = 0; k < nblocks; k++)
+		{
+			n = k;
+			/* Should be m = a-k-n; for potrf and trsm to respect
+			   priorities, but needs to be this for dependencies */
+			m = a-2*k-n;
+
+			if (m < 0 || m >= nblocks)
+				continue;
+
+			if (m == n)
+			{
+				/* diagonal block, factorize */
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
+						       STARPU_RW, data_handles[k][k],
+						       0);
+			}
+			else
+			{
+				/* non-diagonal block, solve */
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						       STARPU_R, data_handles[k][k],
+						       STARPU_RW, data_handles[m][k],
+						       0);
+			}
+
+			/* column within antidiagonal for a */
+			for (n = k + 1; n < nblocks; n++)
+			{
+				/* row */
+				m = a-2*k-n;
+
+				if (m >= n && m < nblocks)
+				{
+					/* Update */
+					starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
+							       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+							       STARPU_R, data_handles[n][k],
+							       STARPU_R, data_handles[m][k],
+							       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+							       0);
+				}
+			}
+
+		}
+
+		starpu_iteration_pop();
+	}
+
+	/* Submit flushes, StarPU will fit them according to the progress */
+	starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
+	for (m = 0; m < nblocks; m++)
+		for (n = 0; n < nblocks ; n++)
+			starpu_data_wont_use(data_handles[m][n]);
+}
+
 /*
  *	code to bootstrap the factorization
  *	and construct the DAG
@@ -79,8 +354,6 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 	starpu_data_handle_t **data_handles;
 	unsigned k, m, n;
 
-	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
-
 	/* create all the DAG nodes */
 
 	data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
@@ -91,7 +364,7 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 		for(n = 0; n < nblocks ; n++)
 		{
 			int mpi_rank = my_distrib(m, n, nodes);
-			if (mpi_rank == rank)
+			if (mpi_rank == rank || (check && rank == 0))
 			{
 				//fprintf(stderr, "[%d] Owning data[%d][%d]\n", rank, n, m);
 				starpu_matrix_data_register(&data_handles[m][n], STARPU_MAIN_RAM, (uintptr_t)matA[m][n],
@@ -119,50 +392,17 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 	starpu_mpi_barrier(MPI_COMM_WORLD);
 	start = starpu_timing_now();
 
-	for (k = 0; k < nblocks; k++)
+	switch (submission)
 	{
-		starpu_iteration_push(k);
-
-		starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
-				       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
-				       STARPU_RW, data_handles[k][k],
-				       0);
-
-		for (m = k+1; m<nblocks; m++)
-		{
-			starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
-					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
-					       STARPU_R, data_handles[k][k],
-					       STARPU_RW, data_handles[m][k],
-					       0);
-
-			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][k]);
-			if (my_distrib(k, k, nodes) == rank)
-				starpu_data_wont_use(data_handles[k][k]);
-
-			for (n = k+1; n<nblocks; n++)
-			{
-				if (n <= m)
-				{
-					starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
-							       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
-							       STARPU_R, data_handles[n][k],
-							       STARPU_R, data_handles[m][k],
-							       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
-							       0);
-				}
-			}
-
-			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[m][k]);
-			if (my_distrib(m, k, nodes) == rank)
-				starpu_data_wont_use(data_handles[m][k]);
-		}
-		starpu_iteration_pop();
+		case TRIANGLES:		run_cholesky(data_handles, rank, nodes); break;
+		case COLUMNS:		run_cholesky_column(data_handles, rank, nodes); break;
+		case ANTIDIAGONALS:	run_cholesky_antidiagonal(data_handles, rank, nodes); break;
+		case PRIOS:		run_cholesky_prio(data_handles, rank, nodes); break;
+		default: STARPU_ABORT();
 	}
 
 	starpu_mpi_wait_for_all(MPI_COMM_WORLD);
 	starpu_mpi_barrier(MPI_COMM_WORLD);
-
 	end = starpu_timing_now();
 
 	for (m = 0; m < nblocks; m++)
@@ -170,7 +410,7 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 		for(n = 0; n < nblocks ; n++)
 		{
 			/* Get back data on node 0 for the check */
-			if (check)
+			if (check && data_handles[m][n])
 				starpu_mpi_get_data_on_node(MPI_COMM_WORLD, data_handles[m][n], 0);
 
 			if (data_handles[m][n])
@@ -248,24 +488,20 @@ void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *corr
 	{
 		for (m = 0; m < nblocks; m++)
 		{
-			int mpi_rank = my_distrib(m, n, nodes);
-			if (mpi_rank == rank)
+			for (nn = BLOCKSIZE*n ; nn < BLOCKSIZE*(n+1); nn++)
 			{
-				for (nn = (size/nblocks)*n ; nn < (size/nblocks)*n+(size/nblocks); nn++)
+				for (mm = BLOCKSIZE*m ; mm < BLOCKSIZE*(m+1); mm++)
 				{
-					for (mm = (size/nblocks)*m ; mm < (size/nblocks)*m+(size/nblocks); mm++)
+					if (nn <= mm)
 					{
-						if (nn <= mm)
+						float orig = (1.0f/(1.0f+nn+mm)) + ((nn == mm)?1.0f*size:0.0f);
+						float err = fabsf(test_mat[mm +nn*size] - orig) / orig;
+						if (err > epsilon)
 						{
-							float orig = (1.0f/(1.0f+nn+mm)) + ((nn == mm)?1.0f*size:0.0f);
-							float err = fabsf(test_mat[mm +nn*size] - orig) / orig;
-							if (err > epsilon)
-							{
-								FPRINTF(stderr, "[%d] Error[%u, %u] --> %2.20f != %2.20f (err %2.20f)\n", rank, nn, mm, test_mat[mm +nn*size], orig, err);
-								*correctness = 0;
-								*flops = 0;
-								break;
-							}
+							FPRINTF(stderr, "[%d] Error[%u, %u] --> %2.20f != %2.20f (err %2.20f)\n", rank, nn, mm, test_mat[mm +nn*size], orig, err);
+							*correctness = 0;
+							*flops = 0;
+							break;
 						}
 					}
 				}

+ 17 - 1
mpi/examples/matrix_decomposition/mpi_decomposition_params.c

@@ -43,6 +43,7 @@ unsigned check = 0;
 unsigned display = 0;
 int dblockx = -1;
 int dblocky = -1;
+enum submission submission = TRIANGLES;
 
 void parse_args(int argc, char **argv, int nodes)
 {
@@ -79,6 +80,21 @@ void parse_args(int argc, char **argv, int nodes)
                         nbigblocks = strtol(argv[++i], &argptr, 10);
                 }
 
+                if (strcmp(argv[i], "-columns") == 0)
+                {
+                        submission = COLUMNS;
+                }
+
+                if (strcmp(argv[i], "-antidiagonals") == 0)
+                {
+                        submission = ANTIDIAGONALS;
+                }
+
+                if (strcmp(argv[i], "-prios") == 0)
+                {
+                        submission = PRIOS;
+                }
+
                 if (strcmp(argv[i], "-no-prio") == 0)
                 {
                         noprio = 1;
@@ -96,7 +112,7 @@ void parse_args(int argc, char **argv, int nodes)
 
                 if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
                 {
-			printf("usage : %s [-size size] [-nblocks nblocks] [-no-prio] [-display] [-check]\n", argv[0]);
+                        printf("usage : %s [-size size] [-nblocks nblocks] [-columns] [-antidiagonals] [-prios] [-no-prio] [-display] [-check]\n", argv[0]);
                 }
         }
 

+ 9 - 0
mpi/examples/matrix_decomposition/mpi_decomposition_params.h

@@ -28,6 +28,15 @@ extern unsigned display;
 extern int dblockx;
 extern int dblocky;
 
+enum submission
+{
+	TRIANGLES,
+	COLUMNS,
+	ANTIDIAGONALS,
+	PRIOS,
+};
+extern enum submission submission;
+
 void parse_args(int argc, char **argv, int nodes);
 
 #endif // __MPI_CHOLESKY_PARAMS_H__

+ 15 - 0
mpi/examples/user_datatype/my_interface.c

@@ -15,6 +15,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_mpi.h>
 
 #include "my_interface.h"
 
@@ -314,6 +315,7 @@ void starpu_my_data_register(starpu_data_handle_t *handleptr, unsigned home_node
 	if (interface_data_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)
 	{
 		interface_data_ops.interfaceid = starpu_data_interface_get_next_id();
+		starpu_mpi_interface_datatype_register(interface_data_ops.interfaceid, starpu_my_data_datatype_allocate, starpu_my_data_datatype_free);
 	}
 
 	struct starpu_my_data_interface data =
@@ -327,6 +329,12 @@ void starpu_my_data_register(starpu_data_handle_t *handleptr, unsigned home_node
 	starpu_data_register(handleptr, home_node, &data, &interface_data_ops);
 }
 
+void starpu_my_data_shutdown(void)
+{
+	starpu_mpi_interface_datatype_unregister(interface_data_ops.interfaceid);
+
+}
+
 static struct starpu_data_interface_ops interface_data2_ops =
 {
 	.register_data_handle = data_register_data_handle,
@@ -349,6 +357,7 @@ void starpu_my_data2_register(starpu_data_handle_t *handleptr, unsigned home_nod
 	if (interface_data2_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)
 	{
 		interface_data2_ops.interfaceid = starpu_data_interface_get_next_id();
+		starpu_mpi_interface_datatype_register(interface_data2_ops.interfaceid, starpu_my_data2_datatype_allocate, starpu_my_data2_datatype_free);
 	}
 
 	struct starpu_my_data_interface data =
@@ -361,3 +370,9 @@ void starpu_my_data2_register(starpu_data_handle_t *handleptr, unsigned home_nod
 
 	starpu_data_register(handleptr, home_node, &data, &interface_data2_ops);
 }
+
+void starpu_my_data2_shutdown(void)
+{
+	starpu_mpi_interface_datatype_unregister(interface_data2_ops.interfaceid);
+
+}

+ 3 - 0
mpi/examples/user_datatype/my_interface.h

@@ -76,4 +76,7 @@ static struct starpu_codelet starpu_my_data_compare_codelet =
 	.name = "starpu_my_data_compare_codelet"
 };
 
+void starpu_my_data_shutdown(void);
+void starpu_my_data2_shutdown(void);
+
 #endif /* __MY_INTERFACE_H */

+ 1 - 4
mpi/examples/user_datatype/user_datatype.c

@@ -57,9 +57,6 @@ int main(int argc, char **argv)
 
 	starpu_my_data_register(&handle0, STARPU_MAIN_RAM, &my0);
 	starpu_my_data_register(&handle1, -1, &my1);
-	starpu_mpi_datatype_register(handle1, starpu_my_data_datatype_allocate, starpu_my_data_datatype_free);
-
-	starpu_mpi_barrier(MPI_COMM_WORLD);
 
 	// Send data directly with MPI
 	if (rank == 0)
@@ -123,10 +120,10 @@ int main(int argc, char **argv)
 	starpu_mpi_wait_for_all(MPI_COMM_WORLD);
 	starpu_mpi_barrier(MPI_COMM_WORLD);
 
-	starpu_mpi_datatype_unregister(handle0);
 	starpu_data_unregister(handle0);
 	starpu_data_unregister(handle1);
 
+	starpu_my_data_shutdown();
 	starpu_mpi_shutdown();
 
 	if (rank == 0)

+ 1 - 1
mpi/examples/user_datatype/user_datatype2.c

@@ -57,7 +57,6 @@ int main(int argc, char **argv)
 
 	starpu_my_data2_register(&handle0, STARPU_MAIN_RAM, &my0);
 	starpu_my_data2_register(&handle1, -1, &my1);
-	starpu_mpi_datatype_register(handle1, starpu_my_data2_datatype_allocate, starpu_my_data2_datatype_free);
 
 	starpu_mpi_barrier(MPI_COMM_WORLD);
 
@@ -87,6 +86,7 @@ int main(int argc, char **argv)
 	starpu_data_unregister(handle0);
 	starpu_data_unregister(handle1);
 
+	starpu_my_data2_shutdown();
 	starpu_mpi_shutdown();
 
 	if (rank == 0)

+ 92 - 0
mpi/examples/user_datatype/user_datatype_early.c

@@ -0,0 +1,92 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2015-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "my_interface.h"
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+int main(int argc, char **argv)
+{
+	int rank, nodes;
+	int ret=0;
+
+	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &nodes);
+
+	if (nodes < 2 || (starpu_cpu_worker_get_count() == 0))
+	{
+		if (rank == 0)
+		{
+			if (nodes < 2)
+				fprintf(stderr, "We need at least 2 processes.\n");
+			else
+				fprintf(stderr, "We need at least 1 CPU.\n");
+		}
+		starpu_mpi_shutdown();
+		return 77;
+	}
+
+	struct starpu_my_data my0 = {.d = 42 , .c = 'n'};
+	struct starpu_my_data my1 = {.d = 11 , .c = 'a'};
+
+	if (rank == 1)
+	{
+		my0.d *= 2;
+		my0.c += 1;
+		my1.d *= 2;
+		my1.c += 1;
+	}
+
+	starpu_data_handle_t handle0;
+	starpu_data_handle_t handle1;
+	starpu_my_data_register(&handle0, STARPU_MAIN_RAM, &my0);
+	starpu_my_data_register(&handle1, STARPU_MAIN_RAM, &my1);
+
+	if (rank == 0)
+	{
+		starpu_mpi_send(handle0, 1, 10, MPI_COMM_WORLD);
+		starpu_mpi_send(handle1, 1, 20, MPI_COMM_WORLD);
+	}
+	else if (rank == 1)
+	{
+		// We want handle0 to be received as early_data and as starpu_mpi_data_register() has not be called, it will be received as raw memory, and then unpacked with MPI_Unpack()
+		starpu_task_insert(&starpu_my_data_display_codelet, STARPU_VALUE, "node1 handle0 init value", strlen("node1 handle0 init value")+1, STARPU_R, handle0, 0);
+		starpu_task_insert(&starpu_my_data_display_codelet, STARPU_VALUE, "node1 handle1 init value", strlen("node1 handle1 init value")+1, STARPU_R, handle1, 0);
+		starpu_mpi_recv(handle1, 0, 20, MPI_COMM_WORLD, NULL);
+		starpu_mpi_recv(handle0, 0, 10, MPI_COMM_WORLD, NULL);
+		starpu_task_insert(&starpu_my_data_display_codelet, STARPU_VALUE, "node1 handle0 received value", strlen("node1 handle0 received value")+1, STARPU_R, handle0, 0);
+		starpu_task_insert(&starpu_my_data_display_codelet, STARPU_VALUE, "node1 handle1 received value", strlen("node1 handle1 received value")+1, STARPU_R, handle1, 0);
+	}
+
+	starpu_mpi_wait_for_all(MPI_COMM_WORLD);
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+
+	starpu_data_unregister(handle0);
+	starpu_data_unregister(handle1);
+
+	if (rank == 1)
+	{
+		STARPU_ASSERT_MSG(my0.d == 42 && my0.c == 'n' && my1.d == 11 && my1.c == 'a', "Incorrect received values");
+	}
+
+	starpu_my_data_shutdown();
+	starpu_mpi_shutdown();
+
+	return 0;
+}

+ 1 - 0
mpi/include/starpu_mpi.h

@@ -353,6 +353,7 @@ typedef void (*starpu_mpi_datatype_free_func_t)(MPI_Datatype *);
 /**
    Register functions to create and free a MPI datatype for the given
    handle.
+   Similar to starpu_mpi_interface_datatype_register().
    It is important that the function is called before any
    communication can take place for a data with the given handle. See
    \ref ExchangingUserDefinedDataInterface for an example.

+ 15 - 3
mpi/src/mpi/starpu_mpi_mpi.c

@@ -918,9 +918,21 @@ static void _starpu_mpi_early_data_cb(void* arg)
 		/* Data has been received as a raw memory, it has to be unpacked */
 		struct starpu_data_interface_ops *itf_src = starpu_data_get_interface_ops(args->early_handle);
 		struct starpu_data_interface_ops *itf_dst = starpu_data_get_interface_ops(args->data_handle);
-		STARPU_MPI_ASSERT_MSG(itf_dst->unpack_data, "The data interface does not define an unpack function\n");
-		itf_dst->unpack_data(args->data_handle, STARPU_MAIN_RAM, args->buffer, itf_src->get_size(args->early_handle));
-		args->buffer = NULL;
+		MPI_Datatype datatype = _starpu_mpi_datatype_get_user_defined_datatype(args->data_handle);
+
+		if (datatype)
+		{
+			int position=0;
+			void *ptr = starpu_data_get_local_ptr(args->data_handle);
+			MPI_Unpack(args->buffer, itf_src->get_size(args->early_handle), &position, ptr, 1, datatype, args->req->node_tag.node.comm);
+			_starpu_mpi_datatype_free(args->data_handle, &datatype);
+		}
+		else
+		{
+			STARPU_MPI_ASSERT_MSG(itf_dst->unpack_data, "The data interface does not define an unpack function\n");
+			itf_dst->unpack_data(args->data_handle, STARPU_MAIN_RAM, args->buffer, itf_src->get_size(args->early_handle));
+			args->buffer = NULL;
+		}
 	}
 	else
 	{

+ 21 - 0
mpi/src/starpu_mpi_datatype.c

@@ -208,6 +208,27 @@ static starpu_mpi_datatype_allocate_func_t handle_to_datatype_funcs[STARPU_MAX_I
 	[STARPU_MULTIFORMAT_INTERFACE_ID] = NULL,
 };
 
+MPI_Datatype _starpu_mpi_datatype_get_user_defined_datatype(starpu_data_handle_t data_handle)
+{
+	enum starpu_data_interface_id id = starpu_data_get_interface_id(data_handle);
+	if (id < STARPU_MAX_INTERFACE_ID) return 0;
+
+	struct _starpu_mpi_datatype_funcs *table;
+	STARPU_PTHREAD_MUTEX_LOCK(&_starpu_mpi_datatype_funcs_table_mutex);
+	HASH_FIND_INT(_starpu_mpi_datatype_funcs_table, &id, table);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&_starpu_mpi_datatype_funcs_table_mutex);
+	if (table && table->allocate_datatype_func)
+	{
+		MPI_Datatype datatype;
+		int ret = table->allocate_datatype_func(data_handle, &datatype);
+		if (ret == 0)
+			return datatype;
+		else
+			return 0;
+	}
+	return 0;
+}
+
 void _starpu_mpi_datatype_allocate(starpu_data_handle_t data_handle, struct _starpu_mpi_req *req)
 {
 	enum starpu_data_interface_id id = starpu_data_get_interface_id(data_handle);

+ 2 - 0
mpi/src/starpu_mpi_datatype.h

@@ -31,6 +31,8 @@ void _starpu_mpi_datatype_shutdown(void);
 void _starpu_mpi_datatype_allocate(starpu_data_handle_t data_handle, struct _starpu_mpi_req *req);
 void _starpu_mpi_datatype_free(starpu_data_handle_t data_handle, MPI_Datatype *datatype);
 
+MPI_Datatype _starpu_mpi_datatype_get_user_defined_datatype(starpu_data_handle_t data_handle);
+
 #ifdef __cplusplus
 }
 #endif

+ 2 - 2
mpi/tests/Makefile.am

@@ -137,13 +137,13 @@ starpu_mpi_TESTS +=				\
 	ring_sync_detached			\
 	temporary				\
 	user_defined_datatype			\
-	early_stuff				\
-	display_bindings
+	early_stuff
 
 if !STARPU_SIMGRID
 # missing support in simgrid
 starpu_mpi_TESTS +=				\
 	attr					\
+	display_bindings			\
 	mpi_earlyrecv2				\
 	mpi_earlyrecv2_sync			\
 	block_interface				\

+ 1 - 1
src/common/utils.c

@@ -528,7 +528,7 @@ void _starpu_gethostname(char *hostname, size_t size)
 
 	if (force_mpi_hostnames && force_mpi_hostnames[0])
 	{
-		char *host, *srv_hosts;
+		char *host=NULL, *srv_hosts;
 		srv_hosts = strdup(force_mpi_hostnames);
 		int rank;
 		if (starpu_mpi_world_rank)

+ 4 - 0
src/core/perfmodel/perfmodel_bus.c

@@ -318,9 +318,11 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 	if (starpu_get_env_number("STARPU_ENABLE_CUDA_GPU_GPU_DIRECT") != 0)
 	{
 		cures = cudaDeviceCanAccessPeer(&can, src, dst);
+		(void) cudaGetLastError();
 		if (!cures && can)
 		{
 			cures = cudaDeviceEnablePeerAccess(dst, 0);
+			(void) cudaGetLastError();
 			if (!cures)
 			{
 				_STARPU_DISP("GPU-Direct %d -> %d\n", dst, src);
@@ -344,9 +346,11 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 	if (starpu_get_env_number("STARPU_ENABLE_CUDA_GPU_GPU_DIRECT") != 0)
 	{
 		cures = cudaDeviceCanAccessPeer(&can, dst, src);
+		(void) cudaGetLastError();
 		if (!cures && can)
 		{
 			cures = cudaDeviceEnablePeerAccess(src, 0);
+			(void) cudaGetLastError();
 			if (!cures)
 			{
 				_STARPU_DISP("GPU-Direct %d -> %d\n", src, dst);

+ 2 - 2
src/core/topology.c

@@ -3072,10 +3072,10 @@ int starpu_get_pu_os_index(unsigned logical_index)
 }
 
 #ifdef STARPU_HAVE_HWLOC
-int starpu_get_hwloc_topology(hwloc_topology_t* topology)
+hwloc_topology_t starpu_get_hwloc_topology(void)
 {
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
 
-	return hwloc_topology_dup(topology, config->topology.hwtopology);
+	return config->topology.hwtopology;
 }
 #endif

+ 32 - 17
src/drivers/cuda/driver_cuda.c

@@ -336,9 +336,13 @@ static void init_device_context(unsigned devid, unsigned memnode)
 			{
 				int can;
 				cures = cudaDeviceCanAccessPeer(&can, devid, worker->devid);
+				(void) cudaGetLastError();
+
 				if (!cures && can)
 				{
 					cures = cudaDeviceEnablePeerAccess(worker->devid, 0);
+					(void) cudaGetLastError();
+
 					if (!cures)
 					{
 						_STARPU_DEBUG("Enabled GPU-Direct %d -> %d\n", worker->devid, devid);
@@ -883,27 +887,33 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 			_starpu_set_local_worker_key(worker);
 			finish_job_on_cuda(_starpu_get_job_associated_to_task(task), worker);
 			/* See next task if any */
-			if (worker->ntasks && worker->current_tasks[worker->first_task] != worker->task_transferring)
+			if (worker->ntasks)
 			{
-				task = worker->current_tasks[worker->first_task];
-				j = _starpu_get_job_associated_to_task(task);
-				if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
+				if (worker->current_tasks[worker->first_task] != worker->task_transferring)
 				{
-					/* An asynchronous task, it was already
-					 * queued, it's now running, record its start time.  */
-					_starpu_driver_start_job(worker, j, &worker->perf_arch, 0, starpu_profiling_status_get());
+					task = worker->current_tasks[worker->first_task];
+					j = _starpu_get_job_associated_to_task(task);
+					if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
+					{
+						/* An asynchronous task, it was already
+						 * queued, it's now running, record its start time.  */
+						_starpu_driver_start_job(worker, j, &worker->perf_arch, 0, starpu_profiling_status_get());
+					}
+					else
+					{
+						/* A synchronous task, we have finished
+						 * flushing the pipeline, we can now at
+						 * last execute it.  */
+
+						_STARPU_TRACE_EVENT("sync_task");
+						execute_job_on_cuda(task, worker);
+						_STARPU_TRACE_EVENT("end_sync_task");
+						worker->pipeline_stuck = 0;
+					}
 				}
 				else
-				{
-					/* A synchronous task, we have finished
-					 * flushing the pipeline, we can now at
-					 * last execute it.  */
-
-					_STARPU_TRACE_EVENT("sync_task");
-					execute_job_on_cuda(task, worker);
-					_STARPU_TRACE_EVENT("end_sync_task");
-					worker->pipeline_stuck = 0;
-				}
+					/* Data for next task didn't have time to finish transferring :/ */
+					_STARPU_TRACE_WORKER_START_FETCH_INPUT(NULL, workerid);
 			}
 #ifdef STARPU_USE_FXT
 			int k;
@@ -1164,6 +1174,7 @@ starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node,
 		{
 			cures = cudaMemcpyAsync((char *)dst_ptr, (char *)src_ptr, ssize, kind, stream);
 		}
+		(void) cudaGetLastError();
 		starpu_interface_end_driver_copy_async(src_node, dst_node, start);
 	}
 
@@ -1183,6 +1194,7 @@ starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node,
 		{
 			cures = cudaMemcpy((char *)dst_ptr, (char *)src_ptr, ssize, kind);
 		}
+		(void) cudaGetLastError();
 
 		if (!cures)
 			cures = cudaDeviceSynchronize();
@@ -1240,12 +1252,14 @@ starpu_cuda_copy2d_async_sync(void *src_ptr, unsigned src_node,
 			double start;
 			starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
 			cures = cudaMemcpy3DPeerAsync(&p, stream);
+			(void) cudaGetLastError();
 		}
 
 		/* Test if the asynchronous copy has failed or if the caller only asked for a synchronous copy */
 		if (stream == NULL || cures)
 		{
 			cures = cudaMemcpy3DPeer(&p);
+			(void) cudaGetLastError();
 
 			if (!cures)
 				cures = cudaDeviceSynchronize();
@@ -1337,6 +1351,7 @@ starpu_cuda_copy3d_async_sync(void *src_ptr, unsigned src_node,
 		if (stream == NULL || cures)
 		{
 			cures = cudaMemcpy3DPeer(&p);
+			(void) cudaGetLastError();
 
 			if (!cures)
 				cures = cudaDeviceSynchronize();

+ 4 - 1
tests/Makefile.am

@@ -148,7 +148,6 @@ myPROGRAMS =
 myPROGRAMS +=					\
 	main/bind				\
 	main/mkdtemp				\
-	main/display_binding			\
 	main/execute_schedule			\
 	main/insert_task_pack			\
 	main/insert_task_nullcodelet		\
@@ -225,6 +224,7 @@ myPROGRAMS +=				\
 	main/driver_api/init_run_deinit         \
 	main/driver_api/run_driver              \
 	main/deploop                            \
+	main/display_binding			\
 	main/execute_on_a_specific_worker	\
 	main/insert_task			\
 	main/insert_task_value			\
@@ -404,6 +404,9 @@ endif
 endif
 
 examplebin_PROGRAMS = \
+	microbenchs/async_tasks_overhead	\
+	microbenchs/sync_tasks_overhead		\
+	microbenchs/tasks_overhead		\
 	microbenchs/tasks_size_overhead		\
 	microbenchs/local_pingpong
 examplebin_SCRIPTS = \

+ 3 - 3
tests/perfmodels/regression_based.c

@@ -37,7 +37,7 @@ static void memset_cuda(void *descr[], void *arg)
 	(void)arg;
 	STARPU_SKIP_IF_VALGRIND;
 
-	int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	unsigned *ptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
 
 	cudaMemsetAsync(ptr, 42, n * sizeof(*ptr), starpu_cuda_get_local_stream());
@@ -53,7 +53,7 @@ void memset0_cpu(void *descr[], void *arg)
 	(void)arg;
 	STARPU_SKIP_IF_VALGRIND;
 
-	int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	unsigned *ptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
 	unsigned i;
 
@@ -66,7 +66,7 @@ void memset_cpu(void *descr[], void *arg)
 	(void)arg;
 	STARPU_SKIP_IF_VALGRIND;
 
-	int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	unsigned *ptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
 
 	starpu_usleep(10);

+ 1 - 1
tests/perfmodels/regression_based_01.c

@@ -42,7 +42,7 @@ void memset_cpu(void *descr[], void *arg)
 	(void)arg;
 	STARPU_SKIP_IF_VALGRIND;
 
-	int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	unsigned *ptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
 	unsigned i;
 

+ 2 - 2
tests/perfmodels/regression_based_02.c

@@ -39,7 +39,7 @@ void memset0_cpu(void *descr[], void *arg)
 	(void)arg;
 	STARPU_SKIP_IF_VALGRIND;
 
-	int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	unsigned *ptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
 	unsigned i;
 
@@ -57,7 +57,7 @@ void memset_cpu(void *descr[], void *arg)
 	(void)arg;
 	STARPU_SKIP_IF_VALGRIND;
 
-	int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	unsigned *ptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
 	int i;
 

+ 2 - 2
tests/perfmodels/regression_based_03.c

@@ -40,7 +40,7 @@ void memset0_cpu(void *descr[], void *arg)
 	(void)arg;
 	STARPU_SKIP_IF_VALGRIND;
 
-	int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	unsigned *ptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
 	unsigned i;
 
@@ -58,7 +58,7 @@ void memset_cpu(void *descr[], void *arg)
 	(void)arg;
 	STARPU_SKIP_IF_VALGRIND;
 
-	int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	unsigned *ptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
 	int i;
 

+ 3 - 3
tests/perfmodels/regression_based_04.c

@@ -40,7 +40,7 @@ static void memset_cuda(void *descr[], void *arg)
 	(void)arg;
 	STARPU_SKIP_IF_VALGRIND;
 
-	int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	unsigned *ptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
 
 	cudaMemsetAsync(ptr, 42, n * sizeof(*ptr), starpu_cuda_get_local_stream());
@@ -57,7 +57,7 @@ void memset0_cpu(void *descr[], void *arg)
 	(void)arg;
 	STARPU_SKIP_IF_VALGRIND;
 
-	int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	unsigned *ptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
 	unsigned i;
 
@@ -73,7 +73,7 @@ void memset_cpu(void *descr[], void *arg)
 	(void)arg;
 	STARPU_SKIP_IF_VALGRIND;
 
-	int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	unsigned *ptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
 
 	//starpu_usleep(10);