Browse Source

merge mpi

Andra Hugo 13 years ago
parent
commit
c5c7b64348

+ 278 - 6
mpi/Makefile.am

@@ -14,16 +14,288 @@
 #
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 
-SUBDIRS=src tests examples
+CC=$(MPICC)
+CCLD=$(MPICC)
+
+if STARPU_MPI_CHECK
+TESTS_ENVIRONMENT	=	$(MPIEXEC) -np 2
+TESTS			=	$(check_PROGRAMS)
+endif
+
+check_PROGRAMS =
+
+BUILT_SOURCES =
+
+CLEANFILES = *.gcno *.gcda *.linkinfo
+
+EXTRA_DIST = 					\
+	examples/mpi_lu/float.h			\
+	examples/mpi_lu/double.h		\
+	examples/mpi_lu/plu_example.c		\
+	examples/mpi_lu/plu_solve.c		\
+	examples/mpi_lu/pxlu.h			\
+	examples/mpi_lu/pxlu.c			\
+	examples/mpi_lu/pxlu_kernels.h		\
+	examples/mpi_lu/pxlu_kernels.c		\
+	examples/cholesky/mpi_cholesky.h	\
+	examples/cholesky/mpi_cholesky_models.h \
+	tests/helper.h
 
 
 pkgconfigdir = $(libdir)/pkgconfig
 pkgconfigdir = $(libdir)/pkgconfig
 pkgconfig_DATA = libstarpumpi.pc starpumpi-1.0.pc
 pkgconfig_DATA = libstarpumpi.pc starpumpi-1.0.pc
 
 
+examplebindir = $(libdir)/starpu/examples/mpi
+
+examplebin_PROGRAMS =
+
+if STARPU_USE_CUDA
+# TODO define NVCCFLAGS
+NVCC ?= nvcc
+
+NVCCFLAGS += -I$(top_srcdir)/include/ -I$(top_builddir)/include
+
+.cu.cubin:
+	$(MKDIR_P) `dirname $@`
+	$(NVCC) -cubin $< -o $@ --compiler-options -fno-strict-aliasing  $(NVCCFLAGS)
+
+.cu.o:
+	$(NVCC) $< -c -o $@ --compiler-options -fno-strict-aliasing  $(NVCCFLAGS) -I$(top_srcdir)/include/  -I$(top_builddir)/include/
+endif
+
+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS)
+LIBS = $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/mpi/ -I$(top_srcdir)/src/  -I$(top_srcdir)/examples/ -I$(top_builddir)/src -I$(top_builddir)/include
+AM_LDFLAGS = $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS)
+
+lib_LTLIBRARIES = libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+
+libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_LIBADD = $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la
+libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_LDFLAGS = $(ldflags) -no-undefined					\
+  -version-info $(LIBSTARPUMPI_INTERFACE_CURRENT):$(LIBSTARPUMPI_INTERFACE_REVISION):$(LIBSTARPUMPI_INTERFACE_AGE)
+
+noinst_HEADERS =					\
+	starpu_mpi_private.h				\
+	starpu_mpi_fxt.h				\
+	starpu_mpi_insert_task_cache.h
+
 versincludedir = $(includedir)/starpu/$(STARPU_EFFECTIVE_VERSION)
 versincludedir = $(includedir)/starpu/$(STARPU_EFFECTIVE_VERSION)
-versinclude_HEADERS = 					\
-	include/starpu_mpi.h
+versinclude_HEADERS = 				\
+	starpu_mpi.h					\
+	starpu_mpi_datatype.h
+
+libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_SOURCES =	\
+	starpu_mpi.c					\
+	starpu_mpi_helper.c				\
+	starpu_mpi_datatype.c				\
+	starpu_mpi_insert_task.c			\
+	starpu_mpi_insert_task_cache.c			\
+	starpu_mpi_collective.c
+
+###################
+# Stencil example #
+###################
+
+examplebin_PROGRAMS +=				\
+	examples/stencil/stencil5
+
+examples_stencil_stencil5_LDADD =		\
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+
+check_PROGRAMS	+=	\
+	examples/stencil/stencil5
+
+##################
+# MPI LU example #
+##################
+
+if !NO_BLAS_LIB
+
+examplebin_PROGRAMS += 				\
+	examples/mpi_lu/plu_example_float	\
+	examples/mpi_lu/plu_example_double
+
+examples_mpi_lu_plu_example_float_LDADD =	\
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la				\
+	$(STARPU_LIBNUMA_LDFLAGS)		\
+	$(STARPU_BLAS_LDFLAGS)
+
+examples_mpi_lu_plu_example_float_SOURCES =	\
+	examples/mpi_lu/plu_example_float.c	\
+	examples/mpi_lu/plu_solve_float.c	\
+	examples/mpi_lu/pslu_kernels.c		\
+	examples/mpi_lu/pslu.c			\
+	$(top_srcdir)/examples/common/blas.c
+
+examples_mpi_lu_plu_example_double_LDADD =	\
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la				\
+	$(STARPU_LIBNUMA_LDFLAGS)		\
+	$(STARPU_BLAS_LDFLAGS)
+
+examples_mpi_lu_plu_example_double_SOURCES =	\
+	examples/mpi_lu/plu_example_double.c	\
+	examples/mpi_lu/plu_solve_double.c  	\
+	examples/mpi_lu/pdlu_kernels.c	    	\
+	examples/mpi_lu/pdlu.c		    	\
+	$(top_srcdir)/examples/common/blas.c
+endif
+
+########################
+# MPI Cholesky example #
+########################
+
+if !NO_BLAS_LIB
+examplebin_PROGRAMS +=		\
+	examples/cholesky/mpi_cholesky			\
+	examples/cholesky/mpi_cholesky_distributed
+
+examples_cholesky_mpi_cholesky_SOURCES	=		\
+	examples/cholesky/mpi_cholesky.c		\
+	examples/cholesky/mpi_cholesky_models.c		\
+	examples/cholesky/mpi_cholesky_kernels.c	\
+	$(top_srcdir)/examples/common/blas.c
+
+examples_cholesky_mpi_cholesky_LDADD =			\
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la					\
+	$(STARPU_BLAS_LDFLAGS)
+
+examples_cholesky_mpi_cholesky_distributed_SOURCES =	\
+	examples/cholesky/mpi_cholesky_distributed.c	\
+	examples/cholesky/mpi_cholesky_models.c		\
+	examples/cholesky/mpi_cholesky_kernels.c	\
+	$(top_srcdir)/examples/common/blas.c
+
+examples_cholesky_mpi_cholesky_distributed_LDADD =	\
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la					\
+	$(STARPU_BLAS_LDFLAGS)
+
+check_PROGRAMS +=					\
+	examples/cholesky/mpi_cholesky			\
+	examples/cholesky/mpi_cholesky_distributed
+endif
+
+########################
+# Scatter Gather       #
+########################
+
+examplebin_PROGRAMS +=		\
+	examples/scatter_gather/mpi_scatter_gather
+
+examples_scatter_gather_mpi_scatter_gather_LDADD =	\
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+
+check_PROGRAMS +=		\
+	examples/scatter_gather/mpi_scatter_gather
+
+###################
+# Reduction       #
+###################
+
+examplebin_PROGRAMS +=		\
+	examples/reduction/mpi_reduction
+
+examples_reduction_mpi_reduction_SOURCES =		\
+	examples/reduction/mpi_reduction.c		\
+	examples/reduction/mpi_reduction_kernels.c
+
+examples_reduction_mpi_reduction_LDADD =	\
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+
+check_PROGRAMS +=		\
+	examples/reduction/mpi_reduction
+
+########################
+# Unit testcases       #
+########################
+
+check_PROGRAMS +=					\
+	tests/pingpong					\
+	tests/mpi_test					\
+	tests/mpi_isend					\
+	tests/mpi_irecv					\
+	tests/mpi_isend_detached			\
+	tests/mpi_irecv_detached			\
+	tests/mpi_detached_tag				\
+	tests/ring					\
+	tests/ring_async				\
+	tests/ring_async_implicit			\
+	tests/block_interface				\
+	tests/block_interface_pinned			\
+	tests/insert_task				\
+	tests/insert_task_cache				\
+	tests/insert_task_block				\
+	tests/insert_task_owner				\
+	tests/insert_task_owner2			\
+	tests/insert_task_owner_data			\
+	tests/multiple_send
+
+noinst_PROGRAMS =					\
+	tests/pingpong					\
+	tests/mpi_test					\
+	tests/mpi_isend					\
+	tests/mpi_irecv					\
+	tests/mpi_isend_detached			\
+	tests/mpi_irecv_detached			\
+	tests/mpi_detached_tag				\
+	tests/ring					\
+	tests/ring_async				\
+	tests/ring_async_implicit			\
+	tests/block_interface				\
+	tests/block_interface_pinned			\
+	tests/insert_task				\
+	tests/insert_task_cache				\
+	tests/insert_task_block				\
+	tests/insert_task_owner				\
+	tests/insert_task_owner2			\
+	tests/insert_task_owner_data			\
+	tests/multiple_send
+
+tests_mpi_isend_LDADD =					\
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+tests_mpi_irecv_LDADD =					\
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+tests_mpi_isend_detached_LDADD =			\
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+tests_mpi_irecv_detached_LDADD =			\
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+tests_mpi_detached_tag_LDADD =				\
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+tests_pingpong_LDADD =					\
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+tests_mpi_test_LDADD =					\
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+tests_ring_LDADD =					\
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+tests_ring_async_LDADD =				\
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+tests_ring_async_implicit_LDADD =			\
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+tests_block_interface_LDADD =				\
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+tests_block_interface_pinned_LDADD =			\
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+tests_insert_task_LDADD =				\
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+tests_insert_task_cache_LDADD =				\
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+tests_insert_task_block_LDADD =				\
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+tests_insert_task_owner_LDADD =				\
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+tests_insert_task_owner2_LDADD =			\
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+tests_insert_task_owner_data_LDADD =			\
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+tests_multiple_send_LDADD =				\
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+
+tests_ring_SOURCES = tests/ring.c
+tests_ring_async_SOURCES = tests/ring_async.c
+tests_ring_async_implicit_SOURCES = tests/ring_async_implicit.c
+if STARPU_USE_CUDA
+tests_ring_SOURCES += tests/ring_kernel.cu
+tests_ring_async_SOURCES += tests/ring_kernel.cu
+tests_ring_async_implicit_SOURCES += tests/ring_kernel.cu
+endif
 
 
 showcheck:
 showcheck:
-	for i in $(SUBDIRS) ; do \
-		make -C $$i showcheck ; \
-	done
+	-cat $(TEST_LOGS) /dev/null

+ 222 - 92
mpi/examples/cholesky/mpi_cholesky.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  *
@@ -19,13 +19,162 @@
 #include <starpu_mpi.h>
 #include <starpu_mpi.h>
 #include "mpi_cholesky.h"
 #include "mpi_cholesky.h"
 #include "mpi_cholesky_models.h"
 #include "mpi_cholesky_models.h"
-#include "mpi_cholesky_codelets.h"
+
+/*
+ *	Create the codelets
+ */
+
+static struct starpu_codelet cl11 =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
+#endif
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
+	.model = &chol_model_11
+};
+
+static struct starpu_codelet cl21 =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
+#endif
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_RW},
+	.model = &chol_model_21
+};
+
+static struct starpu_codelet cl22 =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
+#endif
+	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_R, STARPU_RW},
+	.model = &chol_model_22
+};
 
 
 /* Returns the MPI node number where data indexes index is */
 /* Returns the MPI node number where data indexes index is */
 int my_distrib(int x, int y, int nb_nodes)
 int my_distrib(int x, int y, int nb_nodes)
 {
 {
-	//return (x+y) % nb_nodes;
-	return (x%dblockx)+(y%dblocky)*dblockx;
+        return (x+y) % nb_nodes;
+}
+
+/*
+ *	code to bootstrap the factorization
+ *	and construct the DAG
+ */
+static void dw_cholesky(float ***matA, unsigned size, unsigned ld, unsigned nblocks, int rank, int nodes)
+{
+	struct timeval start;
+	struct timeval end;
+        starpu_data_handle_t **data_handles;
+        int x, y;
+
+	/* create all the DAG nodes */
+	unsigned i,j,k;
+
+        data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
+        for(x=0 ; x<nblocks ; x++) data_handles[x] = malloc(nblocks*sizeof(starpu_data_handle_t));
+
+        for(x = 0; x < nblocks ;  x++)
+	{
+                for (y = 0; y < nblocks; y++)
+		{
+                        int mpi_rank = my_distrib(x, y, nodes);
+                        if (mpi_rank == rank)
+			{
+                                //fprintf(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
+                                starpu_matrix_data_register(&data_handles[x][y], 0, (uintptr_t)matA[x][y],
+                                                            ld, size/nblocks, size/nblocks, sizeof(float));
+                        }
+			/* TODO: make better test to only registering what is needed */
+                        else
+			{
+                                /* I don't own that index, but will need it for my computations */
+                                //fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
+                                starpu_matrix_data_register(&data_handles[x][y], -1, (uintptr_t)NULL,
+                                                            ld, size/nblocks, size/nblocks, sizeof(float));
+                        }
+                        if (data_handles[x][y])
+			{
+                                starpu_data_set_rank(data_handles[x][y], mpi_rank);
+                                starpu_data_set_tag(data_handles[x][y], (y*nblocks)+x);
+			}
+                }
+        }
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+	gettimeofday(&start, NULL);
+
+	for (k = 0; k < nblocks; k++)
+        {
+                int prio = STARPU_DEFAULT_PRIO;
+                if (!noprio) prio = STARPU_MAX_PRIO;
+
+                starpu_mpi_insert_task(MPI_COMM_WORLD, &cl11,
+                                       STARPU_PRIORITY, prio,
+                                       STARPU_RW, data_handles[k][k],
+                                       0);
+
+		for (j = k+1; j<nblocks; j++)
+		{
+                        prio = STARPU_DEFAULT_PRIO;
+                        if (!noprio&& (j == k+1)) prio = STARPU_MAX_PRIO;
+                        starpu_mpi_insert_task(MPI_COMM_WORLD, &cl21,
+                                               STARPU_PRIORITY, prio,
+                                               STARPU_R, data_handles[k][k],
+                                               STARPU_RW, data_handles[k][j],
+                                               0);
+
+			for (i = k+1; i<nblocks; i++)
+			{
+				if (i <= j)
+                                {
+                                        prio = STARPU_DEFAULT_PRIO;
+                                        if (!noprio && (i == k + 1) && (j == k +1) ) prio = STARPU_MAX_PRIO;
+                                        starpu_mpi_insert_task(MPI_COMM_WORLD, &cl22,
+                                                               STARPU_PRIORITY, prio,
+                                                               STARPU_R, data_handles[k][i],
+                                                               STARPU_R, data_handles[k][j],
+                                                               STARPU_RW, data_handles[i][j],
+                                                               0);
+                                }
+			}
+		}
+        }
+
+        starpu_task_wait_for_all();
+
+        for(x = 0; x < nblocks ;  x++)
+	{
+                for (y = 0; y < nblocks; y++)
+		{
+                        if (data_handles[x][y])
+                                starpu_data_unregister(data_handles[x][y]);
+                }
+		free(data_handles[x]);
+        }
+	free(data_handles);
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+	gettimeofday(&end, NULL);
+
+	if (rank == 0)
+	{
+		double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+		fprintf(stderr, "Computation took (in ms)\n");
+		fprintf(stdout, "%2.2f\n", timing/1000);
+
+		double flop = (1.0f*size*size*size)/3.0f;
+		fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+	}
 }
 }
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
@@ -36,45 +185,35 @@ int main(int argc, char **argv)
 	 * */
 	 * */
 
 
 	float ***bmat;
 	float ***bmat;
-	int rank, nodes, ret;
+        int rank, nodes;
 
 
 	parse_args(argc, argv);
 	parse_args(argc, argv);
 
 
-	ret = starpu_init(NULL);
+	struct starpu_conf conf;
+	starpu_conf_init(&conf);
+
+	conf.sched_policy_name = "heft";
+	conf.calibrate = 1;
+
+	int ret = starpu_init(&conf);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 
 	starpu_mpi_initialize_extended(&rank, &nodes);
 	starpu_mpi_initialize_extended(&rank, &nodes);
 	starpu_helper_cublas_init();
 	starpu_helper_cublas_init();
 
 
-	if (dblockx == -1 || dblocky == -1)
-	{
-	     int factor;
-	     dblockx = nodes;
-	     dblocky = 1;
-	     for(factor=sqrt(nodes) ; factor>1 ; factor--)
-	     {
-		  if (nodes % factor == 0)
-		  {
-		       dblockx = nodes/factor;
-		       dblocky = factor;
-		       break;
-		  }
-	     }
-	}
-
 	unsigned i,j,x,y;
 	unsigned i,j,x,y;
-	bmat = malloc(nblocks * sizeof(float *));
-	for(x=0 ; x<nblocks ; x++)
+        bmat = malloc(nblocks * sizeof(float *));
+        for(x=0 ; x<nblocks ; x++)
 	{
 	{
-		bmat[x] = malloc(nblocks * sizeof(float *));
-		for(y=0 ; y<nblocks ; y++)
+                bmat[x] = malloc(nblocks * sizeof(float *));
+                for(y=0 ; y<nblocks ; y++)
 		{
 		{
-			starpu_malloc((void **)&bmat[x][y], BLOCKSIZE*BLOCKSIZE*sizeof(float));
+                        starpu_malloc((void **)&bmat[x][y], BLOCKSIZE*BLOCKSIZE*sizeof(float));
 			for (i = 0; i < BLOCKSIZE; i++)
 			for (i = 0; i < BLOCKSIZE; i++)
 			{
 			{
 				for (j = 0; j < BLOCKSIZE; j++)
 				for (j = 0; j < BLOCKSIZE; j++)
 				{
 				{
-					bmat[x][y][j +i*BLOCKSIZE] = (1.0f/(1.0f+(i+(x*BLOCKSIZE)+j+(y*BLOCKSIZE)))) + ((i+(x*BLOCKSIZE) == j+(y*BLOCKSIZE))?1.0f*size:0.0f);
+                                        bmat[x][y][j +i*BLOCKSIZE] = (1.0f/(1.0f+(i+(x*BLOCKSIZE)+j+(y*BLOCKSIZE)))) + ((i+(x*BLOCKSIZE) == j+(y*BLOCKSIZE))?1.0f*size:0.0f);
 					//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
 					//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
 				}
 				}
 			}
 			}
@@ -82,15 +221,15 @@ int main(int argc, char **argv)
 	}
 	}
 
 
 
 
-	if (display)
+        if (display)
 	{
 	{
-		printf("[%d] Input :\n", rank);
+                printf("[%d] Input :\n", rank);
 
 
 		for(y=0 ; y<nblocks ; y++)
 		for(y=0 ; y<nblocks ; y++)
 		{
 		{
 			for(x=0 ; x<nblocks ; x++)
 			for(x=0 ; x<nblocks ; x++)
 			{
 			{
-				printf("Block %u,%u :\n", x, y);
+                                printf("Block %d,%d :\n", x, y);
 				for (j = 0; j < BLOCKSIZE; j++)
 				for (j = 0; j < BLOCKSIZE; j++)
 				{
 				{
 					for (i = 0; i < BLOCKSIZE; i++)
 					for (i = 0; i < BLOCKSIZE; i++)
@@ -110,19 +249,18 @@ int main(int argc, char **argv)
 		}
 		}
 	}
 	}
 
 
-	double timing, flops;
-	dw_cholesky(bmat, size, size/nblocks, nblocks, rank, nodes, &timing, &flops);
+	dw_cholesky(bmat, size, size/nblocks, nblocks, rank, nodes);
 
 
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
 
 
-	if (display)
+        if (display)
 	{
 	{
-		printf("[%d] Results :\n", rank);
+                printf("[%d] Results :\n", rank);
 		for(y=0 ; y<nblocks ; y++)
 		for(y=0 ; y<nblocks ; y++)
 		{
 		{
 			for(x=0 ; x<nblocks ; x++)
 			for(x=0 ; x<nblocks ; x++)
 			{
 			{
-				printf("Block %u,%u :\n", x, y);
+                                printf("Block %d,%d :\n", x, y);
 				for (j = 0; j < BLOCKSIZE; j++)
 				for (j = 0; j < BLOCKSIZE; j++)
 				{
 				{
 					for (i = 0; i < BLOCKSIZE; i++)
 					for (i = 0; i < BLOCKSIZE; i++)
@@ -143,19 +281,19 @@ int main(int argc, char **argv)
 	}
 	}
 
 
 	float *rmat = malloc(size*size*sizeof(float));
 	float *rmat = malloc(size*size*sizeof(float));
-	for(x=0 ; x<nblocks ; x++)
+        for(x=0 ; x<nblocks ; x++)
 	{
 	{
-		for(y=0 ; y<nblocks ; y++)
+                for(y=0 ; y<nblocks ; y++)
 		{
 		{
-			for (i = 0; i < BLOCKSIZE; i++)
+                        for (i = 0; i < BLOCKSIZE; i++)
 			{
 			{
-				for (j = 0; j < BLOCKSIZE; j++)
+                                for (j = 0; j < BLOCKSIZE; j++)
 				{
 				{
-					rmat[j+(y*BLOCKSIZE)+(i+(x*BLOCKSIZE))*size] = bmat[x][y][j +i*BLOCKSIZE];
-				}
-			}
-		}
-	}
+                                        rmat[j+(y*BLOCKSIZE)+(i+(x*BLOCKSIZE))*size] = bmat[x][y][j +i*BLOCKSIZE];
+                                }
+                        }
+                }
+        }
 
 
 	fprintf(stderr, "[%d] compute explicit LLt ...\n", rank);
 	fprintf(stderr, "[%d] compute explicit LLt ...\n", rank);
 	for (j = 0; j < size; j++)
 	for (j = 0; j < size; j++)
@@ -172,63 +310,62 @@ int main(int argc, char **argv)
 	STARPU_ASSERT(test_mat);
 	STARPU_ASSERT(test_mat);
 
 
 	SSYRK("L", "N", size, size, 1.0f,
 	SSYRK("L", "N", size, size, 1.0f,
-			rmat, size, 0.0f, test_mat, size);
+				rmat, size, 0.0f, test_mat, size);
 
 
 	fprintf(stderr, "[%d] comparing results ...\n", rank);
 	fprintf(stderr, "[%d] comparing results ...\n", rank);
-	if (display)
+        if (display)
 	{
 	{
-		for (j = 0; j < size; j++)
+                for (j = 0; j < size; j++)
 		{
 		{
-			for (i = 0; i < size; i++)
+                        for (i = 0; i < size; i++)
 			{
 			{
-				if (i <= j)
+                                if (i <= j)
 				{
 				{
-					printf("%2.2f\t", test_mat[j +i*size]);
-				}
-				else
+                                        printf("%2.2f\t", test_mat[j +i*size]);
+                                }
+                                else
 				{
 				{
-					printf(".\t");
-				}
-			}
-			printf("\n");
-		}
-	}
+                                        printf(".\t");
+                                }
+                        }
+                        printf("\n");
+                }
+        }
 
 
 	int correctness = 1;
 	int correctness = 1;
-	for(x = 0; x < nblocks ;  x++)
+        for(x = 0; x < nblocks ;  x++)
 	{
 	{
-		for (y = 0; y < nblocks; y++)
+                for (y = 0; y < nblocks; y++)
 		{
 		{
-			int mpi_rank = my_distrib(x, y, nodes);
-			if (mpi_rank == rank)
+                        int mpi_rank = my_distrib(x, y, nodes);
+                        if (mpi_rank == rank)
 			{
 			{
-				for (i = (size/nblocks)*x ; i < (size/nblocks)*x+(size/nblocks); i++)
-				{
-					for (j = (size/nblocks)*y ; j < (size/nblocks)*y+(size/nblocks); j++)
-					{
-						if (i <= j)
-						{
-							float orig = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
-							float err = abs(test_mat[j +i*size] - orig);
-							if (err > 0.00001)
+                                for (i = (size/nblocks)*x ; i < (size/nblocks)*x+(size/nblocks); i++)
+                                {
+                                        for (j = (size/nblocks)*y ; j < (size/nblocks)*y+(size/nblocks); j++)
+                                        {
+                                                if (i <= j)
+                                                {
+                                                        float orig = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
+                                                        float err = abs(test_mat[j +i*size] - orig);
+                                                        if (err > 0.00001)
 							{
 							{
-								fprintf(stderr, "[%d] Error[%u, %u] --> %2.2f != %2.2f (err %2.2f)\n", rank, i, j, test_mat[j +i*size], orig, err);
+                                                                fprintf(stderr, "[%d] Error[%d, %d] --> %2.2f != %2.2f (err %2.2f)\n", rank, i, j, test_mat[j +i*size], orig, err);
 								correctness = 0;
 								correctness = 0;
-								flops = 0;
 								break;
 								break;
-							}
-						}
-					}
-				}
-			}
-		}
-	}
+                                                        }
+                                                }
+                                        }
+                                }
+                        }
+                }
+        }
 
 
-	for(x=0 ; x<nblocks ; x++)
+        for(x=0 ; x<nblocks ; x++)
 	{
 	{
-		for(y=0 ; y<nblocks ; y++)
+                for(y=0 ; y<nblocks ; y++)
 		{
 		{
-			starpu_free((void *)bmat[x][y]);
+                        starpu_free((void *)bmat[x][y]);
 		}
 		}
 		free(bmat[x]);
 		free(bmat[x]);
 	}
 	}
@@ -236,16 +373,9 @@ int main(int argc, char **argv)
 	free(rmat);
 	free(rmat);
 	free(test_mat);
 	free(test_mat);
 
 
-	starpu_helper_cublas_shutdown();
+        starpu_helper_cublas_shutdown();
 	starpu_shutdown();
 	starpu_shutdown();
 
 
 	assert(correctness);
 	assert(correctness);
-
-	if (rank == 0)
-	{
-		fprintf(stdout, "Computation time (in ms): %2.2f\n", timing/1000);
-		fprintf(stdout, "Synthetic GFlops : %2.2f\n", (flops/timing/1000.0f));
-	}
-
 	return 0;
 	return 0;
 }
 }

+ 5 - 19
mpi/examples/cholesky/mpi_cholesky.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -15,8 +15,8 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
  */
 
 
-#ifndef __MPI_CHOLESKY_H__
-#define __MPI_CHOLESKY_H__
+#ifndef __DW_CHOLESKY_H__
+#define __DW_CHOLESKY_H__
 
 
 #include <string.h>
 #include <string.h>
 #include <math.h>
 #include <math.h>
@@ -34,11 +34,9 @@
 
 
 static unsigned size = 4*1024;
 static unsigned size = 4*1024;
 static unsigned nblocks = 16;
 static unsigned nblocks = 16;
-static unsigned nbigblocks = 2;
+static unsigned nbigblocks = 8;
 static unsigned noprio = 0;
 static unsigned noprio = 0;
 static unsigned display = 0;
 static unsigned display = 0;
-static unsigned dblockx = -1;
-static unsigned dblocky = -1;
 
 
 void chol_cpu_codelet_update_u11(void **, void *);
 void chol_cpu_codelet_update_u11(void **, void *);
 void chol_cpu_codelet_update_u21(void **, void *);
 void chol_cpu_codelet_update_u21(void **, void *);
@@ -61,18 +59,6 @@ static void __attribute__((unused)) parse_args(int argc, char **argv)
 			size = strtol(argv[++i], &argptr, 10);
 			size = strtol(argv[++i], &argptr, 10);
 		}
 		}
 
 
-		if (strcmp(argv[i], "-dblockx") == 0)
-		{
-		        char *argptr;
-			dblockx = strtol(argv[++i], &argptr, 10);
-		}
-		
-		if (strcmp(argv[i], "-dblocky") == 0)
-		{
-		        char *argptr;
-			dblocky = strtol(argv[++i], &argptr, 10);
-		}
-	
 		if (strcmp(argv[i], "-nblocks") == 0)
 		if (strcmp(argv[i], "-nblocks") == 0)
 		{
 		{
 		        char *argptr;
 		        char *argptr;
@@ -103,4 +89,4 @@ static void __attribute__((unused)) parse_args(int argc, char **argv)
 	if (nblocks > size) nblocks = size;
 	if (nblocks > size) nblocks = size;
 }
 }
 
 
-#endif // __MPI_CHOLESKY_H__
+#endif // __DW_CHOLESKY_H__

+ 10 - 11
mpi/examples/cholesky/mpi_cholesky_kernels.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010, 2012  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -15,7 +15,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
  */
 
 
-#include <starpu.h>
+#include <starpu_config.h>
 #include "mpi_cholesky.h"
 #include "mpi_cholesky.h"
 #include "common/blas.h"
 #include "common/blas.h"
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
@@ -63,10 +63,9 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, __at
 					-1.0f, left, ld21, right, ld12,
 					-1.0f, left, ld21, right, ld12,
 					 1.0f, center, ld22);
 					 1.0f, center, ld22);
 			st = cublasGetError();
 			st = cublasGetError();
-			if (STARPU_UNLIKELY(st != CUBLAS_STATUS_SUCCESS))
-				STARPU_CUBLAS_REPORT_ERROR(st);
+			STARPU_ASSERT(!st);
 
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			cudaThreadSynchronize();
 
 
 			break;
 			break;
 #endif
 #endif
@@ -115,7 +114,7 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, __attrib
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		case 1:
 		case 1:
 			cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
 			cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			cudaThreadSynchronize();
 			break;
 			break;
 #endif
 #endif
 		default:
 		default:
@@ -189,15 +188,15 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, __attrib
 					fprintf(stderr, "Error in Magma: %d\n", ret);
 					fprintf(stderr, "Error in Magma: %d\n", ret);
 					STARPU_ABORT();
 					STARPU_ABORT();
 				}
 				}
-				cudaError_t cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
+				cudaError_t cures = cudaThreadSynchronize();
 				STARPU_ASSERT(!cures);
 				STARPU_ASSERT(!cures);
 			}
 			}
 #else
 #else
 			for (z = 0; z < nx; z++)
 			for (z = 0; z < nx; z++)
 			{
 			{
 				float lambda11;
 				float lambda11;
-				cudaMemcpyAsync(&lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
-				cudaStreamSynchronize(starpu_cuda_get_local_stream());
+				cudaMemcpy(&lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost);
+				cudaStreamSynchronize(0);
 
 
 				STARPU_ASSERT(lambda11 != 0.0f);
 				STARPU_ASSERT(lambda11 != 0.0f);
 
 
@@ -212,7 +211,7 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, __attrib
 							&sub11[(z+1)+(z+1)*ld], ld);
 							&sub11[(z+1)+(z+1)*ld], ld);
 			}
 			}
 
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			cudaThreadSynchronize();
 #endif
 #endif
 			break;
 			break;
 #endif
 #endif

+ 10 - 10
mpi/examples/mpi_lu/pxlu_kernels.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010, 2012  Université de Bordeaux 1
+ * Copyright (C) 2010  Université de Bordeaux 1
  * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -68,9 +68,9 @@ static inline void STARPU_PLU(common_u22)(void *descr[],
 
 
 			status = cublasGetError();
 			status = cublasGetError();
 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
-				STARPU_CUBLAS_REPORT_ERROR(status);
+				STARPU_ABORT();
 
 
-			if (STARPU_UNLIKELY((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess))
+			if (STARPU_UNLIKELY((cures = cudaThreadSynchronize()) != cudaSuccess))
 				STARPU_CUDA_REPORT_ERROR(cures);
 				STARPU_CUDA_REPORT_ERROR(cures);
 
 
 			break;
 			break;
@@ -173,9 +173,9 @@ static inline void STARPU_PLU(common_u12)(void *descr[],
 
 
 			status = cublasGetError();
 			status = cublasGetError();
 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
-				STARPU_CUBLAS_REPORT_ERROR(status);
+				STARPU_ABORT();
 
 
-			if (STARPU_UNLIKELY((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess))
+			if (STARPU_UNLIKELY((cures = cudaThreadSynchronize()) != cudaSuccess))
 				STARPU_CUDA_REPORT_ERROR(cures);
 				STARPU_CUDA_REPORT_ERROR(cures);
 
 
 			break;
 			break;
@@ -280,9 +280,9 @@ static inline void STARPU_PLU(common_u21)(void *descr[],
 
 
 			status = cublasGetError();
 			status = cublasGetError();
 			if (status != CUBLAS_STATUS_SUCCESS)
 			if (status != CUBLAS_STATUS_SUCCESS)
-				STARPU_CUBLAS_REPORT_ERROR(status);
+				STARPU_ABORT();
 
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			cudaThreadSynchronize();
 
 
 			break;
 			break;
 #endif
 #endif
@@ -381,8 +381,8 @@ static inline void STARPU_PLU(common_u11)(void *descr[],
 			for (z = 0; z < nx; z++)
 			for (z = 0; z < nx; z++)
 			{
 			{
 				TYPE pivot;
 				TYPE pivot;
-				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
-				cudaStreamSynchronize(starpu_cuda_get_local_stream());
+				cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost);
+				cudaStreamSynchronize(0);
 
 
 				STARPU_ASSERT(pivot != 0.0);
 				STARPU_ASSERT(pivot != 0.0);
 				
 				
@@ -394,7 +394,7 @@ static inline void STARPU_PLU(common_u11)(void *descr[],
 						&sub11[(z+1) + (z+1)*ld],ld);
 						&sub11[(z+1) + (z+1)*ld],ld);
 			}
 			}
 			
 			
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			cudaThreadSynchronize();
 
 
 			break;
 			break;
 #endif
 #endif

+ 1 - 1
mpi/examples/stencil/stencil5.c

@@ -149,7 +149,7 @@ int main(int argc, char **argv)
                         fprintf(stdout, "[%d] ", my_rank);
                         fprintf(stdout, "[%d] ", my_rank);
                         for (y = 0; y < Y; y++)
                         for (y = 0; y < Y; y++)
 			{
 			{
-                                fprintf(stdout, "%3u ", matrix[x][y]);
+                                fprintf(stdout, "%3d ", matrix[x][y]);
                         }
                         }
                         fprintf(stdout, "\n");
                         fprintf(stdout, "\n");
                 }
                 }

+ 78 - 0
mpi/starpu_mpi_collective.c

@@ -0,0 +1,78 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <mpi.h>
+#include <starpu.h>
+#include <starpu_mpi.h>
+
+int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm)
+{
+	int rank;
+	int x;
+
+	MPI_Comm_rank(comm, &rank);
+
+	for(x = 0; x < count ;  x++)
+	{
+		if (data_handles[x])
+		{
+			int owner = starpu_data_get_rank(data_handles[x]);
+			int mpi_tag = starpu_data_get_tag(data_handles[x]);
+			STARPU_ASSERT(mpi_tag >= 0);
+			if ((rank == root) && (owner != root))
+			{
+				//fprintf(stderr, "[%d] Sending data[%d] to %d\n", rank, x, owner);
+				starpu_mpi_isend_detached(data_handles[x], owner, mpi_tag, comm, NULL, NULL);
+			}
+			if ((rank != root) && (owner == rank))
+			{
+				//fprintf(stderr, "[%d] Receiving data[%d] from %d\n", rank, x, root);
+				starpu_mpi_irecv_detached(data_handles[x], root, mpi_tag, comm, NULL, NULL);
+			}
+		}
+	}
+	return 0;
+}
+
+int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm)
+{
+	int rank;
+	int x;
+
+	MPI_Comm_rank(comm, &rank);
+
+	for(x = 0; x < count ;  x++)
+	{
+		if (data_handles[x])
+		{
+			int owner = starpu_data_get_rank(data_handles[x]);
+			int mpi_tag = starpu_data_get_tag(data_handles[x]);
+			STARPU_ASSERT(mpi_tag >= 0);
+			if ((rank == root) && (owner != root))
+			{
+				//fprintf(stderr, "[%d] Receiving data[%d] from %d\n", rank, x, owner);
+				starpu_mpi_irecv_detached(data_handles[x], owner, mpi_tag, comm, NULL, NULL);
+			}
+			if ((rank != root) && (owner == rank))
+			{
+				//fprintf(stderr, "[%d] Sending data[%d] to %d\n", rank, x, root);
+				starpu_mpi_isend_detached(data_handles[x], root, mpi_tag, comm, NULL, NULL);
+			}
+		}
+	}
+	return 0;
+}
+

+ 95 - 0
mpi/starpu_mpi_insert_task_cache.c

@@ -0,0 +1,95 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi_private.h>
+#include <starpu_mpi_insert_task_cache.h>
+#include <starpu_hash.h>
+#include <common/htable32.h>
+
+typedef struct _starpu_mpi_clear_cache_s {
+        starpu_data_handle_t data;
+        int rank;
+        int mode;
+} _starpu_mpi_clear_cache_t;
+
+struct starpu_htbl32_node **sent_data = NULL;
+struct starpu_htbl32_node **received_data = NULL;
+
+void _starpu_mpi_clear_cache_callback(void *callback_arg)
+{
+        _starpu_mpi_clear_cache_t *clear_cache = (_starpu_mpi_clear_cache_t *)callback_arg;
+        uint32_t key = starpu_crc32_be((uintptr_t)clear_cache->data, 0);
+
+        if (clear_cache->mode == _STARPU_MPI_CLEAR_SENT_DATA) {
+                _STARPU_MPI_DEBUG("Clearing sent cache for data %p and rank %d\n", clear_cache->data, clear_cache->rank);
+                _starpu_htbl_insert_32(&sent_data[clear_cache->rank], key, NULL);
+        }
+        else if (clear_cache->mode == _STARPU_MPI_CLEAR_RECEIVED_DATA) {
+                _STARPU_MPI_DEBUG("Clearing received cache for data %p and rank %d\n", clear_cache->data, clear_cache->rank);
+                _starpu_htbl_insert_32(&received_data[clear_cache->rank], key, NULL);
+        }
+
+        free(clear_cache);
+}
+
+double _starpu_mpi_clear_cache_cost_function(struct starpu_task *task, unsigned nimpl)
+{
+	return 0;
+}
+
+static struct starpu_perfmodel _starpu_mpi_clear_cache_model =
+{
+	.cost_function = _starpu_mpi_clear_cache_cost_function,
+	.type = STARPU_COMMON,
+};
+
+static void _starpu_mpi_clear_cache_func(void *descr[] __attribute__ ((unused)), void *arg __attribute__ ((unused)))
+{
+}
+
+static struct starpu_codelet _starpu_mpi_clear_cache_codelet =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+	.cpu_funcs = {_starpu_mpi_clear_cache_func, NULL},
+	.cuda_funcs = {_starpu_mpi_clear_cache_func, NULL},
+	.opencl_funcs = {_starpu_mpi_clear_cache_func, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
+	.model = &_starpu_mpi_clear_cache_model
+	// The model has a cost function which returns 0 so as to allow the codelet to be scheduled anywhere
+};
+
+void _starpu_mpi_clear_cache_request(starpu_data_handle_t data_handle, int rank, int mode)
+{
+        struct starpu_task *task = starpu_task_create();
+
+	// We have a codelet with a empty function just to force the
+	// task being created to have a dependency on data_handle
+        task->cl = &_starpu_mpi_clear_cache_codelet;
+        task->handles[0] = data_handle;
+
+        _starpu_mpi_clear_cache_t *clear_cache = malloc(sizeof(_starpu_mpi_clear_cache_t));
+        clear_cache->data = data_handle;
+        clear_cache->rank = rank;
+        clear_cache->mode = mode;
+
+        task->callback_func = _starpu_mpi_clear_cache_callback;
+        task->callback_arg = clear_cache;
+        int ret = starpu_task_submit(task);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+}
+

+ 26 - 0
mpi/starpu_mpi_insert_task_cache.h

@@ -0,0 +1,26 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
+#define _STARPU_MPI_CLEAR_SENT_DATA     0
+#define _STARPU_MPI_CLEAR_RECEIVED_DATA 1
+
+extern struct starpu_htbl32_node **sent_data;
+extern struct starpu_htbl32_node **received_data;
+
+void _starpu_mpi_clear_cache_request(starpu_data_handle_t data_handle, int rank, int mode);

+ 1 - 1
mpi/tests/insert_task.c

@@ -23,7 +23,7 @@ void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 	unsigned *x = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned *x = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned *y = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
 	unsigned *y = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
 
 
-        FPRINTF(stdout, "VALUES: %u %u\n", *x, *y);
+        FPRINTF(stdout, "VALUES: %d %d\n", *x, *y);
         *x = (*x + *y) / 2;
         *x = (*x + *y) / 2;
 }
 }
 
 

+ 2 - 2
mpi/tests/insert_task_block.c

@@ -86,7 +86,7 @@ int main(int argc, char **argv)
         for(x = 0; x < SIZE; x++) {
         for(x = 0; x < SIZE; x++) {
                 FPRINTF(stdout, "[%d] ", rank);
                 FPRINTF(stdout, "[%d] ", rank);
                 for (y = 0; y < SIZE; y++) {
                 for (y = 0; y < SIZE; y++) {
-                        FPRINTF(stdout, "%3u ", matrix[x+y*SIZE]);
+                        FPRINTF(stdout, "%3d ", matrix[x+y*SIZE]);
                 }
                 }
                 FPRINTF(stdout, "\n");
                 FPRINTF(stdout, "\n");
         }
         }
@@ -155,7 +155,7 @@ int main(int argc, char **argv)
 	{
 	{
                 FPRINTF(stdout, "[%d] ", rank);
                 FPRINTF(stdout, "[%d] ", rank);
                 for (y = 0; y < SIZE; y++) {
                 for (y = 0; y < SIZE; y++) {
-                        FPRINTF(stdout, "%3u ", matrix[x+y*SIZE]);
+                        FPRINTF(stdout, "%3d ", matrix[x+y*SIZE]);
                 }
                 }
                 FPRINTF(stdout, "\n");
                 FPRINTF(stdout, "\n");
         }
         }

+ 3 - 10
mpi/tests/insert_task_cache.c

@@ -23,7 +23,7 @@ void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 	unsigned *x = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned *x = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned *y = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
 	unsigned *y = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
 
 
-        FPRINTF(stdout, "VALUES: %u %u\n", *x, *y);
+        FPRINTF(stdout, "VALUES: %d %d\n", *x, *y);
         *x = (*x + *y) / 2;
         *x = (*x + *y) / 2;
 }
 }
 
 
@@ -71,7 +71,7 @@ int main(int argc, char **argv)
                 FPRINTF(stdout, "[%d] ", rank);
                 FPRINTF(stdout, "[%d] ", rank);
                 for (y = 0; y < Y; y++)
                 for (y = 0; y < Y; y++)
 		{
 		{
-                        FPRINTF(stdout, "%3u ", matrix[x][y]);
+                        FPRINTF(stdout, "%3d ", matrix[x][y]);
                 }
                 }
                 FPRINTF(stdout, "\n");
                 FPRINTF(stdout, "\n");
         }
         }
@@ -106,19 +106,12 @@ int main(int argc, char **argv)
                 }
                 }
         }
         }
 
 
-	mycodelet.name = "codelet1";
         ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1][1], STARPU_R, data_handles[0][1], 0);
         ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1][1], STARPU_R, data_handles[0][1], 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
-
-	mycodelet.name = "codelet2";
         ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
         ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
-
-	mycodelet.name = "codelet3";
         ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0][1], STARPU_R, data_handles[0][0], 0);
         ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0][1], STARPU_R, data_handles[0][0], 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
-
-	mycodelet.name = "codelet4";
         ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
         ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
 
 
@@ -142,7 +135,7 @@ int main(int argc, char **argv)
                 FPRINTF(stdout, "[%d] ", rank);
                 FPRINTF(stdout, "[%d] ", rank);
                 for (y = 0; y < Y; y++)
                 for (y = 0; y < Y; y++)
 		{
 		{
-                        FPRINTF(stdout, "%3u ", matrix[x][y]);
+                        FPRINTF(stdout, "%3d ", matrix[x][y]);
                 }
                 }
                 FPRINTF(stdout, "\n");
                 FPRINTF(stdout, "\n");
         }
         }

+ 2 - 1
mpi/tests/insert_task_owner2.c

@@ -15,6 +15,7 @@
  */
  */
 
 
 #include <starpu_mpi.h>
 #include <starpu_mpi.h>
+#include <starpu_mpi_datatype.h>
 #include <math.h>
 #include <math.h>
 #include "helper.h"
 #include "helper.h"
 
 
@@ -106,7 +107,7 @@ int main(int argc, char **argv)
                 starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, data_handles[i], 0, NULL, NULL);
                 starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, data_handles[i], 0, NULL, NULL);
 		if (rank == 0) {
 		if (rank == 0) {
 			starpu_data_acquire(data_handles[i], STARPU_R);
 			starpu_data_acquire(data_handles[i], STARPU_R);
-			values[i] = *((int *)starpu_handle_get_local_ptr(data_handles[i]));
+			values[i] = *((int *)starpu_mpi_handle_to_ptr(data_handles[i]));
 		}
 		}
         }
         }
         FPRINTF(stderr, "[%d][local ptr] VALUES: %d %d %d %d\n", rank, values[0], values[1], values[2], values[3]);
         FPRINTF(stderr, "[%d][local ptr] VALUES: %d %d %d %d\n", rank, values[0], values[1], values[2], values[3]);

+ 2 - 2
mpi/tests/ring.c

@@ -93,7 +93,7 @@ int main(int argc, char **argv)
 		if (loop == 0 && rank == 0)
 		if (loop == 0 && rank == 0)
 		{
 		{
 			token = 0;
 			token = 0;
-			FPRINTF(stdout, "Start with token value %u\n", token);
+			FPRINTF(stdout, "Start with token value %d\n", token);
 		}
 		}
 		else
 		else
 		{
 		{
@@ -106,7 +106,7 @@ int main(int argc, char **argv)
 		if (loop == last_loop && rank == last_rank)
 		if (loop == last_loop && rank == last_rank)
 		{
 		{
 			starpu_data_acquire(token_handle, STARPU_R);
 			starpu_data_acquire(token_handle, STARPU_R);
-			FPRINTF(stdout, "Finished : token value %u\n", token);
+			FPRINTF(stdout, "Finished : token value %d\n", token);
 			starpu_data_release(token_handle);
 			starpu_data_release(token_handle);
 		}
 		}
 		else
 		else

+ 2 - 2
mpi/tests/ring_async.c

@@ -93,7 +93,7 @@ int main(int argc, char **argv)
 		if (loop == 0 && rank == 0)
 		if (loop == 0 && rank == 0)
 		{
 		{
 			token = 0;
 			token = 0;
-			FPRINTF(stdout, "Start with token value %u\n", token);
+			FPRINTF(stdout, "Start with token value %d\n", token);
 		}
 		}
 		else
 		else
 		{
 		{
@@ -108,7 +108,7 @@ int main(int argc, char **argv)
 		if (loop == last_loop && rank == last_rank)
 		if (loop == last_loop && rank == last_rank)
 		{
 		{
 			starpu_data_acquire(token_handle, STARPU_R);
 			starpu_data_acquire(token_handle, STARPU_R);
-			FPRINTF(stdout, "Finished : token value %u\n", token);
+			FPRINTF(stdout, "Finished : token value %d\n", token);
 			starpu_data_release(token_handle);
 			starpu_data_release(token_handle);
 		}
 		}
 		else {
 		else {

+ 3 - 3
mpi/tests/ring_async_implicit.c

@@ -95,7 +95,7 @@ int main(int argc, char **argv)
 		if (loop == 0 && rank == 0)
 		if (loop == 0 && rank == 0)
 		{
 		{
 			token = 0;
 			token = 0;
-			FPRINTF(stdout, "Start with token value %u\n", token);
+			FPRINTF(stdout, "Start with token value %d\n", token);
 		}
 		}
 		else
 		else
 		{
 		{
@@ -107,7 +107,7 @@ int main(int argc, char **argv)
 		if (loop == last_loop && rank == last_rank)
 		if (loop == last_loop && rank == last_rank)
 		{
 		{
 			starpu_data_acquire(token_handle, STARPU_R);
 			starpu_data_acquire(token_handle, STARPU_R);
-			FPRINTF(stdout, "Finished : token value %u\n", token);
+			FPRINTF(stdout, "Finished : token value %d\n", token);
 			starpu_data_release(token_handle);
 			starpu_data_release(token_handle);
 		}
 		}
 		else
 		else
@@ -125,7 +125,7 @@ int main(int argc, char **argv)
 
 
 	if (rank == last_rank)
 	if (rank == last_rank)
 	{
 	{
-                FPRINTF(stderr, "[%d] token = %u == %u * %d ?\n", rank, token, nloops, size);
+                FPRINTF(stderr, "[%d] token = %d == %d * %d ?\n", rank, token, nloops, size);
                 STARPU_ASSERT(token == nloops*size);
                 STARPU_ASSERT(token == nloops*size);
 	}
 	}