Andra Hugo committed 12 years ago
commit 1967c51c43
71 changed files with 9388 additions and 0 deletions
  1. mpi/.gitignore  +1 -0
  2. mpi/Makefile.am  +29 -0
  3. mpi/examples/Makefile.am  +206 -0
  4. mpi/examples/cholesky/mpi_cholesky.c  +251 -0
  5. mpi/examples/cholesky/mpi_cholesky.h  +106 -0
  6. mpi/examples/cholesky/mpi_cholesky_codelets.c  +174 -0
  7. mpi/examples/cholesky/mpi_cholesky_codelets.h  +27 -0
  8. mpi/examples/cholesky/mpi_cholesky_distributed.c  +117 -0
  9. mpi/examples/cholesky/mpi_cholesky_kernels.c  +236 -0
  10. mpi/examples/cholesky/mpi_cholesky_models.c  +40 -0
  11. mpi/examples/cholesky/mpi_cholesky_models.h  +27 -0
  12. mpi/examples/complex/mpi_complex.c  +75 -0
  13. mpi/examples/mpi_lu/mpi_lu-double.h  +42 -0
  14. mpi/examples/mpi_lu/mpi_lu-float.h  +42 -0
  15. mpi/examples/mpi_lu/pdlu.c  +19 -0
  16. mpi/examples/mpi_lu/pdlu_kernels.c  +19 -0
  17. mpi/examples/mpi_lu/plu_example.c  +577 -0
  18. mpi/examples/mpi_lu/plu_example_double.c  +19 -0
  19. mpi/examples/mpi_lu/plu_example_float.c  +19 -0
  20. mpi/examples/mpi_lu/plu_solve.c  +394 -0
  21. mpi/examples/mpi_lu/plu_solve_double.c  +19 -0
  22. mpi/examples/mpi_lu/plu_solve_float.c  +19 -0
  23. mpi/examples/mpi_lu/pslu.c  +19 -0
  24. mpi/examples/mpi_lu/pslu_kernels.c  +19 -0
  25. mpi/examples/mpi_lu/pxlu.c  +870 -0
  26. mpi/examples/mpi_lu/pxlu.h  +65 -0
  27. mpi/examples/mpi_lu/pxlu_kernels.c  +444 -0
  28. mpi/examples/mpi_lu/pxlu_kernels.h  +32 -0
  29. mpi/examples/mpi_lu/slu_kernels.c  +19 -0
  30. mpi/examples/perf.sh  +106 -0
  31. mpi/examples/reduction/mpi_reduction.c  +156 -0
  32. mpi/examples/reduction/mpi_reduction_kernels.c  +66 -0
  33. mpi/examples/scatter_gather/mpi_scatter_gather.c  +228 -0
  34. mpi/examples/stencil/stencil5.c  +159 -0
  35. mpi/include/starpu_mpi.h  +70 -0
  36. mpi/libstarpumpi.pc.in  +29 -0
  37. mpi/src/Makefile.am  +51 -0
  38. mpi/src/starpu_mpi.c  +867 -0
  39. mpi/src/starpu_mpi_collective.c  +78 -0
  40. mpi/src/starpu_mpi_datatype.c  +149 -0
  41. mpi/src/starpu_mpi_datatype.h  +33 -0
  42. mpi/src/starpu_mpi_fxt.h  +45 -0
  43. mpi/src/starpu_mpi_helper.c  +104 -0
  44. mpi/src/starpu_mpi_insert_task.c  +632 -0
  45. mpi/src/starpu_mpi_private.h  +99 -0
  46. mpi/src/starpu_mpi_stats.c  +88 -0
  47. mpi/src/starpu_mpi_stats.h  +24 -0
  48. mpi/starpumpi-1.0.pc.in  +29 -0
  49. mpi/tests/.gitignore  +1 -0
  50. mpi/tests/Makefile.am  +153 -0
  51. mpi/tests/block_interface.c  +148 -0
  52. mpi/tests/block_interface_pinned.c  +151 -0
  53. mpi/tests/helper.h  +22 -0
  54. mpi/tests/insert_task.c  +143 -0
  55. mpi/tests/insert_task_block.c  +165 -0
  56. mpi/tests/insert_task_cache.c  +152 -0
  57. mpi/tests/insert_task_owner.c  +180 -0
  58. mpi/tests/insert_task_owner2.c  +120 -0
  59. mpi/tests/insert_task_owner_data.c  +99 -0
  60. mpi/tests/mpi_detached_tag.c  +80 -0
  61. mpi/tests/mpi_irecv.c  +79 -0
  62. mpi/tests/mpi_irecv_detached.c  +97 -0
  63. mpi/tests/mpi_isend.c  +80 -0
  64. mpi/tests/mpi_isend_detached.c  +98 -0
  65. mpi/tests/mpi_test.c  +86 -0
  66. mpi/tests/multiple_send.c  +92 -0
  67. mpi/tests/pingpong.c  +76 -0
  68. mpi/tests/ring.c  +129 -0
  69. mpi/tests/ring_async.c  +133 -0
  70. mpi/tests/ring_async_implicit.c  +133 -0
  71. mpi/tests/ring_kernel.cu  +32 -0

+ 1 - 0
mpi/.gitignore

@@ -0,0 +1 @@
+/.deps

+ 29 - 0
mpi/Makefile.am

@@ -0,0 +1,29 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2012  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+SUBDIRS=src tests examples
+
+pkgconfigdir = $(libdir)/pkgconfig
+pkgconfig_DATA = libstarpumpi.pc starpumpi-1.0.pc
+
+versincludedir = $(includedir)/starpu/$(STARPU_EFFECTIVE_VERSION)
+versinclude_HEADERS = 					\
+	include/starpu_mpi.h
+
+showcheck:
+	for i in $(SUBDIRS) ; do \
+		make -C $$i showcheck ; \
+	done

+ 206 - 0
mpi/examples/Makefile.am

@@ -0,0 +1,206 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2012  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+CC=$(MPICC)
+CCLD=$(MPICC)
+
+if STARPU_MPI_CHECK
+if STARPU_HAVE_AM111
+LOG_COMPILER	 	=	$(MPIEXEC) -np 2
+else
+TESTS_ENVIRONMENT 	=	$(MPIEXEC) -np 2
+endif
+TESTS			=	$(check_PROGRAMS)
+endif
+
+check_PROGRAMS =
+
+BUILT_SOURCES =
+
+CLEANFILES = *.gcno *.gcda *.linkinfo
+
+EXTRA_DIST = 					\
+	mpi_lu/mpi_lu-float.h		\
+	mpi_lu/mpi_lu-double.h		\
+	mpi_lu/plu_example.c		\
+	mpi_lu/plu_solve.c		\
+	mpi_lu/pxlu.h			\
+	mpi_lu/pxlu.c			\
+	mpi_lu/pxlu_kernels.h		\
+	mpi_lu/pxlu_kernels.c		\
+	cholesky/mpi_cholesky.h	\
+	cholesky/mpi_cholesky_models.h \
+	cholesky/mpi_cholesky_codelets.h \
+	../tests/helper.h
+
+examplebindir = $(libdir)/starpu/mpi
+
+examplebin_PROGRAMS =
+
+if STARPU_USE_CUDA
+# TODO define NVCCFLAGS
+NVCC ?= nvcc
+
+NVCCFLAGS += -I$(top_srcdir)/include/ -I$(top_builddir)/include
+
+.cu.cubin:
+	$(MKDIR_P) `dirname $@`
+	$(NVCC) -cubin $< -o $@ --compiler-options -fno-strict-aliasing  $(NVCCFLAGS)
+
+.cu.o:
+	$(NVCC) $< -c -o $@ --compiler-options -fno-strict-aliasing  $(NVCCFLAGS) -I$(top_srcdir)/include/  -I$(top_builddir)/include/
+endif
+
+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS)
+LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include -I$(top_srcdir)/mpi/include
+AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS)
+
+###################
+# Stencil example #
+###################
+if BUILD_EXAMPLES
+examplebin_PROGRAMS +=				\
+	stencil/stencil5
+
+stencil_stencil5_LDADD =		\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+
+check_PROGRAMS	+=	\
+	stencil/stencil5
+
+##################
+# MPI LU example #
+##################
+
+if !NO_BLAS_LIB
+
+examplebin_PROGRAMS += 			\
+	mpi_lu/plu_example_float	\
+	mpi_lu/plu_example_double
+
+mpi_lu_plu_example_float_LDADD =	\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
+	$(STARPU_LIBNUMA_LDFLAGS)				\
+	$(STARPU_BLAS_LDFLAGS)
+
+mpi_lu_plu_example_float_SOURCES =	\
+	mpi_lu/plu_example_float.c	\
+	mpi_lu/plu_solve_float.c	\
+	mpi_lu/pslu_kernels.c		\
+	mpi_lu/pslu.c			\
+	$(top_srcdir)/examples/common/blas.c
+
+mpi_lu_plu_example_double_LDADD =	\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
+	$(STARPU_LIBNUMA_LDFLAGS)				\
+	$(STARPU_BLAS_LDFLAGS)
+
+mpi_lu_plu_example_double_SOURCES =	\
+	mpi_lu/plu_example_double.c	\
+	mpi_lu/plu_solve_double.c  	\
+	mpi_lu/pdlu_kernels.c	    	\
+	mpi_lu/pdlu.c		    	\
+	$(top_srcdir)/examples/common/blas.c
+endif
+
+########################
+# MPI Cholesky example #
+########################
+
+if !NO_BLAS_LIB
+examplebin_PROGRAMS +=		\
+	cholesky/mpi_cholesky			\
+	cholesky/mpi_cholesky_distributed
+
+cholesky_mpi_cholesky_SOURCES	=		\
+	cholesky/mpi_cholesky.c		\
+	cholesky/mpi_cholesky_models.c		\
+	cholesky/mpi_cholesky_kernels.c	\
+	cholesky/mpi_cholesky_codelets.c	\
+	$(top_srcdir)/examples/common/blas.c
+
+cholesky_mpi_cholesky_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
+	$(STARPU_BLAS_LDFLAGS)
+
+cholesky_mpi_cholesky_distributed_SOURCES =	\
+	cholesky/mpi_cholesky_distributed.c	\
+	cholesky/mpi_cholesky_models.c		\
+	cholesky/mpi_cholesky_kernels.c	\
+	cholesky/mpi_cholesky_codelets.c	\
+	$(top_srcdir)/examples/common/blas.c
+
+cholesky_mpi_cholesky_distributed_LDADD =	\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
+	$(STARPU_BLAS_LDFLAGS)
+
+check_PROGRAMS +=					\
+	cholesky/mpi_cholesky			\
+	cholesky/mpi_cholesky_distributed
+endif
+
+########################
+# Scatter Gather       #
+########################
+
+examplebin_PROGRAMS +=		\
+	scatter_gather/mpi_scatter_gather
+
+scatter_gather_mpi_scatter_gather_LDADD =	\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+
+check_PROGRAMS +=		\
+	scatter_gather/mpi_scatter_gather
+
+###################
+# Reduction       #
+###################
+
+examplebin_PROGRAMS +=		\
+	reduction/mpi_reduction
+
+reduction_mpi_reduction_SOURCES =		\
+	reduction/mpi_reduction.c		\
+	reduction/mpi_reduction_kernels.c
+
+reduction_mpi_reduction_LDADD =	\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+
+check_PROGRAMS +=		\
+	reduction/mpi_reduction
+
+###################
+# complex example #
+###################
+
+examplebin_PROGRAMS +=				\
+	complex/mpi_complex
+
+complex_mpi_complex_SOURCES =		\
+	complex/mpi_complex.c		\
+	$(top_srcdir)/examples/interface/complex_interface.c
+
+complex_mpi_complex_LDADD =		\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+
+check_PROGRAMS	+=	\
+	complex/mpi_complex
+endif
+
+
+showcheck:
+	-cat $(TEST_LOGS) /dev/null
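
Note: with STARPU_MPI_CHECK enabled, each check program above is launched through $(MPIEXEC) -np 2 (via LOG_COMPILER on recent automake, TESTS_ENVIRONMENT otherwise), so "make check" effectively runs, for example:

	mpiexec -np 2 cholesky/mpi_cholesky

assuming MPIEXEC resolves to mpiexec on the build machine.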

+ 251 - 0
mpi/examples/cholesky/mpi_cholesky.c

@@ -0,0 +1,251 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "mpi_cholesky.h"
+#include "mpi_cholesky_models.h"
+#include "mpi_cholesky_codelets.h"
+
+/* Returns the rank of the MPI node that owns block (x, y) */
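+/* (e.g. with dblockx = dblocky = 2, block (x,y) goes to rank (x%2) + 2*(y%2),
+ *  i.e. a 2x2 grid over 4 nodes) */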
+int my_distrib(int x, int y, int nb_nodes)
+{
+	//return (x+y) % nb_nodes;
+	return (x%dblockx)+(y%dblocky)*dblockx;
+}
+
+int main(int argc, char **argv)
+{
+	/* create a simple symmetric positive definite matrix example
+	 *
+	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
+	 * */
+
+	float ***bmat;
+	int rank, nodes, ret;
+
+	parse_args(argc, argv);
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_mpi_initialize_extended(&rank, &nodes);
+	starpu_helper_cublas_init();
+
+	if (dblockx == -1 || dblocky == -1)
+	{
+	     int factor;
+	     dblockx = nodes;
+	     dblocky = 1;
+	     for(factor=sqrt(nodes) ; factor>1 ; factor--)
+	     {
+		  if (nodes % factor == 0)
+		  {
+		       dblockx = nodes/factor;
+		       dblocky = factor;
+		       break;
+		  }
+	     }
+	}
+
+	unsigned i,j,x,y;
+	bmat = malloc(nblocks * sizeof(float *));
+	for(x=0 ; x<nblocks ; x++)
+	{
+		bmat[x] = malloc(nblocks * sizeof(float *));
+		for(y=0 ; y<nblocks ; y++)
+		{
+			starpu_malloc((void **)&bmat[x][y], BLOCKSIZE*BLOCKSIZE*sizeof(float));
+			for (i = 0; i < BLOCKSIZE; i++)
+			{
+				for (j = 0; j < BLOCKSIZE; j++)
+				{
+					bmat[x][y][j +i*BLOCKSIZE] = (1.0f/(1.0f+(i+(x*BLOCKSIZE)+j+(y*BLOCKSIZE)))) + ((i+(x*BLOCKSIZE) == j+(y*BLOCKSIZE))?1.0f*size:0.0f);
+					//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
+				}
+			}
+		}
+	}
+
+
+	if (display)
+	{
+		printf("[%d] Input :\n", rank);
+
+		for(y=0 ; y<nblocks ; y++)
+		{
+			for(x=0 ; x<nblocks ; x++)
+			{
+				printf("Block %u,%u :\n", x, y);
+				for (j = 0; j < BLOCKSIZE; j++)
+				{
+					for (i = 0; i < BLOCKSIZE; i++)
+					{
+						if (i <= j)
+						{
+							printf("%2.2f\t", bmat[y][x][j +i*BLOCKSIZE]);
+						}
+						else
+						{
+							printf(".\t");
+						}
+					}
+					printf("\n");
+				}
+			}
+		}
+	}
+
+	double timing, flops;
+	dw_cholesky(bmat, size, size/nblocks, nblocks, rank, nodes, &timing, &flops);
+
+	starpu_mpi_shutdown();
+
+	if (display)
+	{
+		printf("[%d] Results :\n", rank);
+		for(y=0 ; y<nblocks ; y++)
+		{
+			for(x=0 ; x<nblocks ; x++)
+			{
+				printf("Block %u,%u :\n", x, y);
+				for (j = 0; j < BLOCKSIZE; j++)
+				{
+					for (i = 0; i < BLOCKSIZE; i++)
+					{
+						if (i <= j)
+						{
+							printf("%2.2f\t", bmat[y][x][j +i*BLOCKSIZE]);
+						}
+						else
+						{
+							printf(".\t");
+						}
+					}
+					printf("\n");
+				}
+			}
+		}
+	}
+
+	float *rmat = malloc(size*size*sizeof(float));
+	for(x=0 ; x<nblocks ; x++)
+	{
+		for(y=0 ; y<nblocks ; y++)
+		{
+			for (i = 0; i < BLOCKSIZE; i++)
+			{
+				for (j = 0; j < BLOCKSIZE; j++)
+				{
+					rmat[j+(y*BLOCKSIZE)+(i+(x*BLOCKSIZE))*size] = bmat[x][y][j +i*BLOCKSIZE];
+				}
+			}
+		}
+	}
+
+	fprintf(stderr, "[%d] compute explicit LLt ...\n", rank);
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i > j)
+			{
+				rmat[j+i*size] = 0.0f; // debug
+			}
+		}
+	}
+	float *test_mat = malloc(size*size*sizeof(float));
+	STARPU_ASSERT(test_mat);
+
+	SSYRK("L", "N", size, size, 1.0f,
+			rmat, size, 0.0f, test_mat, size);
+
+	fprintf(stderr, "[%d] comparing results ...\n", rank);
+	if (display)
+	{
+		for (j = 0; j < size; j++)
+		{
+			for (i = 0; i < size; i++)
+			{
+				if (i <= j)
+				{
+					printf("%2.2f\t", test_mat[j +i*size]);
+				}
+				else
+				{
+					printf(".\t");
+				}
+			}
+			printf("\n");
+		}
+	}
+
+	int correctness = 1;
+	for(x = 0; x < nblocks ;  x++)
+	{
+		for (y = 0; y < nblocks; y++)
+		{
+			int mpi_rank = my_distrib(x, y, nodes);
+			if (mpi_rank == rank)
+			{
+				for (i = (size/nblocks)*x ; i < (size/nblocks)*x+(size/nblocks); i++)
+				{
+					for (j = (size/nblocks)*y ; j < (size/nblocks)*y+(size/nblocks); j++)
+					{
+						if (i <= j)
+						{
+							float orig = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
+							float err = fabsf(test_mat[j +i*size] - orig);
+							if (err > 0.00001)
+							{
+								fprintf(stderr, "[%d] Error[%u, %u] --> %2.2f != %2.2f (err %2.2f)\n", rank, i, j, test_mat[j +i*size], orig, err);
+								correctness = 0;
+								flops = 0;
+								break;
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+
+	for(x=0 ; x<nblocks ; x++)
+	{
+		for(y=0 ; y<nblocks ; y++)
+		{
+			starpu_free((void *)bmat[x][y]);
+		}
+		free(bmat[x]);
+	}
+	free(bmat);
+	free(rmat);
+	free(test_mat);
+
+	starpu_helper_cublas_shutdown();
+	starpu_shutdown();
+
+	assert(correctness);
+
+	if (rank == 0)
+	{
+		fprintf(stdout, "Computation time (in ms): %2.2f\n", timing/1000);
+		fprintf(stdout, "Synthetic GFlops : %2.2f\n", (flops/timing/1000.0f));
+	}
+
+	return 0;
+}
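
Note on the verification step above: after the factorization, bmat holds the Cholesky factor, so the code gathers it into rmat, zeroes the opposite triangle, and uses SSYRK to form L*transpose(L). Entry (i,j) of that product should match the original matrix a(i,j) = 1/(1+i+j) + size*delta(i,j) (Hilbert plus a diagonal shift to make it positive definite), up to the 1e-5 tolerance used in the comparison loop.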

+ 106 - 0
mpi/examples/cholesky/mpi_cholesky.h

@@ -0,0 +1,106 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __MPI_CHOLESKY_H__
+#define __MPI_CHOLESKY_H__
+
+#include <string.h>
+#include <math.h>
+#include <sys/time.h>
+#ifdef STARPU_USE_CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cublas.h>
+#endif
+
+#include <common/blas.h>
+#include <starpu.h>
+
+#define BLOCKSIZE	(size/nblocks)
+
+static unsigned size = 4*1024;
+static unsigned nblocks = 16;
+static unsigned nbigblocks = 2;
+static unsigned noprio = 0;
+static unsigned display = 0;
+static int dblockx = -1;
+static int dblocky = -1;
+
+void chol_cpu_codelet_update_u11(void **, void *);
+void chol_cpu_codelet_update_u21(void **, void *);
+void chol_cpu_codelet_update_u22(void **, void *);
+
+#ifdef STARPU_USE_CUDA
+void chol_cublas_codelet_update_u11(void *descr[], void *_args);
+void chol_cublas_codelet_update_u21(void *descr[], void *_args);
+void chol_cublas_codelet_update_u22(void *descr[], void *_args);
+#endif
+
+static void __attribute__((unused)) parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-size") == 0)
+		{
+		        char *argptr;
+			size = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-dblockx") == 0)
+		{
+		        char *argptr;
+			dblockx = strtol(argv[++i], &argptr, 10);
+		}
+		
+		if (strcmp(argv[i], "-dblocky") == 0)
+		{
+		        char *argptr;
+			dblocky = strtol(argv[++i], &argptr, 10);
+		}
+	
+		if (strcmp(argv[i], "-nblocks") == 0)
+		{
+		        char *argptr;
+			nblocks = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-nbigblocks") == 0)
+		{
+		        char *argptr;
+			nbigblocks = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-no-prio") == 0)
+		{
+			noprio = 1;
+		}
+
+		if (strcmp(argv[i], "-display") == 0)
+		{
+			display = 1;
+		}
+
+		if (strcmp(argv[i], "-h") == 0)
+		{
+			printf("usage : %s [-display] [-size size] [-nblocks nblocks] [-dblockx x] [-dblocky y] [-nbigblocks n] [-no-prio]\n", argv[0]);
+		}
+	}
+	if (nblocks > size) nblocks = size;
+}
+
+#endif // __MPI_CHOLESKY_H__

+ 174 - 0
mpi/examples/cholesky/mpi_cholesky_codelets.c

@@ -0,0 +1,174 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "mpi_cholesky.h"
+#include "mpi_cholesky_models.h"
+#include "mpi_cholesky_codelets.h"
+
+/*
+ *	Create the codelets
+ */
+
+static struct starpu_codelet cl11 =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
+#endif
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
+	.model = &chol_model_11
+};
+
+static struct starpu_codelet cl21 =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
+#endif
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_RW},
+	.model = &chol_model_21
+};
+
+static struct starpu_codelet cl22 =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
+#endif
+	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_R, STARPU_RW},
+	.model = &chol_model_22
+};
+
+extern int my_distrib(int x, int y, int nb_nodes);
+
+/*
+ *	code to bootstrap the factorization
+ *	and construct the DAG
+ */
+void dw_cholesky(float ***matA, unsigned size, unsigned ld, unsigned nblocks, int rank, int nodes, double *timing, double *flops)
+{
+	struct timeval start;
+	struct timeval end;
+	starpu_data_handle_t **data_handles;
+	int x, y;
+
+	/* create all the DAG nodes */
+	unsigned i,j,k;
+
+	data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
+	for(x=0 ; x<nblocks ; x++) data_handles[x] = malloc(nblocks*sizeof(starpu_data_handle_t));
+
+	for(x = 0; x < nblocks ;  x++)
+	{
+		for (y = 0; y < nblocks; y++)
+		{
+			int mpi_rank = my_distrib(x, y, nodes);
+			if (mpi_rank == rank)
+			{
+				//fprintf(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
+				starpu_matrix_data_register(&data_handles[x][y], 0, (uintptr_t)matA[x][y],
+						ld, size/nblocks, size/nblocks, sizeof(float));
+			}
+			/* TODO: use a better test to register only what is needed */
+			else
+			{
+				/* I don't own that index, but will need it for my computations */
+				//fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
+				starpu_matrix_data_register(&data_handles[x][y], -1, (uintptr_t)NULL,
+						ld, size/nblocks, size/nblocks, sizeof(float));
+			}
+			if (data_handles[x][y])
+			{
+				starpu_data_set_rank(data_handles[x][y], mpi_rank);
+				starpu_data_set_tag(data_handles[x][y], (y*nblocks)+x);
+			}
+		}
+	}
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+	gettimeofday(&start, NULL);
+
+	for (k = 0; k < nblocks; k++)
+	{
+		int prio = STARPU_DEFAULT_PRIO;
+		if (!noprio) prio = STARPU_MAX_PRIO;
+
+		starpu_mpi_insert_task(MPI_COMM_WORLD, &cl11,
+				STARPU_PRIORITY, prio,
+				STARPU_RW, data_handles[k][k],
+				0);
+
+		for (j = k+1; j<nblocks; j++)
+		{
+			prio = STARPU_DEFAULT_PRIO;
+			if (!noprio && (j == k+1)) prio = STARPU_MAX_PRIO;
+			starpu_mpi_insert_task(MPI_COMM_WORLD, &cl21,
+					STARPU_PRIORITY, prio,
+					STARPU_R, data_handles[k][k],
+					STARPU_RW, data_handles[k][j],
+					0);
+
+			for (i = k+1; i<nblocks; i++)
+			{
+				if (i <= j)
+				{
+					prio = STARPU_DEFAULT_PRIO;
+					if (!noprio && (i == k+1) && (j == k+1)) prio = STARPU_MAX_PRIO;
+					starpu_mpi_insert_task(MPI_COMM_WORLD, &cl22,
+							STARPU_PRIORITY, prio,
+							STARPU_R, data_handles[k][i],
+							STARPU_R, data_handles[k][j],
+							STARPU_RW, data_handles[i][j],
+							0);
+				}
+			}
+		}
+	}
+
+	starpu_task_wait_for_all();
+
+	for(x = 0; x < nblocks ;  x++)
+	{
+		for (y = 0; y < nblocks; y++)
+		{
+			if (data_handles[x][y])
+				starpu_data_unregister(data_handles[x][y]);
+		}
+		free(data_handles[x]);
+	}
+	free(data_handles);
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+	gettimeofday(&end, NULL);
+
+	/* Write the results through the output parameters: main() reads *timing
+	 * and *flops after dw_cholesky() returns (a local 'timing' here would
+	 * shadow the parameter and leave them uninitialized) */
+	*timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	*flops = (1.0f*size*size*size)/3.0f;
+
+	if (rank == 0)
+	{
+		fprintf(stdout, "Computation time (in ms): %2.2f\n", *timing/1000);
+		fprintf(stdout, "Synthetic GFlops : %2.2f\n", (*flops/(*timing)/1000.0f));
+	}
+}
+
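
Note: the insert-task loop above builds the classical right-looking tile Cholesky DAG. As a sequential sketch (with hypothetical kernel names standing in for the cl11/cl21/cl22 codelets):

	for (k = 0; k < nblocks; k++) {
		POTRF(A[k][k]);                            /* cl11: factor the diagonal tile */
		for (j = k+1; j < nblocks; j++) {
			TRSM(A[k][k], A[k][j]);            /* cl21: triangular solve of a panel tile */
			for (i = k+1; i <= j; i++)
				UPDATE(A[k][i], A[k][j], A[i][j]); /* cl22: trailing-matrix update */
		}
	}

Every node runs the same starpu_mpi_insert_task calls; StarPU-MPI uses the rank attached to each handle (starpu_data_set_rank above) to decide which node executes each task and to generate the required transfers automatically.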

+ 27 - 0
mpi/examples/cholesky/mpi_cholesky_codelets.h

@@ -0,0 +1,27 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __MPI_CHOLESKY_CODELETS_H__
+#define __MPI_CHOLESKY_CODELETS_H__
+
+/*
+ *	code to bootstrap the factorization
+ *	and construct the DAG
+ */
+void dw_cholesky(float ***matA, unsigned size, unsigned ld, unsigned nblocks, int rank, int nodes, double *timing, double *flops);
+
+#endif /* __MPI_CHOLESKY_CODELETS_H__ */

+ 117 - 0
mpi/examples/cholesky/mpi_cholesky_distributed.c

@@ -0,0 +1,117 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "mpi_cholesky.h"
+#include "mpi_cholesky_models.h"
+#include "mpi_cholesky_codelets.h"
+
+/* Returns the rank of the MPI node that owns block (x, y) */
+int my_distrib(int x, int y, int nb_nodes)
+{
+	//return (x+y) % nb_nodes;
+	return (x%dblockx)+(y%dblocky)*dblockx;
+}
+
+int main(int argc, char **argv)
+{
+	/* create a simple symmetric positive definite matrix example
+	 *
+	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
+	 * */
+
+	float ***bmat;
+	int rank, nodes, ret;
+
+	parse_args(argc, argv);
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	starpu_mpi_initialize_extended(&rank, &nodes);
+	starpu_helper_cublas_init();
+
+	if (dblockx == -1 || dblocky == -1)
+	{
+	     int factor;
+	     dblockx = nodes;
+	     dblocky = 1;
+	     for(factor=sqrt(nodes) ; factor>1 ; factor--)
+	     {
+		  if (nodes % factor == 0)
+		  {
+		       dblockx = nodes/factor;
+		       dblocky = factor;
+		       break;
+		  }
+	     }
+	}
+
+	unsigned i,j,x,y;
+	bmat = malloc(nblocks * sizeof(float *));
+	for(x=0 ; x<nblocks ; x++)
+	{
+		bmat[x] = malloc(nblocks * sizeof(float *));
+		for(y=0 ; y<nblocks ; y++)
+		{
+			int mpi_rank = my_distrib(x, y, nodes);
+			if (mpi_rank == rank)
+			{
+				starpu_malloc((void **)&bmat[x][y], BLOCKSIZE*BLOCKSIZE*sizeof(float));
+				for (i = 0; i < BLOCKSIZE; i++)
+				{
+					for (j = 0; j < BLOCKSIZE; j++)
+					{
+						bmat[x][y][j +i*BLOCKSIZE] = (1.0f/(1.0f+(i+(x*BLOCKSIZE)+j+(y*BLOCKSIZE)))) + ((i+(x*BLOCKSIZE) == j+(y*BLOCKSIZE))?1.0f*size:0.0f);
+						//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
+					}
+				}
+			}
+		}
+	}
+
+	double timing, flops;
+	dw_cholesky(bmat, size, size/nblocks, nblocks, rank, nodes, &timing, &flops);
+
+	starpu_mpi_shutdown();
+
+	if (rank == 0)
+	{
+		fprintf(stdout, "Computation time (in ms): %2.2f\n", timing/1000);
+		fprintf(stdout, "Synthetic GFlops : %2.2f\n", (flops/timing/1000.0f));
+	}
+
+
+	for(x=0 ; x<nblocks ; x++)
+	{
+		for(y=0 ; y<nblocks ; y++)
+		{
+			int mpi_rank = my_distrib(x, y, nodes);
+			if (mpi_rank == rank)
+			{
+				starpu_free((void *)bmat[x][y]);
+			}
+		}
+		free(bmat[x]);
+	}
+	free(bmat);
+
+	starpu_helper_cublas_shutdown();
+	starpu_shutdown();
+
+	return 0;
+}

+ 236 - 0
mpi/examples/cholesky/mpi_cholesky_kernels.c

@@ -0,0 +1,236 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "mpi_cholesky.h"
+#include "common/blas.h"
+#ifdef STARPU_USE_CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cublas.h>
+#ifdef STARPU_HAVE_MAGMA
+#include "magma.h"
+#include "magma_lapack.h"
+#endif
+#endif
+
+/*
+ *   U22
+ */
+
+static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, __attribute__((unused)) void *_args)
+{
+	//printf("22\n");
+	float *left 	= (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+	float *right 	= (float *)STARPU_MATRIX_GET_PTR(descr[1]);
+	float *center 	= (float *)STARPU_MATRIX_GET_PTR(descr[2]);
+
+	unsigned dx = STARPU_MATRIX_GET_NY(descr[2]);
+	unsigned dy = STARPU_MATRIX_GET_NX(descr[2]);
+	unsigned dz = STARPU_MATRIX_GET_NY(descr[0]);
+
+	unsigned ld21 = STARPU_MATRIX_GET_LD(descr[0]);
+	unsigned ld12 = STARPU_MATRIX_GET_LD(descr[1]);
+	unsigned ld22 = STARPU_MATRIX_GET_LD(descr[2]);
+
+#ifdef STARPU_USE_CUDA
+	cublasStatus st;
+#endif
+
+	switch (s)
+	{
+		case 0:
+			SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21,
+				right, ld12, 1.0f, center, ld22);
+			break;
+#ifdef STARPU_USE_CUDA
+		case 1:
+			cublasSgemm('n', 't', dy, dx, dz,
+					-1.0f, left, ld21, right, ld12,
+					 1.0f, center, ld22);
+			st = cublasGetError();
+			if (STARPU_UNLIKELY(st != CUBLAS_STATUS_SUCCESS))
+				STARPU_CUBLAS_REPORT_ERROR(st);
+
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+}
+
+void chol_cpu_codelet_update_u22(void *descr[], void *_args)
+{
+	chol_common_cpu_codelet_update_u22(descr, 0, _args);
+}
+
+#ifdef STARPU_USE_CUDA
+void chol_cublas_codelet_update_u22(void *descr[], void *_args)
+{
+	chol_common_cpu_codelet_update_u22(descr, 1, _args);
+}
+#endif// STARPU_USE_CUDA
+
+/*
+ * U21
+ */
+
+static inline void chol_common_codelet_update_u21(void *descr[], int s, __attribute__((unused)) void *_args)
+{
+//	printf("21\n");
+	float *sub11;
+	float *sub21;
+
+	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+	sub21 = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
+
+	unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]);
+	unsigned ld21 = STARPU_MATRIX_GET_LD(descr[1]);
+
+	unsigned nx21 = STARPU_MATRIX_GET_NY(descr[1]);
+	unsigned ny21 = STARPU_MATRIX_GET_NX(descr[1]);
+
+	switch (s)
+	{
+		case 0:
+			STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
+			break;
+#ifdef STARPU_USE_CUDA
+		case 1:
+			cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+}
+
+void chol_cpu_codelet_update_u21(void *descr[], void *_args)
+{
+	 chol_common_codelet_update_u21(descr, 0, _args);
+}
+
+#ifdef STARPU_USE_CUDA
+void chol_cublas_codelet_update_u21(void *descr[], void *_args)
+{
+	chol_common_codelet_update_u21(descr, 1, _args);
+}
+#endif
+
+/*
+ *	U11
+ */
+
+static inline void chol_common_codelet_update_u11(void *descr[], int s, __attribute__((unused)) void *_args)
+{
+//	printf("11\n");
+	float *sub11;
+
+	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+
+	unsigned nx = STARPU_MATRIX_GET_NY(descr[0]);
+	unsigned ld = STARPU_MATRIX_GET_LD(descr[0]);
+
+	unsigned z;
+
+	switch (s)
+	{
+		case 0:
+
+			/*
+			 *	- alpha 11 <- lambda 11 = sqrt(alpha11)
+			 *	- alpha 21 <- l 21	= alpha 21 / lambda 11
+			 *	- A22 <- A22 - l21 trans(l21)
+			 */
+
+			for (z = 0; z < nx; z++)
+			{
+				float lambda11;
+				lambda11 = sqrt(sub11[z+z*ld]);
+				sub11[z+z*ld] = lambda11;
+
+				STARPU_ASSERT(lambda11 != 0.0f);
+
+				SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
+
+				SSYR("L", nx - z - 1, -1.0f,
+							&sub11[(z+1)+z*ld], 1,
+							&sub11[(z+1)+(z+1)*ld], ld);
+			}
+			break;
+#ifdef STARPU_USE_CUDA
+		case 1:
+#ifdef STARPU_HAVE_MAGMA
+			{
+				int ret;
+				int info;
+				ret = magma_spotrf_gpu('L', nx, sub11, ld, &info);
+				if (ret != MAGMA_SUCCESS)
+				{
+					fprintf(stderr, "Error in Magma: %d\n", ret);
+					STARPU_ABORT();
+				}
+				cudaError_t cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
+				STARPU_ASSERT(!cures);
+			}
+#else
+			for (z = 0; z < nx; z++)
+			{
+				float lambda11;
+				cudaMemcpyAsync(&lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+				cudaStreamSynchronize(starpu_cuda_get_local_stream());
+
+				STARPU_ASSERT(lambda11 != 0.0f);
+
+				lambda11 = sqrt(lambda11);
+
+				cublasSetVector(1, sizeof(float), &lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float));
+
+				cublasSscal(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
+
+				cublasSsyr('U', nx - z - 1, -1.0f,
+							&sub11[(z+1)+z*ld], 1,
+							&sub11[(z+1)+(z+1)*ld], ld);
+			}
+
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+#endif
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+}
+
+
+void chol_cpu_codelet_update_u11(void *descr[], void *_args)
+{
+	chol_common_codelet_update_u11(descr, 0, _args);
+}
+
+#ifdef STARPU_USE_CUDA
+void chol_cublas_codelet_update_u11(void *descr[], void *_args)
+{
+	chol_common_codelet_update_u11(descr, 1, _args);
+}
+#endif// STARPU_USE_CUDA

+ 40 - 0
mpi/examples/cholesky/mpi_cholesky_models.c

@@ -0,0 +1,40 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_cholesky_models.h"
+
+/*
+ *	Number of flops of Gemm
+ */
+
+struct starpu_perfmodel chol_model_11 =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "chol_model_11"
+};
+
+struct starpu_perfmodel chol_model_21 =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "chol_model_21"
+};
+
+struct starpu_perfmodel chol_model_22 =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "chol_model_22"
+};

+ 27 - 0
mpi/examples/cholesky/mpi_cholesky_models.h

@@ -0,0 +1,27 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DW_CHOLESKY_MODELS_H__
+#define __DW_CHOLESKY_MODELS_H__
+
+#include <starpu.h>
+
+extern struct starpu_perfmodel chol_model_11;
+extern struct starpu_perfmodel chol_model_21;
+extern struct starpu_perfmodel chol_model_22;
+
+#endif // __DW_CHOLESKY_MODELS_H__

+ 75 - 0
mpi/examples/complex/mpi_complex.c

@@ -0,0 +1,75 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <interface/complex_interface.h>
+#include <interface/complex_codelet.h>
+
+int main(int argc, char **argv)
+{
+	int rank, nodes;
+	int ret;
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	starpu_mpi_initialize_extended(&rank, &nodes);
+
+	if (nodes < 2)
+	{
+		fprintf(stderr, "This program needs at least 2 nodes\n");
+		ret = 77;
+	}
+	else
+	{
+		if (rank == 0)
+		{
+			double real[2] = {4.0, 2.0};
+			double imaginary[2] = {7.0, 9.0};
+			starpu_data_handle_t handle;
+
+			double real2[2] = {14.0, 12.0};
+			double imaginary2[2] = {17.0, 19.0};
+			starpu_data_handle_t handle2;
+			MPI_Status status;
+
+			starpu_complex_data_register(&handle, 0, real, imaginary, 2);
+			starpu_insert_task(&cl_display, STARPU_R, handle, 0);
+			starpu_mpi_send(handle, 1, 10, MPI_COMM_WORLD);
+
+			starpu_complex_data_register(&handle2, -1, real2, imaginary2, 2);
+			starpu_mpi_recv(handle2, 1, 11, MPI_COMM_WORLD, &status);
+			starpu_insert_task(&cl_display, STARPU_R, handle2, 0);
+			starpu_insert_task(&cl_compare, STARPU_R, handle, STARPU_R, handle2, 0);
+		}
+		else if (rank == 1)
+		{
+			double real[2] = {0.0, 0.0};
+			double imaginary[2] = {0.0, 0.0};
+			starpu_data_handle_t handle;
+			MPI_Status status;
+
+			starpu_complex_data_register(&handle, 0, real, imaginary, 2);
+			starpu_mpi_recv(handle, 0, 10, MPI_COMM_WORLD, &status);
+			starpu_insert_task(&cl_display, STARPU_R, handle, 0);
+			starpu_mpi_send(handle, 0, 11, MPI_COMM_WORLD);
+		}
+	}
+	starpu_task_wait_for_all();
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	return ret;
+}

+ 42 - 0
mpi/examples/mpi_lu/mpi_lu-double.h

@@ -0,0 +1,42 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#define TYPE double
+#define MPI_TYPE	MPI_DOUBLE
+
+#define STARPU_PLU(name)       starpu_pdlu_##name
+
+#define CUBLAS_GEMM	cublasDgemm
+#define CUBLAS_TRSM	cublasDtrsm
+#define CUBLAS_SCAL	cublasDscal
+#define CUBLAS_GER	cublasDger
+#define CUBLAS_SWAP	cublasDswap
+#define CUBLAS_IAMAX	cublasIdamax
+
+#define CPU_GEMM	DGEMM
+#define CPU_GEMV	DGEMV
+#define CPU_TRSM	DTRSM
+#define CPU_SCAL	DSCAL
+#define CPU_GER		DGER
+#define CPU_SWAP	DSWAP
+
+#define CPU_TRMM	DTRMM
+#define CPU_AXPY	DAXPY
+#define CPU_ASUM	DASUM
+#define CPU_IAMAX	IDAMAX
+
+#define PIVOT_THRESHHOLD	10e-10

+ 42 - 0
mpi/examples/mpi_lu/mpi_lu-float.h

@@ -0,0 +1,42 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#define TYPE float
+#define MPI_TYPE	MPI_FLOAT
+
+#define STARPU_PLU(name)       starpu_pslu_##name
+
+#define CUBLAS_GEMM	cublasSgemm
+#define CUBLAS_TRSM	cublasStrsm
+#define CUBLAS_SCAL	cublasSscal
+#define CUBLAS_GER	cublasSger
+#define CUBLAS_SWAP	cublasSswap
+#define CUBLAS_IAMAX	cublasIsamax
+
+#define CPU_GEMM	SGEMM
+#define CPU_GEMV	SGEMV
+#define CPU_TRSM	STRSM
+#define CPU_SCAL	SSCAL
+#define CPU_GER		SGER
+#define CPU_SWAP	SSWAP
+
+#define CPU_TRMM	STRMM
+#define CPU_AXPY	SAXPY
+#define CPU_ASUM	SASUM
+#define CPU_IAMAX	ISAMAX
+
+#define PIVOT_THRESHHOLD	10e-5

+ 19 - 0
mpi/examples/mpi_lu/pdlu.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-double.h"
+#include "pxlu.c"

+ 19 - 0
mpi/examples/mpi_lu/pdlu_kernels.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-double.h"
+#include "pxlu_kernels.c"

+ 577 - 0
mpi/examples/mpi_lu/plu_example.c

@@ -0,0 +1,577 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include <math.h>
+#include <starpu.h>
+
+#include "pxlu.h"
+//#include "pxlu_kernels.h"
+
+#ifdef STARPU_HAVE_LIBNUMA
+#include <numaif.h>
+#endif
+
+static unsigned long size = 16384;
+static unsigned nblocks = 16;
+static unsigned check = 0;
+static unsigned p = 1;
+static unsigned q = 1;
+static unsigned display = 0;
+
+#ifdef STARPU_HAVE_LIBNUMA
+static unsigned numa = 0;
+#endif
+
+static size_t allocated_memory = 0;
+static size_t allocated_memory_extra = 0;
+
+static starpu_data_handle_t *dataA_handles;
+static TYPE **dataA;
+
+/* In order to implement the distributed LU decomposition, we allocate
+ * temporary buffers */
+#ifdef SINGLE_TMP11
+static starpu_data_handle_t tmp_11_block_handle;
+static TYPE *tmp_11_block;
+#else
+static starpu_data_handle_t *tmp_11_block_handles;
+static TYPE **tmp_11_block;
+#endif
+#ifdef SINGLE_TMP1221
+static starpu_data_handle_t *tmp_12_block_handles;
+static TYPE **tmp_12_block;
+static starpu_data_handle_t *tmp_21_block_handles;
+static TYPE **tmp_21_block;
+#else
+static starpu_data_handle_t *(tmp_12_block_handles[2]);
+static TYPE **(tmp_12_block[2]);
+static starpu_data_handle_t *(tmp_21_block_handles[2]);
+static TYPE **(tmp_21_block[2]);
+#endif
+
+int get_block_rank(unsigned i, unsigned j);
+
+static void parse_args(int rank, int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++) {
+		if (strcmp(argv[i], "-size") == 0) {
+			char *argptr;
+			size = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-nblocks") == 0) {
+			char *argptr;
+			nblocks = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-check") == 0) {
+			check = 1;
+		}
+
+		if (strcmp(argv[i], "-display") == 0) {
+			display = 1;
+		}
+
+		if (strcmp(argv[i], "-numa") == 0) {
+#ifdef STARPU_HAVE_LIBNUMA
+			numa = 1;
+#else
+			if (rank == 0)
+				fprintf(stderr, "Warning: libnuma is not available\n");
+#endif
+		}
+
+		if (strcmp(argv[i], "-p") == 0) {
+			char *argptr;
+			p = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-q") == 0) {
+			char *argptr;
+			q = strtol(argv[++i], &argptr, 10);
+		}
+	}
+}
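+
+/* Typical invocation (main() below asserts p*q == number of MPI ranks), e.g.:
+ *   mpiexec -np 4 ./plu_example_double -size 16384 -nblocks 16 -p 2 -q 2 */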
+
+unsigned STARPU_PLU(display_flag)(void)
+{
+	return display;
+}
+
+static void fill_block_with_random(TYPE *blockptr, unsigned size, unsigned nblocks)
+{
+	const unsigned block_size = (size/nblocks);
+
+	unsigned i, j;
+	for (i = 0; i < block_size; i++)
+	for (j = 0; j < block_size; j++)
+	{
+		blockptr[j+i*block_size] = (TYPE)starpu_drand48();
+	}
+}
+
+#ifdef SINGLE_TMP11
+starpu_data_handle_t STARPU_PLU(get_tmp_11_block_handle)(void)
+{
+	return tmp_11_block_handle;
+}
+#else
+starpu_data_handle_t STARPU_PLU(get_tmp_11_block_handle)(unsigned k)
+{
+	return tmp_11_block_handles[k];
+}
+#endif
+
+#ifdef SINGLE_TMP1221
+starpu_data_handle_t STARPU_PLU(get_tmp_12_block_handle)(unsigned j)
+{
+	return tmp_12_block_handles[j];
+}
+
+starpu_data_handle_t STARPU_PLU(get_tmp_21_block_handle)(unsigned i)
+{
+	return tmp_21_block_handles[i];
+}
+#else
+starpu_data_handle_t STARPU_PLU(get_tmp_12_block_handle)(unsigned j, unsigned k)
+{
+	return tmp_12_block_handles[k%2][j];
+}
+
+starpu_data_handle_t STARPU_PLU(get_tmp_21_block_handle)(unsigned i, unsigned k)
+{
+	return tmp_21_block_handles[k%2][i];
+}
+#endif
+
+static unsigned tmp_11_block_is_needed(int rank, unsigned nblocks, unsigned k)
+{
+	return 1;
+}
+
+static unsigned tmp_12_block_is_needed(int rank, unsigned nblocks, unsigned j)
+{
+	unsigned i;
+	for (i = 1; i < nblocks; i++)
+	{
+		if (get_block_rank(i, j) == rank)
+			return 1;
+	}
+
+	return 0;
+}
+
+static unsigned tmp_21_block_is_needed(int rank, unsigned nblocks, unsigned i)
+{
+	unsigned j;
+	for (j = 1; j < nblocks; j++)
+	{
+		if (get_block_rank(i, j) == rank)
+			return 1;
+	}
+
+	return 0;
+}
+
+static void init_matrix(int rank)
+{
+#ifdef STARPU_HAVE_LIBNUMA
+	if (numa)
+	{
+		fprintf(stderr, "Using INTERLEAVE policy\n");
+		unsigned long nodemask = ((1<<0)|(1<<1));
+		int ret = set_mempolicy(MPOL_INTERLEAVE, &nodemask, 3);
+		if (ret)
+			perror("set_mempolicy failed");
+	}
+#endif
+
+	/* Allocate a grid of data handles, not all of them have to be allocated later on */
+	dataA_handles = calloc(nblocks*nblocks, sizeof(starpu_data_handle_t));
+	dataA = calloc(nblocks*nblocks, sizeof(TYPE *));
+	allocated_memory_extra += nblocks*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *));
+
+	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
+
+	/* Allocate all the blocks that belong to this mpi node */
+	unsigned long i,j;
+	for (j = 0; j < nblocks; j++)
+	{
+		for (i = 0; i < nblocks; i++)
+		{
+			TYPE **blockptr = &dataA[j+i*nblocks];
+//			starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i];
+			starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i];
+
+			if (get_block_rank(i, j) == rank)
+			{
+				/* This block should be handled by the current MPI process */
+				/* Allocate and fill it */
+				starpu_malloc((void **)blockptr, blocksize);
+				allocated_memory += blocksize;
+
+				//fprintf(stderr, "Rank %d : fill block (i = %d, j = %d)\n", rank, i, j);
+				fill_block_with_random(*blockptr, size, nblocks);
+				//fprintf(stderr, "Rank %d : fill block (i = %d, j = %d)\n", rank, i, j);
+				if (i == j)
+				{
+					unsigned tmp;
+					for (tmp = 0; tmp < size/nblocks; tmp++)
+					{
+						(*blockptr)[tmp*((size/nblocks)+1)] += (TYPE)10*nblocks;
+					}
+				}
+
+				/* Register it to StarPU */
+				starpu_matrix_data_register(handleptr, 0,
+					(uintptr_t)*blockptr, size/nblocks,
+					size/nblocks, size/nblocks, sizeof(TYPE));
+			}
+			else {
+				*blockptr = STARPU_POISON_PTR;
+				*handleptr = STARPU_POISON_PTR;
+			}
+		}
+	}
+
+	/* Allocate the temporary buffers required for the distributed algorithm */
+
+	unsigned k;
+
+	/* tmp buffer 11 */
+#ifdef SINGLE_TMP11
+	starpu_malloc((void **)&tmp_11_block, blocksize);
+	allocated_memory_extra += blocksize;
+	starpu_matrix_data_register(&tmp_11_block_handle, 0, (uintptr_t)tmp_11_block,
+			size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
+#else
+	tmp_11_block_handles = calloc(nblocks, sizeof(starpu_data_handle_t));
+	tmp_11_block = calloc(nblocks, sizeof(TYPE *));
+	allocated_memory_extra += nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *));
+
+	for (k = 0; k < nblocks; k++)
+	{
+		if (tmp_11_block_is_needed(rank, nblocks, k))
+		{
+			starpu_malloc((void **)&tmp_11_block[k], blocksize);
+			allocated_memory_extra += blocksize;
+			STARPU_ASSERT(tmp_11_block[k]);
+
+			starpu_matrix_data_register(&tmp_11_block_handles[k], 0,
+				(uintptr_t)tmp_11_block[k],
+				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
+		}
+	}
+#endif
+
+	/* tmp buffers 12 and 21 */
+#ifdef SINGLE_TMP1221
+	tmp_12_block_handles = calloc(nblocks, sizeof(starpu_data_handle_t));
+	tmp_21_block_handles = calloc(nblocks, sizeof(starpu_data_handle_t));
+	tmp_12_block = calloc(nblocks, sizeof(TYPE *));
+	tmp_21_block = calloc(nblocks, sizeof(TYPE *));
+
+	allocated_memory_extra += 2*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *));
+#else
+	for (i = 0; i < 2; i++) {
+		tmp_12_block_handles[i] = calloc(nblocks, sizeof(starpu_data_handle_t));
+		tmp_21_block_handles[i] = calloc(nblocks, sizeof(starpu_data_handle_t));
+		tmp_12_block[i] = calloc(nblocks, sizeof(TYPE *));
+		tmp_21_block[i] = calloc(nblocks, sizeof(TYPE *));
+
+		allocated_memory_extra += 2*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *));
+	}
+#endif
+	
+	for (k = 0; k < nblocks; k++)
+	{
+#ifdef SINGLE_TMP1221
+		if (tmp_12_block_is_needed(rank, nblocks, k))
+		{
+			starpu_malloc((void **)&tmp_12_block[k], blocksize);
+			allocated_memory_extra += blocksize;
+			STARPU_ASSERT(tmp_12_block[k]);
+
+			starpu_matrix_data_register(&tmp_12_block_handles[k], 0,
+				(uintptr_t)tmp_12_block[k],
+				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
+		}
+
+		if (tmp_21_block_is_needed(rank, nblocks, k))
+		{
+			starpu_malloc((void **)&tmp_21_block[k], blocksize);
+			allocated_memory_extra += blocksize;
+			STARPU_ASSERT(tmp_21_block[k]);
+
+			starpu_matrix_data_register(&tmp_21_block_handles[k], 0,
+				(uintptr_t)tmp_21_block[k],
+				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
+		}
+#else
+	for (i = 0; i < 2; i++) {
+		if (tmp_12_block_is_needed(rank, nblocks, k))
+		{
+			starpu_malloc((void **)&tmp_12_block[i][k], blocksize);
+			allocated_memory_extra += blocksize;
+			STARPU_ASSERT(tmp_12_block[i][k]);
+	
+			starpu_matrix_data_register(&tmp_12_block_handles[i][k], 0,
+				(uintptr_t)tmp_12_block[i][k],
+				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
+		}
+
+		if (tmp_21_block_is_needed(rank, nblocks, k))
+		{
+			starpu_malloc((void **)&tmp_21_block[i][k], blocksize);
+			allocated_memory_extra += blocksize;
+			STARPU_ASSERT(tmp_21_block[i][k]);
+	
+			starpu_matrix_data_register(&tmp_21_block_handles[i][k], 0,
+				(uintptr_t)tmp_21_block[i][k],
+				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
+		}
+	}
+#endif
+	}
+
+	//display_all_blocks(nblocks, size/nblocks);
+}
+
+TYPE *STARPU_PLU(get_block)(unsigned i, unsigned j)
+{
+	return dataA[j+i*nblocks];
+}
+
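+/* Illustration (hypothetical p = q = 2, i.e. 4 MPI ranks): get_block_rank
+ * lays the blocks out as
+ *          j=0 j=1 j=2 j=3
+ *   i=0     0   2   0   2
+ *   i=1     1   3   1   3
+ *   i=2     0   2   0   2
+ *   i=3     1   3   1   3
+ */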
+int get_block_rank(unsigned i, unsigned j)
+{
+	/* Take a 2D block cyclic distribution */
+	/* NB: p (resp. q) is for "direction" i (resp. j) */
+	return (j % q) * p + (i % p);
+}
+
+starpu_data_handle_t STARPU_PLU(get_block_handle)(unsigned i, unsigned j)
+{
+	return dataA_handles[j+i*nblocks];
+}
+
+static void display_grid(int rank, unsigned nblocks)
+{
+	if (!display)
+		return;
+
+	//if (rank == 0)
+	{
+		fprintf(stderr, "2D grid layout (Rank %d): \n", rank);
+		
+		unsigned i, j;
+		for (j = 0; j < nblocks; j++)
+		{
+			for (i = 0; i < nblocks; i++)
+			{
+				TYPE *blockptr = STARPU_PLU(get_block)(i, j);
+				starpu_data_handle_t handle = STARPU_PLU(get_block_handle)(i, j);
+
+				fprintf(stderr, "%d (data %p handle %p)", get_block_rank(i, j), blockptr, handle);
+			}
+			fprintf(stderr, "\n");
+		}
+	}
+}
+
+int main(int argc, char **argv)
+{
+	int rank = 0; /* parse_args() reads rank before starpu_mpi_initialize_extended() fills it in */
+	int world_size;
+
+#if 0
+	/*
+	 *	Initialization
+	 */
+	int thread_support;
+	if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS) {
+		fprintf(stderr,"MPI_Init_thread failed\n");
+		exit(1);
+	}
+	if (thread_support == MPI_THREAD_FUNNELED)
+		fprintf(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n");
+	if (thread_support < MPI_THREAD_FUNNELED)
+		fprintf(stderr,"Warning: MPI does not have thread support!\n");
+	
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+#endif
+
+	starpu_srand48((long int)time(NULL));
+
+	parse_args(rank, argc, argv);
+
+	int ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	/* We disable sequential consistency in this example */
+	starpu_data_set_default_sequential_consistency_flag(0);
+
+	starpu_mpi_initialize_extended(&rank, &world_size);
+
+	STARPU_ASSERT(p*q == world_size);
+
+	starpu_helper_cublas_init();
+
+	int barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
+	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
+
+	/*
+	 * 	Problem Init
+	 */
+
+	init_matrix(rank);
+
+	fprintf(stderr, "Rank %d: allocated (%d + %d) MB = %d MB\n", rank,
+			(int)(allocated_memory/(1024*1024)),
+			(int)(allocated_memory_extra/(1024*1024)),
+			(int)((allocated_memory+allocated_memory_extra)/(1024*1024)));
+
+	display_grid(rank, nblocks);
+
+	TYPE *a_r = NULL;
+//	STARPU_PLU(display_data_content)(a_r, size);
+
+	TYPE *x, *y;
+
+	if (check)
+	{
+		x = calloc(size, sizeof(TYPE));
+		STARPU_ASSERT(x);
+
+		y = calloc(size, sizeof(TYPE));
+		STARPU_ASSERT(y);
+
+		if (rank == 0)
+		{
+			unsigned ind;
+			for (ind = 0; ind < size; ind++)
+				x[ind] = (TYPE)starpu_drand48();
+		}
+
+		a_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);
+
+		if (rank == 0)
+			STARPU_PLU(display_data_content)(a_r, size);
+
+//		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
+	}
+
+	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
+	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
+
+	double timing = STARPU_PLU(plu_main)(nblocks, rank, world_size);
+
+	/*
+	 * 	Report performance
+	 */
+
+	int reduce_ret;
+	double min_timing = timing;
+	double max_timing = timing;
+	double sum_timing = timing;
+
+	reduce_ret = MPI_Reduce(&timing, &min_timing, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);
+
+	reduce_ret = MPI_Reduce(&timing, &max_timing, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);
+
+	reduce_ret = MPI_Reduce(&timing, &sum_timing, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);
+
+	if (rank == 0)
+	{
+		fprintf(stderr, "Computation took: %f ms\n", max_timing/1000);
+		fprintf(stderr, "\tMIN : %f ms\n", min_timing/1000);
+		fprintf(stderr, "\tMAX : %f ms\n", max_timing/1000);
+		fprintf(stderr, "\tAVG : %f ms\n", sum_timing/(world_size*1000));
+
+		unsigned n = size;
+		double flop = (2.0f*n*n*n)/3.0f;
+		fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/max_timing/1000.0f));
+	}
+
+	/*
+	 *	Test Result Correctness
+	 */
+
+	if (check)
+	{
+		/*
+		 *	Compute || A - LU ||
+		 */
+
+		STARPU_PLU(compute_lu_matrix)(size, nblocks, a_r);
+
+#if 0
+		/*
+		 *	Compute || Ax - LUx ||
+		 */
+
+		unsigned ind;
+
+		TYPE *y2 = calloc(size, sizeof(TYPE));
+		STARPU_ASSERT(y2);
+		
+		if (rank == 0)
+		{
+			for (ind = 0; ind < size; ind++)
+			{
+				y2[ind] = (TYPE)0.0;
+			}
+		}
+
+		STARPU_PLU(compute_lux)(size, x, y2, nblocks, rank);
+
+		/* Compute y2 = y2 - y */
+	        CPU_AXPY(size, -1.0, y, 1, y2, 1);
+	
+	        TYPE err = CPU_ASUM(size, y2, 1);
+	        int max = CPU_IAMAX(size, y2, 1);
+	
+	        fprintf(stderr, "(A - LU)X Avg error : %e\n", err/(size*size));
+	        fprintf(stderr, "(A - LU)X Max error : %e\n", y2[max]);
+#endif
+	}
+
+	/*
+	 * 	Termination
+	 */
+
+	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
+	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
+
+	starpu_helper_cublas_shutdown();
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+#if 0
+	MPI_Finalize();
+#endif
+
+	return 0;
+}

+ 19 - 0
mpi/examples/mpi_lu/plu_example_double.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-double.h"
+#include "plu_example.c"

+ 19 - 0
mpi/examples/mpi_lu/plu_example_float.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-float.h"
+#include "plu_example.c"

+ 394 - 0
mpi/examples/mpi_lu/plu_solve.c

@@ -0,0 +1,394 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <math.h>
+#include "pxlu.h"
+
+/*
+ *	Various useful functions
+ */
+
+static double frobenius_norm(TYPE *v, unsigned n)
+{
+        double sum2 = 0.0;
+
+        /* compute sqrt(Sum(|x|^2)) */
+
+        unsigned i,j;
+        for (j = 0; j < n; j++)
+        for (i = 0; i < n; i++)
+        {
+                double a = fabs((double)v[i+n*j]);
+                sum2 += a*a;
+        }
+
+        return sqrt(sum2);
+}
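+
+/* For instance (hypothetical values): for the 2x2 block {1 2; 3 4} the
+ * function above returns sqrt(1 + 4 + 9 + 16) = sqrt(30) ~= 5.477, i.e. the
+ * Frobenius norm sqrt(Sum |a_ij|^2) used for the residual check below. */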
+
+void STARPU_PLU(display_data_content)(TYPE *data, unsigned blocksize)
+{
+	if (!STARPU_PLU(display_flag)())
+		return;
+
+	fprintf(stderr, "DISPLAY BLOCK\n");
+
+	unsigned i, j;
+	for (j = 0; j < blocksize; j++)
+	{
+		for (i = 0; i < blocksize; i++)
+		{
+			fprintf(stderr, "%f ", data[j+i*blocksize]);
+		}
+		fprintf(stderr, "\n");
+	}
+
+	fprintf(stderr, "****\n");
+}
+
+void STARPU_PLU(extract_upper)(unsigned block_size, TYPE *inblock, TYPE *outblock)
+{
+	unsigned li, lj;
+	for (lj = 0; lj < block_size; lj++)
+	{
+		/* Upper block diag is 1 */
+		outblock[lj*(block_size + 1)] = (TYPE)1.0;
+
+		for (li = lj + 1; li < block_size; li++)
+		{
+			outblock[lj + li*block_size] = inblock[lj + li*block_size];
+		}
+	}
+}
+
+void STARPU_PLU(extract_lower)(unsigned block_size, TYPE *inblock, TYPE *outblock)
+{
+	unsigned li, lj;
+	for (lj = 0; lj < block_size; lj++)
+	{
+		for (li = 0; li <= lj; li++)
+		{
+			outblock[lj + li*block_size] = inblock[lj + li*block_size];
+		}
+	}
+}
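+
+/* Illustration (hypothetical 2x2 block, column-major): for inblock {a c; b d}
+ * the two helpers above split the in-place LU factors as
+ *	extract_upper -> {1 c; 0 1}	(the unit diagonal goes to U)
+ *	extract_lower -> {a 0; b d}	(the pivots stay in L)
+ * matching the convention used by compute_lu_matrix() below. */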
+
+/*
+ *	Compute Ax = y
+ */
+
+static void STARPU_PLU(compute_ax_block)(unsigned block_size, TYPE *block_data, TYPE *sub_x, TYPE *sub_y)
+{
+	fprintf(stderr, "block data %p sub x %p sub y %p\n", block_data, sub_x, sub_y);
+	CPU_GEMV("N", block_size, block_size, 1.0, block_data, block_size, sub_x, 1, 1.0, sub_y, 1);
+}
+
+static void STARPU_PLU(compute_ax_block_upper)(unsigned size, unsigned nblocks,
+				 TYPE *block_data, TYPE *sub_x, TYPE *sub_y)
+{
+	unsigned block_size = size/nblocks;
+
+	/* Take a copy of the upper part of the diagonal block */
+	TYPE *upper_block_copy = calloc((block_size)*(block_size), sizeof(TYPE));
+	STARPU_PLU(extract_upper)(block_size, block_data, upper_block_copy);
+		
+	STARPU_PLU(compute_ax_block)(block_size, upper_block_copy, sub_x, sub_y);
+	
+	free(upper_block_copy);
+}
+
+static void STARPU_PLU(compute_ax_block_lower)(unsigned size, unsigned nblocks,
+				 TYPE *block_data, TYPE *sub_x, TYPE *sub_y)
+{
+	unsigned block_size = size/nblocks;
+
+	/* Take a copy of the lower part of the diagonal block */
+	TYPE *lower_block_copy = calloc((block_size)*(block_size), sizeof(TYPE));
+	STARPU_PLU(extract_lower)(block_size, block_data, lower_block_copy);
+
+	STARPU_PLU(compute_ax_block)(size/nblocks, lower_block_copy, sub_x, sub_y);
+	
+	free(lower_block_copy);
+}
+
+void STARPU_PLU(compute_lux)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, int rank)
+{
+	/* Create temporary buffers where all MPI processes are going to
+	 * compute Ui x = yi where Ui is the matrix containing the blocks of U
+	 * assigned to process i, and 0 everywhere else. We then have y as the
+	 * sum of all yi. */
+	TYPE *yi = calloc(size, sizeof(TYPE));
+
+	fprintf(stderr, "Compute LUx\n");
+
+	unsigned block_size = size/nblocks;
+
+	/* Compute UiX = Yi */
+	unsigned long i,j;
+	for (j = 0; j < nblocks; j++)
+	{
+		if (get_block_rank(j, j) == rank)
+		{
+			TYPE *block_data = STARPU_PLU(get_block)(j, j);
+			TYPE *sub_x = &x[j*(block_size)];
+			TYPE *sub_yi = &yi[j*(block_size)];
+
+			STARPU_PLU(compute_ax_block_upper)(size, nblocks, block_data, sub_x, sub_yi);
+		}
+
+		for (i = j + 1; i < nblocks; i++)
+		{
+			if (get_block_rank(i, j) == rank)
+			{
+				/* That block belongs to the current MPI process */
+				TYPE *block_data = STARPU_PLU(get_block)(i, j);
+				TYPE *sub_x = &x[i*(block_size)];
+				TYPE *sub_yi = &yi[j*(block_size)];
+
+				STARPU_PLU(compute_ax_block)(size/nblocks, block_data, sub_x, sub_yi);
+			}
+		}
+	}
+
+	/* Grab Sum Yi in X */
+	MPI_Reduce(yi, x, size, MPI_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
+	memset(yi, 0, size*sizeof(TYPE));
+
+//	unsigned ind;
+//	if (rank == 0)
+//	{
+//		fprintf(stderr, "INTERMEDIATE\n");
+//		for (ind = 0; ind < STARPU_MIN(10, size); ind++)
+//		{
+//			fprintf(stderr, "x[%d] = %f\n", ind, (float)x[ind]);
+//		}
+//		fprintf(stderr, "****\n");
+//	}
+
+	/* Everyone needs x */
+	int bcst_ret;
+	bcst_ret = MPI_Bcast(x, size, MPI_TYPE, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(bcst_ret == MPI_SUCCESS);
+
+	/* Compute LiX = Yi (with X = UX) */
+	for (j = 0; j < nblocks; j++)
+	{
+		if (j > 0)
+		for (i = 0; i < j; i++)
+		{
+			if (get_block_rank(i, j) == rank)
+			{
+				/* That block belongs to the current MPI process */
+				TYPE *block_data = STARPU_PLU(get_block)(i, j);
+				TYPE *sub_x = &x[i*(block_size)];
+				TYPE *sub_yi = &yi[j*(block_size)];
+
+				STARPU_PLU(compute_ax_block)(size/nblocks, block_data, sub_x, sub_yi);
+			}
+		}
+
+		if (get_block_rank(j, j) == rank)
+		{
+			TYPE *block_data = STARPU_PLU(get_block)(j, j);
+			TYPE *sub_x = &x[j*(block_size)];
+			TYPE *sub_yi = &yi[j*(block_size)];
+
+			STARPU_PLU(compute_ax_block_lower)(size, nblocks, block_data, sub_x, sub_yi);
+		}
+	}
+
+	/* Grab Sum Yi in Y */
+	MPI_Reduce(yi, y, size, MPI_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
+
+	free(yi);
+}
+
+
+
+/*
+ *	Allocate a contiguous matrix on node 0 and fill it with the whole
+ *	content of the matrix distributed across all nodes.
+ */
+
+TYPE *STARPU_PLU(reconstruct_matrix)(unsigned size, unsigned nblocks)
+{
+//	fprintf(stderr, "RECONSTRUCT MATRIX size %d nblocks %d\n", size, nblocks);
+
+	TYPE *bigmatrix = calloc(size*size, sizeof(TYPE));
+
+	unsigned block_size = size/nblocks;
+
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+	unsigned bi, bj;
+	for (bj = 0; bj < nblocks; bj++)
+	for (bi = 0; bi < nblocks; bi++)
+	{
+		TYPE *block;
+
+		int block_rank = get_block_rank(bi, bj);
+		
+		if (block_rank == 0)
+		{
+			block = STARPU_PLU(get_block)(bi, bj);
+		}
+		else {
+			MPI_Status status;
+
+			if (rank == 0)
+			{
+				block = calloc(block_size*block_size, sizeof(TYPE));
+
+				int ret = MPI_Recv(block, block_size*block_size, MPI_TYPE, block_rank, 0, MPI_COMM_WORLD, &status);
+				STARPU_ASSERT(ret == MPI_SUCCESS);
+			}
+			else if (rank == block_rank) {
+				block = STARPU_PLU(get_block)(bi, bj);
+				int ret = MPI_Send(block, block_size*block_size, MPI_TYPE, 0, 0, MPI_COMM_WORLD);
+				STARPU_ASSERT(ret == MPI_SUCCESS);
+			}
+		}
+
+		if (rank == 0)
+		{
+			unsigned j, i;
+			for (j = 0; j < block_size; j++)
+			for (i = 0; i < block_size; i++)
+			{
+				bigmatrix[(j + bj*block_size)+(i+bi*block_size)*size] =
+									block[j+i*block_size];
+			}
+
+			if (get_block_rank(bi, bj) != 0)
+				free(block);
+		}
+	}
+
+	return bigmatrix;
+}
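+
+/* Typical use (mirroring main() in plu_example.c): every rank must call this
+ * collectively, since non-root owners send their blocks from inside, and
+ * only rank 0 gets the assembled matrix:
+ *
+ *	TYPE *a_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);
+ *	if (rank == 0)
+ *		STARPU_PLU(display_data_content)(a_r, size);
+ */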
+
+/* x and y must be valid (at least) on 0 */
+void STARPU_PLU(compute_ax)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, int rank)
+{
+	unsigned block_size = size/nblocks;
+
+	/* Send x to everyone */
+	int bcst_ret;
+	bcst_ret = MPI_Bcast(x, size, MPI_TYPE, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(bcst_ret == MPI_SUCCESS);
+
+	/* Create temporary buffers where all MPI processes are going to
+	 * compute Ai x = yi where Ai is the matrix containing the blocks of A
+	 * affected to process i, and 0 everywhere else. We then have y as the
+	 * sum of all yi. */
+	TYPE *yi = calloc(size, sizeof(TYPE));
+
+	/* Compute Aix = yi */
+	unsigned long i,j;
+	for (j = 0; j < nblocks; j++)
+	{
+		for (i = 0; i < nblocks; i++)
+		{
+			if (get_block_rank(i, j) == rank)
+			{
+				/* That block belongs to the current MPI process */
+				TYPE *block_data = STARPU_PLU(get_block)(i, j);
+				TYPE *sub_x = &x[i*block_size];
+				TYPE *sub_yi = &yi[j*block_size];
+
+				STARPU_PLU(compute_ax_block)(block_size, block_data, sub_x, sub_yi);
+			}
+		}
+	}
+
+	/* Compute the Sum of all yi = y */
+	MPI_Reduce(yi, y, size, MPI_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
+
+	fprintf(stderr, "RANK %d - compute_ax done, y[0] = %f\n", rank, y[0]);
+
+	free(yi);
+}
+
+void STARPU_PLU(compute_lu_matrix)(unsigned size, unsigned nblocks, TYPE *Asaved)
+{
+	TYPE *all_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);
+
+	unsigned display = STARPU_PLU(display_flag)();
+
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+	if (rank == 0)
+	{
+	        TYPE *L = malloc((size_t)size*size*sizeof(TYPE));
+	        TYPE *U = malloc((size_t)size*size*sizeof(TYPE));
+	
+	        memset(L, 0, size*size*sizeof(TYPE));
+	        memset(U, 0, size*size*sizeof(TYPE));
+	
+	        /* only keep the lower part */
+		unsigned i, j;
+	        for (j = 0; j < size; j++)
+	        {
+	                for (i = 0; i < j; i++)
+	                {
+	                        L[j+i*size] = all_r[j+i*size];
+	                }
+	
+	                /* diag i = j */
+	                L[j+j*size] = all_r[j+j*size];
+	                U[j+j*size] = 1.0;
+	
+	                for (i = j+1; i < size; i++)
+	                {
+	                        U[j+i*size] = all_r[j+i*size];
+	                }
+	        }
+	
+		STARPU_PLU(display_data_content)(L, size);
+		STARPU_PLU(display_data_content)(U, size);
+	
+	        /* now A_err = L, compute L*U */
+	        CPU_TRMM("R", "U", "N", "U", size, size, 1.0f, U, size, L, size);
+	
+		if (display)
+			fprintf(stderr, "\nLU\n");
+
+		STARPU_PLU(display_data_content)(L, size);
+	
+	        /* compute "LU - A" in L*/
+	        CPU_AXPY(size*size, -1.0, Asaved, 1, L, 1);
+	
+	        TYPE err = CPU_ASUM(size*size, L, 1);
+	        int max = CPU_IAMAX(size*size, L, 1);
+	
+		if (display)
+			fprintf(stderr, "DISPLAY ERROR\n");
+
+		STARPU_PLU(display_data_content)(L, size);
+	
+	        fprintf(stderr, "(A - LU) Avg error : %e\n", err/(size*size));
+	        fprintf(stderr, "(A - LU) Max error : %e\n", L[max]);
+	
+		double residual = frobenius_norm(L, size);
+		double matnorm = frobenius_norm(Asaved, size);
+	
+		fprintf(stderr, "||A-LU|| / (||A||*N) : %e\n", residual/(matnorm*size));
+	}
+}
+

+ 19 - 0
mpi/examples/mpi_lu/plu_solve_double.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-double.h"
+#include "plu_solve.c"

+ 19 - 0
mpi/examples/mpi_lu/plu_solve_float.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-float.h"
+#include "plu_solve.c"

+ 19 - 0
mpi/examples/mpi_lu/pslu.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-float.h"
+#include "pxlu.c"

+ 19 - 0
mpi/examples/mpi_lu/pslu_kernels.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-float.h"
+#include "pxlu_kernels.c"

+ 870 - 0
mpi/examples/mpi_lu/pxlu.c

@@ -0,0 +1,870 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "pxlu.h"
+#include "pxlu_kernels.h"
+#include <sys/time.h>
+
+#define MPI_TAG11(k)	((1U << 16) | (k))
+#define MPI_TAG12(k, j)	((2U << 16) | (k)<<8 | (j))
+#define MPI_TAG21(k, i)	((3U << 16) | (i)<<8 | (k))
+
+// 11 21
+// 12 22
+
+#define TAG11(k)	((starpu_tag_t)( (1ULL<<50) | (unsigned long long)(k)))
+#define TAG12(k,j)	((starpu_tag_t)(((2ULL<<50) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(j))))
+#define TAG21(k,i)	((starpu_tag_t)(((3ULL<<50) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(i))))
+#define TAG22(k,i,j)	((starpu_tag_t)(((4ULL<<50) | ((unsigned long long)(k)<<32) 	\
+					| ((unsigned long long)(i)<<16)	\
+					| (unsigned long long)(j))))
+#define TAG11_SAVE(k)	((starpu_tag_t)( (5ULL<<50) | (unsigned long long)(k)))
+#define TAG12_SAVE(k,j)	((starpu_tag_t)(((6ULL<<50) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(j))))
+#define TAG21_SAVE(k,i)	((starpu_tag_t)(((7ULL<<50) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(i))))
+
+#define TAG11_SAVE_PARTIAL(k)	((starpu_tag_t)( (8ULL<<50) | (unsigned long long)(k)))
+#define TAG12_SAVE_PARTIAL(k,j)	((starpu_tag_t)(((9ULL<<50) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(j))))
+#define TAG21_SAVE_PARTIAL(k,i)	((starpu_tag_t)(((10ULL<<50) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(i))))
+
+#define STARPU_TAG_INIT	((starpu_tag_t)(11ULL<<50))
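+
+/* For instance (hypothetical indices), TAG22(1, 2, 3) packs the task type in
+ * the topmost bits and the loop indices below it:
+ *	TAG22(1, 2, 3) == (4ULL<<50) | (1ULL<<32) | (2ULL<<16) | 3ULL
+ * so application tags stay unique as long as i and j fit in 16 bits; the
+ * narrower MPI_TAG* encodings likewise assume block indices below 256. */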
+
+//#define VERBOSE_INIT	1
+
+//#define DEBUG	1
+
+static unsigned no_prio = 0;
+
+static unsigned nblocks = 0;
+static int rank = -1;
+static int world_size = -1;
+
+struct callback_arg {
+	unsigned i, j, k;
+};
+
+/*
+ *	Various
+ */
+
+static struct debug_info *create_debug_info(unsigned i, unsigned j, unsigned k)
+{
+	struct debug_info *info = malloc(sizeof(struct debug_info));
+
+	info->i = i;
+	info->j = j;
+	info->k = k;
+
+	return info;
+}
+
+static struct starpu_task *create_task(starpu_tag_t id)
+{
+	struct starpu_task *task = starpu_task_create();
+		task->cl_arg = NULL;
+
+	task->use_tag = 1;
+	task->tag_id = id;
+
+	return task;
+}
+
+/* Send handle to every node appearing in the mask, and unlock tag once the
+ * transfers are done. */
+static void send_data_to_mask(starpu_data_handle_t handle, int *rank_mask, int mpi_tag, starpu_tag_t tag)
+{
+	unsigned cnt = 0;
+
+	STARPU_ASSERT(handle != STARPU_POISON_PTR);
+
+	int rank_array[world_size];
+	MPI_Comm comm_array[world_size];
+	int mpi_tag_array[world_size];
+	starpu_data_handle_t handle_array[world_size];
+
+	unsigned r;
+	for (r = 0; r < world_size; r++)
+	{
+		if (rank_mask[r]) {
+			rank_array[cnt] = r;
+
+			comm_array[cnt] = MPI_COMM_WORLD;
+			mpi_tag_array[cnt] = mpi_tag;
+			handle_array[cnt] = handle;
+			cnt++;
+		}
+	}
+
+	if (cnt == 0)
+	{
+		/* In case there is no message to send, we release the tag at
+		 * once */
+		starpu_tag_notify_from_apps(tag);
+	}
+	else {
+		starpu_mpi_isend_array_detached_unlock_tag(cnt, handle_array,
+				rank_array, mpi_tag_array, comm_array, tag);
+	}
+}
+
+/* Initiate a receive request once all dependencies are fulfilled and unlock
+ * tag 'unlocked_tag' once it's done. */
+
+struct recv_when_done_callback_arg {
+	int source;
+	int mpi_tag;
+	starpu_data_handle_t handle;
+	starpu_tag_t unlocked_tag;
+};
+
+static void callback_receive_when_done(void *_arg)
+{
+	struct recv_when_done_callback_arg *arg = _arg;
+
+	starpu_mpi_irecv_detached_unlock_tag(arg->handle, arg->source,
+			arg->mpi_tag, MPI_COMM_WORLD, arg->unlocked_tag);
+
+	free(arg);
+}
+
+static void receive_when_deps_are_done(unsigned ndeps, starpu_tag_t *deps_tags,
+				int source, int mpi_tag,
+				starpu_data_handle_t handle,
+				starpu_tag_t partial_tag,
+				starpu_tag_t unlocked_tag)
+{
+	STARPU_ASSERT(handle != STARPU_POISON_PTR);
+
+	struct recv_when_done_callback_arg *arg =
+		malloc(sizeof(struct recv_when_done_callback_arg));
+	
+	arg->source = source;
+	arg->mpi_tag = mpi_tag;
+	arg->handle = handle;
+	arg->unlocked_tag = unlocked_tag;
+
+	if (ndeps == 0)
+	{
+		callback_receive_when_done(arg);
+		return;
+	}
+
+	starpu_create_sync_task(partial_tag, ndeps, deps_tags,
+					callback_receive_when_done, arg);
+}
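+
+/* The resulting dependency chain is:
+ *	deps_tags[0..ndeps-1] -> sync task (partial_tag) -> detached irecv -> unlocked_tag
+ * so the temporary receive buffer cannot be overwritten while a previous
+ * iteration is still reading it. */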
+
+/*
+ *	Task 11 (diagonal factorization)
+ */
+
+static void create_task_11_recv(unsigned k)
+{
+	/* The current node is not computing that task, so we receive the block
+	 * with MPI */
+
+	/* We don't issue a MPI receive request until everyone using the
+	 * temporary buffer is done : 11_(k-1) can be used by 12_(k-1)j and
+	 * 21_(k-1)i with i,j >= k */
+	unsigned ndeps = 0;
+	starpu_tag_t tag_array[2*nblocks];
+	
+#ifdef SINGLE_TMP11
+	unsigned i, j;
+	if (k > 0)
+	for (i = (k-1)+1; i < nblocks; i++)
+	{
+		if (rank == get_block_rank(i, k-1))
+			tag_array[ndeps++] = TAG21(k-1, i);
+	}
+
+	if (k > 0)
+	for (j = (k-1)+1; j < nblocks; j++)
+	{
+		if (rank == get_block_rank(k-1, j))
+			tag_array[ndeps++] = TAG12(k-1, j);
+	}
+#endif
+	
+	int source = get_block_rank(k, k);
+#ifdef SINGLE_TMP11
+	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_11_block_handle)();
+#else
+	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_11_block_handle)(k);
+#endif
+	int mpi_tag = MPI_TAG11(k);
+	starpu_tag_t partial_tag = TAG11_SAVE_PARTIAL(k);
+	starpu_tag_t unlocked_tag = TAG11_SAVE(k);
+
+//	fprintf(stderr, "NODE %d - 11 (%d) - recv when done ndeps %d - tag array %lx\n", rank, k, ndeps, tag_array[0]);
+	receive_when_deps_are_done(ndeps, tag_array, source, mpi_tag, block_handle, partial_tag, unlocked_tag);
+}
+
+static void find_nodes_using_11(unsigned k, int *rank_mask)
+{
+	memset(rank_mask, 0, world_size*sizeof(int));
+
+	/* Block 11_k is used to compute 21_ki and 12_kj with i,j > k */
+	unsigned i;
+	for (i = k+1; i < nblocks; i++)
+	{
+		int r = get_block_rank(i, k);
+		rank_mask[r] = 1;
+	}
+
+	unsigned j;
+	for (j = k+1; j < nblocks; j++)
+	{
+		int r = get_block_rank(k, j);
+		rank_mask[r] = 1;
+	}
+}
+
+static void callback_task_11_real(void *_arg)
+{
+	struct callback_arg *arg = _arg;
+
+	unsigned k = arg->k;
+
+	/* Find all the nodes potentially requiring this block */
+	int rank_mask[world_size];
+	find_nodes_using_11(k, rank_mask);
+	rank_mask[rank] = 0;
+
+	/* Send the block to those nodes */
+	starpu_data_handle_t block_handle = STARPU_PLU(get_block_handle)(k, k);
+	starpu_tag_t tag = TAG11_SAVE(k);
+	int mpi_tag = MPI_TAG11(k);
+	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
+	
+	free(arg);
+}
+
+static void create_task_11_real(unsigned k)
+{
+	struct starpu_task *task = create_task(TAG11(k));
+
+	task->cl = &STARPU_PLU(cl11);
+
+	task->cl_arg = create_debug_info(k, k, k);
+
+	/* which sub-data is manipulated ? */
+	task->handles[0] = STARPU_PLU(get_block_handle)(k, k);
+
+	struct callback_arg *arg = malloc(sizeof(struct callback_arg));
+		arg->k = k;
+
+	task->callback_func = callback_task_11_real;
+	task->callback_arg = arg;
+
+	/* this is an important task */
+	if (!no_prio)
+		task->priority = STARPU_MAX_PRIO;
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
+	}
+	else {
+		starpu_tag_declare_deps(TAG11(k), 1, STARPU_TAG_INIT);
+	}
+
+	int ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+}
+
+static void create_task_11(unsigned k)
+{
+	if (get_block_rank(k, k) == rank)
+	{
+#ifdef VERBOSE_INIT
+		fprintf(stderr, "CREATE real task 11(%d) (TAG11_SAVE(%d) = %lx) on node %d\n", k, k, TAG11_SAVE(k), rank);
+#endif
+		create_task_11_real(k);
+	}
+	else {
+		/* We don't handle the task, but perhaps we have to generate MPI transfers. */
+		int rank_mask[world_size];
+		find_nodes_using_11(k, rank_mask);
+		
+		if (rank_mask[rank])
+		{
+#ifdef VERBOSE_INIT
+			fprintf(stderr, "create RECV task 11(%d) on node %d\n", k, rank);
+#endif
+			create_task_11_recv(k);
+		}
+		else {
+#ifdef VERBOSE_INIT
+			fprintf(stderr, "Node %d needs not 11(%d)\n", rank, k);
+#endif
+		}
+	}
+}
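+
+/* To summarize the cases above: the owner of block (k,k) submits the real
+ * factorization task, every rank that later consumes 11_k posts a receive,
+ * and the remaining ranks have nothing to do for step k. */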
+
+
+
+/*
+ *	Task 12 (Update lower left (TRSM))
+ */
+
+static void create_task_12_recv(unsigned k, unsigned j)
+{
+	unsigned i;
+
+	/* The current node is not computing that task, so we receive the block
+	 * with MPI */
+
+	/* We don't issue a MPI receive request until everyone using the
+	 * temporary buffer is done : 12_(k-1)j can be used by 22_(k-1)ij with
+	 * i >= k */
+	unsigned ndeps = 0;
+	starpu_tag_t tag_array[nblocks];
+	
+#ifdef SINGLE_TMP1221
+	if (k > 0)
+	for (i = (k-1)+1; i < nblocks; i++)
+#else
+	if (k > 1)
+	for (i = (k-2)+1; i < nblocks; i++)
+#endif
+	{
+		if (rank == get_block_rank(i, j))
+#ifdef SINGLE_TMP1221
+			tag_array[ndeps++] = TAG22(k-1, i, j);
+#else
+			tag_array[ndeps++] = TAG22(k-2, i, j);
+#endif
+	}
+	
+	int source = get_block_rank(k, j);
+#ifdef SINGLE_TMP1221
+	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_12_block_handle)(j);
+#else
+	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_12_block_handle)(j,k);
+#endif
+	int mpi_tag = MPI_TAG12(k, j);
+	starpu_tag_t partial_tag = TAG12_SAVE_PARTIAL(k, j);
+	starpu_tag_t unlocked_tag = TAG12_SAVE(k, j);
+
+	receive_when_deps_are_done(ndeps, tag_array, source, mpi_tag, block_handle, partial_tag, unlocked_tag);
+}
+
+static void find_nodes_using_12(unsigned k, unsigned j, int *rank_mask)
+{
+	memset(rank_mask, 0, world_size*sizeof(int));
+
+	/* Block 12_kj is used to compute 22_kij with i > k */
+	unsigned i;
+	for (i = k+1; i < nblocks; i++)
+	{
+		int r = get_block_rank(i, j);
+		rank_mask[r] = 1;
+	}
+}
+
+static void callback_task_12_real(void *_arg)
+{
+	struct callback_arg *arg = _arg;
+
+	unsigned k = arg->k;
+	unsigned j = arg->j;
+
+	/* Find all the nodes potentially requiring this block */
+	int rank_mask[world_size];
+	find_nodes_using_12(k, j, rank_mask);
+	rank_mask[rank] = 0;
+
+	/* Send the block to those nodes */
+	starpu_data_handle_t block_handle = STARPU_PLU(get_block_handle)(k, j);
+	starpu_tag_t tag = TAG12_SAVE(k, j);
+	int mpi_tag = MPI_TAG12(k, j);
+	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
+	
+	free(arg);
+}
+
+static void create_task_12_real(unsigned k, unsigned j)
+{
+	struct starpu_task *task = create_task(TAG12(k, j));
+	
+#warning temporary fix :/
+//	task->cl = &STARPU_PLU(cl12);
+	task->cl = &STARPU_PLU(cl21);
+
+	task->cl_arg = create_debug_info(j, j, k);
+
+	unsigned diag_block_is_local = (get_block_rank(k, k) == rank);
+
+	starpu_tag_t tag_11_dep; 
+
+	/* which sub-data is manipulated ? */
+	starpu_data_handle_t diag_block;
+	if (diag_block_is_local)
+	{
+		diag_block = STARPU_PLU(get_block_handle)(k, k);
+		tag_11_dep = TAG11(k);
+	}
+	else 
+	{
+#ifdef SINGLE_TMP11
+		diag_block = STARPU_PLU(get_tmp_11_block_handle)();
+#else
+		diag_block = STARPU_PLU(get_tmp_11_block_handle)(k);
+#endif
+		tag_11_dep = TAG11_SAVE(k);
+	}
+
+	task->handles[0] = diag_block; 
+	task->handles[1] = STARPU_PLU(get_block_handle)(k, j); 
+
+	STARPU_ASSERT(get_block_rank(k, j) == rank);
+
+	STARPU_ASSERT(task->handles[0] != STARPU_POISON_PTR);
+	STARPU_ASSERT(task->handles[1] != STARPU_POISON_PTR);
+
+	struct callback_arg *arg = malloc(sizeof(struct callback_arg));
+		arg->j = j;
+		arg->k = k;
+
+	task->callback_func = callback_task_12_real;
+	task->callback_arg = arg;
+
+	if (!no_prio && (j == k+1)) {
+		task->priority = STARPU_MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG12(k, j), 2, tag_11_dep, TAG22(k-1, k, j));
+	}
+	else {
+		starpu_tag_declare_deps(TAG12(k, j), 1, tag_11_dep);
+	}
+
+	int ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+}
+
+static void create_task_12(unsigned k, unsigned j)
+{
+	if (get_block_rank(k, j) == rank)
+	{
+#ifdef VERBOSE_INIT
+		fprintf(stderr, "CREATE real task 12(k = %d, j = %d) on node %d\n", k, j, rank);
+#endif
+		create_task_12_real(k, j);
+	}
+	else {
+		/* We don't handle the task, but perhaps we have to generate MPI transfers. */
+		int rank_mask[world_size];
+		find_nodes_using_12(k, j, rank_mask);
+		
+		if (rank_mask[rank])
+		{
+#ifdef VERBOSE_INIT
+			fprintf(stderr, "create RECV task 12(k = %d, j = %d) on node %d\n", k, j, rank);
+#endif
+			create_task_12_recv(k, j);
+		}
+		else {
+#ifdef VERBOSE_INIT
+			fprintf(stderr, "Node %d needs not 12(k=%d, i=%d)\n", rank, k, j);
+#endif
+		}
+	}
+}
+
+/*
+ *	Task 21 (Update upper right (TRSM))
+ */
+
+static void create_task_21_recv(unsigned k, unsigned i)
+{
+	unsigned j;
+
+	/* The current node is not computing that task, so we receive the block
+	 * with MPI */
+
+	/* We don't issue a MPI receive request until everyone using the
+	 * temporary buffer is done : 21_(k-1)i can be used by 22_(k-1)ij with
+	 * j >= k */
+	unsigned ndeps = 0;
+	starpu_tag_t tag_array[nblocks];
+	
+#ifdef SINGLE_TMP1221
+	if (k > 0)
+	for (j = (k-1)+1; j < nblocks; j++)
+#else
+	if (k > 1)
+	for (j = (k-2)+1; j < nblocks; j++)
+#endif
+	{
+		if (rank == get_block_rank(i, j))
+#ifdef SINGLE_TMP1221
+			tag_array[ndeps++] = TAG22(k-1, i, j);
+#else
+			tag_array[ndeps++] = TAG22(k-2, i, j);
+#endif
+	}
+
+	int source = get_block_rank(i, k);
+#ifdef SINGLE_TMP1221
+	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_21_block_handle)(i);
+#else
+	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_21_block_handle)(i, k);
+#endif
+	int mpi_tag = MPI_TAG21(k, i);
+	starpu_tag_t partial_tag = TAG21_SAVE_PARTIAL(k, i);
+	starpu_tag_t unlocked_tag = TAG21_SAVE(k, i);
+
+//	fprintf(stderr, "NODE %d - 21 (%d, %d) - recv when done ndeps %d - tag array %lx\n", rank, k, i, ndeps, tag_array[0]);
+	receive_when_deps_are_done(ndeps, tag_array, source, mpi_tag, block_handle, partial_tag, unlocked_tag);
+}
+
+static void find_nodes_using_21(unsigned k, unsigned i, int *rank_mask)
+{
+	memset(rank_mask, 0, world_size*sizeof(int));
+
+	/* Block 21_ki is used to compute 22_kij with j > k */
+	unsigned j;
+	for (j = k+1; j < nblocks; j++)
+	{
+		int r = get_block_rank(i, j);
+		rank_mask[r] = 1;
+	}
+}
+
+static void callback_task_21_real(void *_arg)
+{
+	struct callback_arg *arg = _arg;
+
+	unsigned k = arg->k;
+	unsigned i = arg->i;
+
+	/* Find all the nodes potentially requiring this block */
+	int rank_mask[world_size];
+	find_nodes_using_21(k, i, rank_mask);
+	rank_mask[rank] = 0;
+
+	/* Send the block to those nodes */
+	starpu_data_handle_t block_handle = STARPU_PLU(get_block_handle)(i, k);
+	starpu_tag_t tag = TAG21_SAVE(k, i);
+	int mpi_tag = MPI_TAG21(k, i);
+	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
+	
+	free(arg);
+}
+
+static void create_task_21_real(unsigned k, unsigned i)
+{
+	struct starpu_task *task = create_task(TAG21(k, i));
+
+#warning temporary fix 
+//	task->cl = &STARPU_PLU(cl21);
+	task->cl = &STARPU_PLU(cl12);
+
+	task->cl_arg = create_debug_info(i, i, k);
+
+	unsigned diag_block_is_local = (get_block_rank(k, k) == rank);
+
+	starpu_tag_t tag_11_dep; 
+	
+	/* which sub-data is manipulated ? */
+	starpu_data_handle_t diag_block;
+	if (diag_block_is_local)
+	{
+		diag_block = STARPU_PLU(get_block_handle)(k, k);
+		tag_11_dep = TAG11(k);
+	}
+	else 
+	{
+#ifdef SINGLE_TMP11
+		diag_block = STARPU_PLU(get_tmp_11_block_handle)();
+#else
+		diag_block = STARPU_PLU(get_tmp_11_block_handle)(k);
+#endif
+		tag_11_dep = TAG11_SAVE(k);
+	}
+
+	task->handles[0] = diag_block; 
+	task->handles[1] = STARPU_PLU(get_block_handle)(i, k);
+
+	STARPU_ASSERT(task->handles[0] != STARPU_POISON_PTR);
+	STARPU_ASSERT(task->handles[1] != STARPU_POISON_PTR);
+
+	struct callback_arg *arg = malloc(sizeof(struct callback_arg));
+		arg->i = i;
+		arg->k = k;
+
+	task->callback_func = callback_task_21_real;
+	task->callback_arg = arg;
+
+	if (!no_prio && (i == k+1)) {
+		task->priority = STARPU_MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG21(k, i), 2, tag_11_dep, TAG22(k-1, i, k));
+	}
+	else {
+		starpu_tag_declare_deps(TAG21(k, i), 1, tag_11_dep);
+	}
+
+	int ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+}
+
+static void create_task_21(unsigned k, unsigned i)
+{
+	if (get_block_rank(i, k) == rank)
+	{
+#ifdef VERBOSE_INIT
+		fprintf(stderr, "CREATE real task 21(k = %d, i = %d) on node %d\n", k, i, rank);
+#endif
+		create_task_21_real(k, i);
+	}
+	else {
+		/* We don't handle the task, but perhaps we have to generate MPI transfers. */
+		int rank_mask[world_size];
+		find_nodes_using_21(k, i, rank_mask);
+		
+		if (rank_mask[rank])
+		{
+#ifdef VERBOSE_INIT
+			fprintf(stderr, "create RECV task 21(k = %d, i = %d) on node %d\n", k, i, rank);
+#endif
+			create_task_21_recv(k, i);
+		}
+		else {
+#ifdef VERBOSE_INIT
+			fprintf(stderr, "Node %d needs not 21(k=%d, i=%d)\n", rank, k,i);
+#endif
+		}
+	}
+}
+
+/*
+ *	Task 22 (GEMM)
+ */
+
+static void create_task_22_real(unsigned k, unsigned i, unsigned j)
+{
+//	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j));
+
+	struct starpu_task *task = create_task(TAG22(k, i, j));
+
+	task->cl = &STARPU_PLU(cl22);
+
+	task->cl_arg = create_debug_info(i, j, k);
+
+	/* which sub-data is manipulated ? */
+
+	/* produced by TAG21_SAVE(k, i) */ 
+	unsigned block21_is_local = (get_block_rank(i, k) == rank);
+	starpu_tag_t tag_21_dep;
+
+	starpu_data_handle_t block21;
+	if (block21_is_local)
+	{
+		block21 = STARPU_PLU(get_block_handle)(i, k);
+		tag_21_dep = TAG21(k, i);
+	}
+	else 
+	{
+#ifdef SINGLE_TMP1221
+		block21 = STARPU_PLU(get_tmp_21_block_handle)(i);
+#else
+		block21 = STARPU_PLU(get_tmp_21_block_handle)(i, k);
+#endif
+		tag_21_dep = TAG21_SAVE(k, i);
+	}
+
+	/* produced by TAG12_SAVE(k, j) */
+	unsigned block12_is_local = (get_block_rank(k, j) == rank);
+	starpu_tag_t tag_12_dep;
+
+	starpu_data_handle_t block12;
+	if (block12_is_local)
+	{
+	//	block12 = STARPU_PLU(get_block_handle)(j, k);
+		block12 = STARPU_PLU(get_block_handle)(k, j);
+		tag_12_dep = TAG12(k, j);
+	}
+	else 
+	{
+#ifdef SINGLE_TMP1221
+		block12 = STARPU_PLU(get_tmp_12_block_handle)(j);
+#else
+		block12 = STARPU_PLU(get_tmp_12_block_handle)(j, k);
+#endif
+		tag_12_dep = TAG12_SAVE(k, j);
+	}
+
+
+
+#warning temporary fix :/
+	//task->handles[0] = block21;
+	task->handles[0] = block12;
+
+	//task->handles[1] = block12;
+	task->handles[1] = block21;
+
+	/* produced by TAG22(k-1, i, j) */
+	task->handles[2] = STARPU_PLU(get_block_handle)(i, j);
+
+	STARPU_ASSERT(task->handles[0] != STARPU_POISON_PTR);
+	STARPU_ASSERT(task->handles[1] != STARPU_POISON_PTR);
+	STARPU_ASSERT(task->handles[2] != STARPU_POISON_PTR);
+
+	if (!no_prio &&  (i == k + 1) && (j == k +1) ) {
+		task->priority = STARPU_MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), tag_12_dep, tag_21_dep);
+	}
+	else {
+		starpu_tag_declare_deps(TAG22(k, i, j), 2, tag_12_dep, tag_21_dep);
+	}
+
+	int ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+}
+
+static void create_task_22(unsigned k, unsigned i, unsigned j)
+{
+	if (get_block_rank(i, j) == rank)
+	{
+	//	fprintf(stderr, "CREATE real task 22(k = %d, i = %d, j = %d) on node %d\n", k, i, j, rank);
+		create_task_22_real(k, i, j);
+	}
+//	else {
+//		fprintf(stderr, "Node %d needs not 22(k=%d, i=%d, j = %d)\n", rank, k,i,j);
+//	}
+}
+
+static void wait_tag_and_fetch_handle(starpu_tag_t tag, starpu_data_handle_t handle)
+{
+	STARPU_ASSERT(handle != STARPU_POISON_PTR);
+
+	starpu_tag_wait(tag);
+//	fprintf(stderr, "Rank %d : tag %lx is done\n", rank, tag);
+
+	starpu_data_acquire(handle, STARPU_R);
+
+//	starpu_data_unregister(handle);
+}
+
+static void wait_termination(void)
+{
+	unsigned k, i, j;
+	for (k = 0; k < nblocks; k++)
+	{
+		/* Wait task 11k if needed */
+		if (get_block_rank(k, k) == rank)
+		{
+			starpu_data_handle_t diag_block = STARPU_PLU(get_block_handle)(k, k);
+			wait_tag_and_fetch_handle(TAG11_SAVE(k), diag_block);
+		}
+		
+
+		for (i = k + 1; i < nblocks; i++)
+		{
+			/* Wait task 21ki if needed */
+			if (get_block_rank(i, k) == rank)
+			{
+				starpu_data_handle_t block21 = STARPU_PLU(get_block_handle)(i, k);
+				//starpu_data_handle_t block21 = STARPU_PLU(get_block_handle)(k, i);
+				//fprintf(stderr, "BLOCK21 i %d k %d -> handle %p\n", i, k, block21);
+				wait_tag_and_fetch_handle(TAG21_SAVE(k, i), block21);
+			}
+		}
+
+		for (j = k + 1; j < nblocks; j++)
+		{
+			/* Wait task 12kj if needed */
+			if (get_block_rank(k, j) == rank)
+			{
+				//starpu_data_handle_t block12 = STARPU_PLU(get_block_handle)(j, k);
+				starpu_data_handle_t block12 = STARPU_PLU(get_block_handle)(k, j);
+				//fprintf(stderr, "BLOCK12 j %d k %d -> handle %p\n", j, k, block12);
+				wait_tag_and_fetch_handle(TAG12_SAVE(k, j), block12);
+			}
+		}
+	}	
+}
+
+/*
+ *	code to bootstrap the factorization 
+ */
+
+double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
+{
+	struct timeval start;
+	struct timeval end;
+
+	nblocks = _nblocks;
+	rank = _rank;
+	world_size = _world_size;
+
+	/* create all the DAG nodes */
+	unsigned i,j,k;
+
+	for (k = 0; k < nblocks; k++)
+	{
+		create_task_11(k);
+
+		for (i = k+1; i<nblocks; i++)
+		{
+			create_task_12(k, i);
+			create_task_21(k, i);
+		}
+
+		for (i = k+1; i<nblocks; i++)
+		{
+			for (j = k+1; j<nblocks; j++)
+			{
+				create_task_22(k, i, j);
+			}
+		}
+	}
+
+	int barrier_ret = starpu_mpi_barrier(MPI_COMM_WORLD);
+	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
+
+	/* schedule the codelet */
+	gettimeofday(&start, NULL);
+
+	starpu_tag_notify_from_apps(STARPU_TAG_INIT);
+
+	wait_termination();
+	
+	gettimeofday(&end, NULL);
+
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	
+//	fprintf(stderr, "RANK %d -> took %f ms\n", rank, timing/1000);
+	
+	return timing;
+}

+ 65 - 0
mpi/examples/mpi_lu/pxlu.h

@@ -0,0 +1,65 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __PXLU_H__
+#define __PXLU_H__
+
+#include <starpu.h>
+#include <common/blas.h>
+#include <starpu_mpi.h>
+
+#define BLAS3_FLOP(n1,n2,n3)    \
+        (2*((uint64_t)n1)*((uint64_t)n2)*((uint64_t)n3))
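+
+/* e.g. a square GEMM of order n costs BLAS3_FLOP(n, n, n) = 2*n^3 flops */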
+
+//#define SINGLE_TMP11	1
+//#define SINGLE_TMP1221	1
+
+struct debug_info {
+	unsigned i;
+	unsigned j;
+	unsigned k;
+};
+
+double STARPU_PLU(plu_main)(unsigned nblocks, int rank, int world_size);
+
+TYPE *STARPU_PLU(reconstruct_matrix)(unsigned size, unsigned nblocks);
+void STARPU_PLU(compute_lu_matrix)(unsigned size, unsigned nblocks, TYPE *Asaved);
+
+unsigned STARPU_PLU(display_flag)(void);
+
+void STARPU_PLU(compute_ax)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, int rank);
+void STARPU_PLU(compute_lux)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, int rank);
+starpu_data_handle_t STARPU_PLU(get_block_handle)(unsigned i, unsigned j);
+TYPE *STARPU_PLU(get_block)(unsigned i, unsigned j);
+#ifdef SINGLE_TMP11
+starpu_data_handle_t STARPU_PLU(get_tmp_11_block_handle)(void);
+#else
+starpu_data_handle_t STARPU_PLU(get_tmp_11_block_handle)(unsigned k);
+#endif
+#ifdef SINGLE_TMP1221
+starpu_data_handle_t STARPU_PLU(get_tmp_12_block_handle)(unsigned j);
+starpu_data_handle_t STARPU_PLU(get_tmp_21_block_handle)(unsigned i);
+#else
+starpu_data_handle_t STARPU_PLU(get_tmp_12_block_handle)(unsigned j, unsigned k);
+starpu_data_handle_t STARPU_PLU(get_tmp_21_block_handle)(unsigned i, unsigned k);
+#endif
+
+void STARPU_PLU(display_data_content)(TYPE *data, unsigned blocksize);
+
+int get_block_rank(unsigned i, unsigned j);
+
+#endif // __PXLU_H__

+ 444 - 0
mpi/examples/mpi_lu/pxlu_kernels.c

@@ -0,0 +1,444 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "pxlu.h"
+#include "pxlu_kernels.h"
+#include <math.h>
+
+//#define VERBOSE_KERNELS	1
+
+/*
+ *   U22 
+ */
+
+static inline void STARPU_PLU(common_u22)(void *descr[],
+				int s, __attribute__((unused)) void *_args)
+{
+	TYPE *right 	= (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
+	TYPE *left 	= (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
+	TYPE *center 	= (TYPE *)STARPU_MATRIX_GET_PTR(descr[2]);
+
+	unsigned dx = STARPU_MATRIX_GET_NX(descr[2]);
+	unsigned dy = STARPU_MATRIX_GET_NY(descr[2]);
+	unsigned dz = STARPU_MATRIX_GET_NY(descr[0]);
+
+	unsigned ld12 = STARPU_MATRIX_GET_LD(descr[0]);
+	unsigned ld21 = STARPU_MATRIX_GET_LD(descr[1]);
+	unsigned ld22 = STARPU_MATRIX_GET_LD(descr[2]);
+
+#ifdef VERBOSE_KERNELS
+	struct debug_info *info = _args;
+
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	fprintf(stderr, "KERNEL 22 %d - k = %d i = %d j = %d\n", rank, info->k, info->i, info->j);
+#endif
+
+#ifdef STARPU_USE_CUDA
+	cublasStatus status;
+	cudaError_t cures;
+#endif
+
+	switch (s) {
+		case 0:
+			CPU_GEMM("N", "N", dy, dx, dz, 
+				(TYPE)-1.0, right, ld21, left, ld12,
+				(TYPE)1.0, center, ld22);
+			break;
+
+#ifdef STARPU_USE_CUDA
+		case 1:
+			CUBLAS_GEMM('n', 'n', dx, dy, dz,
+				(TYPE)-1.0, right, ld21, left, ld12,
+				(TYPE)1.0, center, ld22);
+
+			status = cublasGetError();
+			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
+				STARPU_CUBLAS_REPORT_ERROR(status);
+
+			if (STARPU_UNLIKELY((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess))
+				STARPU_CUDA_REPORT_ERROR(cures);
+
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+#ifdef VERBOSE_KERNELS
+	fprintf(stderr, "KERNEL 22 %d - k = %d i = %d j = %d done\n", rank, info->k, info->i, info->j);
+#endif
+}
+
+static void STARPU_PLU(cpu_u22)(void *descr[], void *_args)
+{
+	STARPU_PLU(common_u22)(descr, 0, _args);
+}
+
+#ifdef STARPU_USE_CUDA
+static void STARPU_PLU(cublas_u22)(void *descr[], void *_args)
+{
+	STARPU_PLU(common_u22)(descr, 1, _args);
+}
+#endif// STARPU_USE_CUDA
+
+static struct starpu_perfmodel STARPU_PLU(model_22) = {
+	.type = STARPU_HISTORY_BASED,
+#ifdef STARPU_ATLAS
+	.symbol = STARPU_PLU_STR(lu_model_22_atlas)
+#elif defined(STARPU_GOTO)
+	.symbol = STARPU_PLU_STR(lu_model_22_goto)
+#else
+	.symbol = STARPU_PLU_STR(lu_model_22)
+#endif
+};
+
+struct starpu_codelet STARPU_PLU(cl22) = {
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_funcs = {STARPU_PLU(cpu_u22), NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {STARPU_PLU(cublas_u22), NULL},
+#endif
+	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_R, STARPU_RW},
+	.model = &STARPU_PLU(model_22)
+};
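+
+/* The 22 codelet is the trailing-matrix update of right-looking LU: with the
+ * buffer order used above it computes center -= right * left, i.e.
+ * A(i,j) -= A(i,k) * A(k,j) in block terms. */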
+
+
+/*
+ * U12
+ */
+
+static inline void STARPU_PLU(common_u12)(void *descr[],
+				int s, __attribute__((unused)) void *_args)
+{
+	TYPE *sub11;
+	TYPE *sub12;
+
+	sub11 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);	
+	sub12 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
+
+	unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]);
+	unsigned ld12 = STARPU_MATRIX_GET_LD(descr[1]);
+
+	unsigned nx12 = STARPU_MATRIX_GET_NX(descr[1]);
+	unsigned ny12 = STARPU_MATRIX_GET_NY(descr[1]);
+
+#ifdef VERBOSE_KERNELS
+	struct debug_info *info = _args;
+
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+#warning fixed debugging according to other tweak
+	//fprintf(stderr, "KERNEL 12 %d - k = %d i %d\n", rank, info->k, info->i);
+	fprintf(stderr, "KERNEL 21 %d - k = %d i %d\n", rank, info->k, info->j);
+
+	//fprintf(stderr, "INPUT 12 U11\n");
+	fprintf(stderr, "INPUT 21 U11\n");
+	STARPU_PLU(display_data_content)(sub11, nx12);
+	//fprintf(stderr, "INPUT 12 U12\n");
+	fprintf(stderr, "INPUT 21 U21\n");
+	STARPU_PLU(display_data_content)(sub12, nx12);
+#endif
+
+#ifdef STARPU_USE_CUDA
+	cublasStatus status;
+	cudaError_t cures;
+#endif
+
+	/* solve L11 U12 = A12 (find U12) */
+	switch (s) {
+		case 0:
+			CPU_TRSM("L", "L", "N", "N", nx12, ny12,
+					(TYPE)1.0, sub11, ld11, sub12, ld12);
+			break;
+#ifdef STARPU_USE_CUDA
+		case 1:
+			CUBLAS_TRSM('L', 'L', 'N', 'N', ny12, nx12,
+					(TYPE)1.0, sub11, ld11, sub12, ld12);
+
+			status = cublasGetError();
+			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
+				STARPU_CUBLAS_REPORT_ERROR(status);
+
+			if (STARPU_UNLIKELY((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess))
+				STARPU_CUDA_REPORT_ERROR(cures);
+
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+
+#ifdef VERBOSE_KERNELS
+	//fprintf(stderr, "OUTPUT 12 U12\n");
+	fprintf(stderr, "OUTPUT 21 U21\n");
+	STARPU_PLU(display_data_content)(sub12, nx12);
+#endif
+}
+
+static void STARPU_PLU(cpu_u12)(void *descr[], void *_args)
+{
+	STARPU_PLU(common_u12)(descr, 0, _args);
+}
+
+#ifdef STARPU_USE_CUDA
+static void STARPU_PLU(cublas_u12)(void *descr[], void *_args)
+{
+	STARPU_PLU(common_u12)(descr, 1, _args);
+}
+#endif // STARPU_USE_CUDA
+
+static struct starpu_perfmodel STARPU_PLU(model_12) = {
+	.type = STARPU_HISTORY_BASED,
+#ifdef STARPU_ATLAS
+	.symbol = STARPU_PLU_STR(lu_model_12_atlas)
+#elif defined(STARPU_GOTO)
+	.symbol = STARPU_PLU_STR(lu_model_12_goto)
+#else
+	.symbol = STARPU_PLU_STR(lu_model_12)
+#endif
+};
+
+struct starpu_codelet STARPU_PLU(cl12) = {
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_funcs = {STARPU_PLU(cpu_u12), NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {STARPU_PLU(cublas_u12), NULL},
+#endif
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_RW},
+	.model = &STARPU_PLU(model_12)
+};
+
+
+/* 
+ * U21
+ */
+
+static inline void STARPU_PLU(common_u21)(void *descr[],
+				int s, __attribute__((unused)) void *_args)
+{
+	TYPE *sub11;
+	TYPE *sub21;
+
+	sub11 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
+	sub21 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
+
+	unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]);
+	unsigned ld21 = STARPU_MATRIX_GET_LD(descr[1]);
+
+	unsigned nx21 = STARPU_MATRIX_GET_NX(descr[1]);
+	unsigned ny21 = STARPU_MATRIX_GET_NY(descr[1]);
+	
+#ifdef VERBOSE_KERNELS
+	struct debug_info *info = _args;
+
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+#warning fixed debugging according to other tweak
+	//fprintf(stderr, "KERNEL 21 %d (k = %d, i = %d)\n", rank, info->k, info->i);
+	fprintf(stderr, "KERNEL 12 %d (k = %d, j = %d)\n", rank, info->k, info->j);
+
+	//fprintf(stderr, "INPUT 21 U11\n");
+	fprintf(stderr, "INPUT 12 U11\n");
+	STARPU_PLU(display_data_content)(sub11, nx21);
+	//fprintf(stderr, "INPUT 21 U21\n");
+	fprintf(stderr, "INPUT 12 U12\n");
+	STARPU_PLU(display_data_content)(sub21, nx21);
+#endif
+
+#ifdef STARPU_USE_CUDA
+	cublasStatus status;
+#endif
+
+
+	switch (s) {
+		case 0:
+			CPU_TRSM("R", "U", "N", "U", nx21, ny21,
+					(TYPE)1.0, sub11, ld11, sub21, ld21);
+			break;
+#ifdef STARPU_USE_CUDA
+		case 1:
+			CUBLAS_TRSM('R', 'U', 'N', 'U', ny21, nx21,
+					(TYPE)1.0, sub11, ld11, sub21, ld21);
+
+			status = cublasGetError();
+			if (status != CUBLAS_STATUS_SUCCESS)
+				STARPU_CUBLAS_REPORT_ERROR(status);
+
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+
+#ifdef VERBOSE_KERNELS
+	//fprintf(stderr, "OUTPUT 21 U11\n");
+	fprintf(stderr, "OUTPUT 12 U11\n");
+	STARPU_PLU(display_data_content)(sub11, nx21);
+	//fprintf(stderr, "OUTPUT 21 U21\n");
+	fprintf(stderr, "OUTPUT 12 U12\n");
+	STARPU_PLU(display_data_content)(sub21, nx21);
+#endif
+}
+
+static void STARPU_PLU(cpu_u21)(void *descr[], void *_args)
+{
+	STARPU_PLU(common_u21)(descr, 0, _args);
+}
+
+#ifdef STARPU_USE_CUDA
+static void STARPU_PLU(cublas_u21)(void *descr[], void *_args)
+{
+	STARPU_PLU(common_u21)(descr, 1, _args);
+}
+#endif 
+
+static struct starpu_perfmodel STARPU_PLU(model_21) = {
+	.type = STARPU_HISTORY_BASED,
+#ifdef STARPU_ATLAS
+	.symbol = STARPU_PLU_STR(lu_model_21_atlas)
+#elif defined(STARPU_GOTO)
+	.symbol = STARPU_PLU_STR(lu_model_21_goto)
+#else
+	.symbol = STARPU_PLU_STR(lu_model_21)
+#endif
+};
+
+struct starpu_codelet STARPU_PLU(cl21) = {
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_funcs = {STARPU_PLU(cpu_u21), NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {STARPU_PLU(cublas_u21), NULL},
+#endif
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_RW},
+	.model = &STARPU_PLU(model_21)
+};
+
+
+/*
+ *	U11
+ */
+
+static inline void STARPU_PLU(common_u11)(void *descr[],
+				int s, __attribute__((unused)) void *_args)
+{
+	TYPE *sub11;
+
+	sub11 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]); 
+
+	unsigned long nx = STARPU_MATRIX_GET_NX(descr[0]);
+	unsigned long ld = STARPU_MATRIX_GET_LD(descr[0]);
+
+	unsigned long z;
+
+#ifdef VERBOSE_KERNELS
+	struct debug_info *info = _args;
+
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	fprintf(stderr, "KERNEL 11 %d - k = %d\n", rank, info->k);
+#endif
+
+	switch (s) {
+		case 0:
+			for (z = 0; z < nx; z++)
+			{
+				TYPE pivot;
+				pivot = sub11[z+z*ld];
+				STARPU_ASSERT(pivot != 0.0);
+		
+				CPU_SCAL(nx - z - 1, (1.0/pivot), &sub11[z+(z+1)*ld], ld);
+		
+				CPU_GER(nx - z - 1, nx - z - 1, -1.0,
+						&sub11[(z+1)+z*ld], 1,
+						&sub11[z+(z+1)*ld], ld,
+						&sub11[(z+1) + (z+1)*ld],ld);
+			}
+			break;
+#ifdef STARPU_USE_CUDA
+		case 1:
+			for (z = 0; z < nx; z++)
+			{
+				TYPE pivot;
+				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+				cudaStreamSynchronize(starpu_cuda_get_local_stream());
+
+				STARPU_ASSERT(pivot != 0.0);
+				
+				CUBLAS_SCAL(nx - z - 1, 1.0/pivot, &sub11[z+(z+1)*ld], ld);
+				
+				CUBLAS_GER(nx - z - 1, nx - z - 1, -1.0,
+						&sub11[(z+1)+z*ld], 1,
+						&sub11[z+(z+1)*ld], ld,
+						&sub11[(z+1) + (z+1)*ld],ld);
+			}
+			
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+#ifdef VERBOSE_KERNELS
+	fprintf(stderr, "KERNEL 11 %d - k = %d done\n", rank, info->k);
+#endif
+}
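+
+/* The 11 kernel above is an unblocked, unpivoted factorization of the
+ * diagonal block: at each step z it scales the remainder of the pivot row
+ * (SCAL) and applies a rank-1 update (GER) to the trailing submatrix, hence
+ * the assertion that every pivot is nonzero. */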
+
+static void STARPU_PLU(cpu_u11)(void *descr[], void *_args)
+{
+	STARPU_PLU(common_u11)(descr, 0, _args);
+}
+
+#ifdef STARPU_USE_CUDA
+static void STARPU_PLU(cublas_u11)(void *descr[], void *_args)
+{
+	STARPU_PLU(common_u11)(descr, 1, _args);
+}
+#endif// STARPU_USE_CUDA
+
+static struct starpu_perfmodel STARPU_PLU(model_11) = {
+	.type = STARPU_HISTORY_BASED,
+#ifdef STARPU_ATLAS
+	.symbol = STARPU_PLU_STR(lu_model_11_atlas)
+#elif defined(STARPU_GOTO)
+	.symbol = STARPU_PLU_STR(lu_model_11_goto)
+#else
+	.symbol = STARPU_PLU_STR(lu_model_11)
+#endif
+};
+
+struct starpu_codelet STARPU_PLU(cl11) = {
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_funcs = {STARPU_PLU(cpu_u11), NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {STARPU_PLU(cublas_u11), NULL},
+#endif
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
+	.model = &STARPU_PLU(model_11)
+};
+
+

+ 32 - 0
mpi/examples/mpi_lu/pxlu_kernels.h

@@ -0,0 +1,32 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __PXLU_KERNELS_H__
+#define __PXLU_KERNELS_H__
+
+#include <starpu.h>
+
+#define str(s) #s
+#define xstr(s)        str(s)
+#define STARPU_PLU_STR(name)  xstr(STARPU_PLU(name))
+
+struct starpu_codelet STARPU_PLU(cl11);
+struct starpu_codelet STARPU_PLU(cl12);
+struct starpu_codelet STARPU_PLU(cl21);
+struct starpu_codelet STARPU_PLU(cl22);
+
+#endif // __PXLU_KERNELS_H__

+ 19 - 0
mpi/examples/mpi_lu/slu_kernels.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-float.h"
+#include "xlu_kernels.c"

+ 106 - 0
mpi/examples/perf.sh

@@ -0,0 +1,106 @@
+#!/bin/bash
+
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+# 
+# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010  Centre National de la Recherche Scientifique
+# 
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+# 
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# 
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+# 4G x np = 4 * (k*1K) ^ 2
+# A G * np = 4 * k^2 * 1M
+# A * 250 * np = k^2
+# A = 6
+# k = sqrt(1500*np)
+# np = 1 => k = 32
+# np = 2 => k = 48
+# np = 3 => k = 64 
+# np = 4 => k = 64
+
+# Problem size
+NBLOCKS=16
+BLOCKSIZE=1024
+SIZE=$(($NBLOCKS*$BLOCKSIZE))
+
+echo "JOB ID ${PBS_JOBID}"
+
+nnodes=$(cat machinefile.${PBS_JOBID}|wc -l)
+echo "got $nnodes mpi nodes"
+
+# Calibrate
+ncalibrate=0
+for i in `seq 1 $ncalibrate`
+do
+echo "STARPU_CALIBRATE $i/$ncalibrate"
+STARPU_CALIBRATE=1 STARPU_SCHED="dmda" STARPU_PREFETCH=1 mpirun -machinefile machinefile.${PBS_JOBID} -np $nnodes ./mpi_lu/plu_example_float -p 2 -q 2 -nblocks 32 -size $((32*$BLOCKSIZE)) -numa
+done
+
+func()
+{
+ngpus=$1
+np=$2
+p=$3
+q=$4
+nblocks=$5
+
+echo "*******************************************"> log
+echo "*************** NGPUS $ngpus - np $np - nblocks $nblocks **************">> log
+echo "*******************************************">> log
+cat log
+cat log >> log.all
+
+STARPU_NCPUS=0 STARPU_NCUDA=$ngpus STARPU_SCHED="dmda" STARPU_PREFETCH=1 mpirun -machinefile machinefile.${PBS_JOBID} -np $np ./mpi_lu/plu_example_float -p $p -q $q -nblocks $nblocks -size $(($nblocks * $BLOCKSIZE)) -numa > log.out 2> log.err
+cat log.out > log
+cat log.err >> log
+cat log
+cat log >> log.all
+}
+
+rm -f log.all
+
+# How many times do we repeat each experiment?
+nloops=3
+
+per_node_max_memory=7000
+
+for np in 1 2 4
+do
+	for nblocks in 16 32 48 64 80
+	do
+		for ngpus_per_node in 1 2 3 4
+		do
+			for loop in `seq 1 $nloops`
+			do
+				# Compute p and q from np
+				case $np in
+				  1) p=1; q=1;;
+				  2) p=2; q=1;;
+				  4) p=2; q=2;;
+				  *) echo -n "does not support $np nodes yet";;
+				esac
+
+				# Does the problem fit into memory?
+				matrix_size=$(($nblocks * $BLOCKSIZE))
+				per_node_memory=$(($((4*$matrix_size*$matrix_size/(1024*1024))) / $np))
+
+				echo "NP $np P $p Q $q SIZE $per_node_memory NBLOCKS $nblocks"
+
+				if test $per_node_memory -ge $per_node_max_memory; then
+						echo "Problem is too large!"
+				else
+					func $ngpus_per_node $np $p $q $nblocks
+					echo "go !"
+				fi
+			done
+		done
+	done
+done

+ 156 - 0
mpi/examples/reduction/mpi_reduction.c

@@ -0,0 +1,156 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+
+extern void init_cpu_func(void *descr[], void *cl_arg);
+extern void redux_cpu_func(void *descr[], void *cl_arg);
+extern void dot_cpu_func(void *descr[], void *cl_arg);
+
+static struct starpu_codelet init_codelet =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {init_cpu_func, NULL},
+	.nbuffers = 1,
+	.name = "init_codelet"
+};
+
+static struct starpu_codelet redux_codelet =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {redux_cpu_func, NULL},
+	.nbuffers = 2,
+	.name = "redux_codelet"
+};
+
+static struct starpu_codelet dot_codelet =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {dot_cpu_func, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_REDUX},
+	.name = "dot_codelet"
+};
+
+/* Returns the MPI node number which owns the data at the given index */
+int my_distrib(int x, int nb_nodes)
+{
+	return x % nb_nodes;
+}
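+/* e.g. with 3 nodes, indexes 0,3,6,... are owned by node 0, 1,4,7,... by node 1, etc. */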
+
+int main(int argc, char **argv)
+{
+        int my_rank, size, x, y;
+        long int *vector;
+	long int dot, sum=0;
+        starpu_data_handle_t *handles;
+	starpu_data_handle_t dot_handle;
+
+	int nb_elements, step;
+
+	int ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	starpu_mpi_initialize_extended(&my_rank, &size);
+
+	nb_elements = size*8000;
+	step = 4;
+
+	vector = (long int *) malloc(nb_elements*sizeof(vector[0]));
+        for(x = 0; x < nb_elements; x+=step)
+	{
+		int mpi_rank = my_distrib(x/step, size);
+		if (mpi_rank == my_rank)
+		{
+			for(y=0 ; y<step ; y++)
+			{
+				vector[x+y] = x+y+1;
+			}
+		}
+        }
+	if (my_rank == 0) {
+		dot = 14;
+		sum = (nb_elements * (nb_elements + 1)) / 2;
+		sum+= dot;
+		starpu_variable_data_register(&dot_handle, 0, (uintptr_t)&dot, sizeof(dot));
+	}
+	else
+	{
+		starpu_variable_data_register(&dot_handle, -1, (uintptr_t)NULL, sizeof(dot));
+	}
+
+
+	handles = (starpu_data_handle_t *) malloc(nb_elements*sizeof(handles[0]));
+        for(x = 0; x < nb_elements; x+=step)
+	{
+		int mpi_rank = my_distrib(x/step, size);
+		if (mpi_rank == my_rank)
+		{
+			/* Owning data */
+			starpu_vector_data_register(&handles[x], 0, (uintptr_t)&(vector[x]), step, sizeof(vector[0]));
+		}
+		else
+		{
+			starpu_vector_data_register(&handles[x], -1, (uintptr_t)NULL, step, sizeof(vector[0]));
+		}
+		if (handles[x])
+		{
+			starpu_data_set_rank(handles[x], mpi_rank);
+			starpu_data_set_tag(handles[x], x);
+		}
+	}
+
+	starpu_data_set_rank(dot_handle, 0);
+	starpu_data_set_tag(dot_handle, nb_elements+1);
+	starpu_data_set_reduction_methods(dot_handle, &redux_codelet, &init_codelet);
+
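+	/* Each contributing node gets a local copy of dot_handle, initialized
+	 * with init_codelet and merged pairwise with redux_codelet; the
+	 * starpu_mpi_redux_data() call below then collects the per-node
+	 * contributions on the owner of dot_handle, i.e. rank 0. */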
+	for (x = 0; x < nb_elements; x+=step)
+	{
+		starpu_mpi_insert_task(MPI_COMM_WORLD,
+				       &dot_codelet,
+				       STARPU_R, handles[x],
+				       STARPU_REDUX, dot_handle,
+				       0);
+	}
+	starpu_mpi_redux_data(MPI_COMM_WORLD, dot_handle);
+
+        fprintf(stderr, "Waiting ...\n");
+        starpu_task_wait_for_all();
+
+        for(x = 0; x < nb_elements; x+=step)
+	{
+		if (handles[x]) starpu_data_unregister(handles[x]);
+	}
+	if (dot_handle)
+	{
+		starpu_data_unregister(dot_handle);
+	}
+	free(vector);
+	free(handles);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	if (my_rank == 0)
+	{
+                fprintf(stderr, "[%d] sum=%ld\n", my_rank, sum);
+                fprintf(stderr, "[%d] dot=%ld\n", my_rank, dot);
+		fprintf(stderr, "%s when computing reduction\n", (sum == dot) ? "Success" : "Error");
+        }
+
+	return 0;
+}
+

+ 66 - 0
mpi/examples/reduction/mpi_reduction_kernels.c

@@ -0,0 +1,66 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <mpi.h>
+
+#define _DISPLAY(fmt, args ...) do { \
+		int _display_rank; MPI_Comm_rank(MPI_COMM_WORLD, &_display_rank);	\
+		fprintf(stderr, "[%d][%s] " fmt , _display_rank, __func__ ,##args); 	\
+		fflush(stderr); } while(0)
+
+/*
+ *	Codelet to create a neutral element
+ */
+void init_cpu_func(void *descr[], void *cl_arg)
+{
+	long int *dot = (long int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	*dot = 0;
+	_DISPLAY("Init dot\n");
+}
+
+/*
+ *	Codelet to perform the reduction of two elements
+ */
+void redux_cpu_func(void *descr[], void *cl_arg)
+{
+	long int *dota = (long int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	long int *dotb = (long int *)STARPU_VARIABLE_GET_PTR(descr[1]);
+
+	*dota = *dota + *dotb;
+	_DISPLAY("Calling redux %ld=%ld+%ld\n", *dota, *dota-*dotb, *dotb);
+}
+
+/*
+ *	Dot product codelet
+ */
+void dot_cpu_func(void *descr[], void *cl_arg)
+{
+	long int *local_x = (long int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
+
+	long int *dot = (long int *)STARPU_VARIABLE_GET_PTR(descr[1]);
+
+//	_DISPLAY("Before dot=%ld (adding %d elements...)\n", *dot, n);
+	unsigned i;
+	for (i = 0; i < n; i++)
+	{
+//		_DISPLAY("Adding %ld\n", local_x[i]);
+		*dot += local_x[i];
+	}
+//	_DISPLAY("After dot=%ld\n", *dot);
+}
+

+ 228 - 0
mpi/examples/scatter_gather/mpi_scatter_gather.c

@@ -0,0 +1,228 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+
+/* Returns the MPI node number which owns the data at indexes (x, y) */
+int my_distrib(int x, int y, int nb_nodes)
+{
+        return (x+y) % nb_nodes;
+}
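+/* e.g. with 2 nodes, the blocks are distributed in a checkerboard pattern */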
+
+void cpu_codelet(void *descr[], void *_args)
+{
+	float *block;
+	/* the blocks are square, so only one dimension is needed */
+	unsigned nx = STARPU_MATRIX_GET_NX(descr[0]);
+	unsigned ld = STARPU_MATRIX_GET_LD(descr[0]);
+	unsigned i,j;
+	int rank;
+	float factor;
+
+	block = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+        starpu_codelet_unpack_args(_args, &rank);
+	factor = block[0];
+
+	//fprintf(stderr,"rank %d factor %f\n", rank, factor);
+	for (j = 0; j < nx; j++)
+	{
+		for (i = 0; i < nx; i++)
+		{
+			//fprintf(stderr,"rank %d factor %f --> %f %f\n", rank, factor, block[j+i*ld], block[j+i*ld]*factor);
+			block[j+i*ld] *= factor;
+		}
+	}
+}
+
+static struct starpu_codelet cl =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {cpu_codelet, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
+};
+
+int main(int argc, char **argv)
+{
+        int rank, nodes;
+	float ***bmat = NULL;
+        starpu_data_handle_t *data_handles;
+
+	unsigned i,j,x,y;
+
+	unsigned nblocks=4;
+	unsigned block_size=2;
+	unsigned size = nblocks*block_size;
+	unsigned ld = size / nblocks;
+
+	int ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	starpu_mpi_initialize_extended(&rank, &nodes);
+
+	if (rank == 0)
+	{
+		/* Allocate the matrix */
+		int block_number=10;
+		bmat = malloc(nblocks * sizeof(float **));
+		for(x=0 ; x<nblocks ; x++)
+		{
+			bmat[x] = malloc(nblocks * sizeof(float *));
+			for(y=0 ; y<nblocks ; y++)
+			{
+				float value=0.0;
+				starpu_malloc((void **)&bmat[x][y], block_size*block_size*sizeof(float));
+				for (i = 0; i < block_size; i++)
+				{
+					for (j = 0; j < block_size; j++)
+					{
+						bmat[x][y][j +i*block_size] = block_number + value;
+						value++;
+					}
+				}
+				block_number += 10;
+			}
+		}
+	}
+
+#if 0
+	// Print matrix
+	if (rank == 0)
+	{
+		fprintf(stderr, "Input matrix\n");
+		for(x=0 ; x<nblocks ; x++)
+		{
+			for(y=0 ; y<nblocks ; y++)
+			{
+				for (j = 0; j < block_size; j++)
+				{
+					for (i = 0; i < block_size; i++)
+					{
+						fprintf(stderr, "%2.2f\t", bmat[x][y][j+i*block_size]);
+					}
+					fprintf(stderr,"\n");
+				}
+				fprintf(stderr,"\n");
+			}
+		}
+	}
+#endif
+
+	/* Allocate data handles and register data to StarPU */
+        data_handles = malloc(nblocks*nblocks*sizeof(starpu_data_handle_t));
+        for(x = 0; x < nblocks ;  x++)
+	{
+                for (y = 0; y < nblocks; y++)
+		{
+			int mpi_rank = my_distrib(x, y, nodes);
+			if (rank == 0)
+			{
+				starpu_matrix_data_register(&data_handles[x+y*nblocks], 0, (uintptr_t)bmat[x][y],
+							    ld, size/nblocks, size/nblocks, sizeof(float));
+			}
+			else if ((mpi_rank == rank) || ((rank == mpi_rank+1 || rank == mpi_rank-1)))
+			{
+				/* I own that index, or I will need it for my computations */
+				//fprintf(stderr, "[%d] Owning or neighbor of data[%d][%d]\n", rank, x, y);
+				starpu_matrix_data_register(&data_handles[x+y*nblocks], -1, (uintptr_t)NULL,
+							    ld, size/nblocks, size/nblocks, sizeof(float));
+			}
+			else
+			{
+				/* I know it's useless to allocate anything for this */
+				data_handles[x+y*nblocks] = NULL;
+			}
+                        if (data_handles[x+y*nblocks])
+			{
+                                starpu_data_set_rank(data_handles[x+y*nblocks], mpi_rank);
+                                starpu_data_set_tag(data_handles[x+y*nblocks], (y*nblocks)+x);
+			}
+                }
+        }
+
+	/* Scatter the matrix among the nodes */
+	starpu_mpi_scatter_detached(data_handles, nblocks*nblocks, 0, MPI_COMM_WORLD);
+
+	/* Calculation */
+	for(x = 0; x < nblocks*nblocks ;  x++)
+	{
+		if (data_handles[x])
+		{
+			int owner = starpu_data_get_rank(data_handles[x]);
+			if (owner == rank)
+			{
+				//fprintf(stderr,"[%d] Computing on data[%d]\n", rank, x);
+				starpu_insert_task(&cl,
+						   STARPU_VALUE, &rank, sizeof(rank),
+						   STARPU_RW, data_handles[x],
+						   0);
+			}
+		}
+	}
+
+	/* Gather the matrix on main node */
+	starpu_mpi_gather_detached(data_handles, nblocks*nblocks, 0, MPI_COMM_WORLD);
+
+	/* Unregister matrix from StarPU */
+	for(x=0 ; x<nblocks*nblocks ; x++)
+	{
+		if (data_handles[x])
+		{
+			starpu_data_unregister(data_handles[x]);
+		}
+	}
+
+#if 0
+	// Print matrix
+	if (rank == 0)
+	{
+		fprintf(stderr, "Output matrix\n");
+		for(x=0 ; x<nblocks ; x++)
+		{
+			for(y=0 ; y<nblocks ; y++)
+			{
+				for (j = 0; j < block_size; j++)
+				{
+					for (i = 0; i < block_size; i++)
+					{
+						fprintf(stderr, "%2.2f\t", bmat[x][y][j+i*block_size]);
+					}
+					fprintf(stderr,"\n");
+				}
+				fprintf(stderr,"\n");
+			}
+		}
+	}
+#endif
+
+	// Free memory
+        free(data_handles);
+	if (rank == 0)
+	{
+		for(x=0 ; x<nblocks ; x++)
+		{
+			for(y=0 ; y<nblocks ; y++)
+			{
+				starpu_free((void *)bmat[x][y]);
+			}
+			free(bmat[x]);
+		}
+		free(bmat);
+	}
+
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+	return 0;
+}

+ 159 - 0
mpi/examples/stencil/stencil5.c

@@ -0,0 +1,159 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+
+void stencil5_cpu(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	unsigned *xy = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	unsigned *xm1y = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
+	unsigned *xp1y = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[2]);
+	unsigned *xym1 = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[3]);
+	unsigned *xyp1 = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[4]);
+
+        //        fprintf(stdout, "VALUES: %d %d %d %d %d\n", *xy, *xm1y, *xp1y, *xym1, *xyp1);
+        *xy = (*xy + *xm1y + *xp1y + *xym1 + *xyp1) / 5;
+}
+
+struct starpu_codelet stencil5_cl =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {stencil5_cpu, NULL},
+        .nbuffers = 5,
+	.modes = {STARPU_RW, STARPU_R, STARPU_R, STARPU_R, STARPU_R}
+};
+
+#define NITER_DEF 500
+#define X         20
+#define Y         20
+
+int display = 0;
+int niter = NITER_DEF;
+
+/* Returns the MPI node number which owns the data at indexes (x, y) */
+int my_distrib(int x, int y, int nb_nodes)
+{
+	/* Block distrib */
+	return ((int)(x / sqrt(nb_nodes) + (y / sqrt(nb_nodes)) * sqrt(nb_nodes))) % nb_nodes;
+}
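+/* e.g. with nb_nodes = 4: my_distrib(5, 8, 4) = (int)(2.5 + 4.0*2.0) % 4 = 10 % 4 = 2 */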
+
+
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-iter") == 0)
+		{
+			char *argptr;
+			niter = strtol(argv[++i], &argptr, 10);
+		}
+		if (strcmp(argv[i], "-display") == 0)
+		{
+			display = 1;
+		}
+	}
+}
+
+int main(int argc, char **argv)
+{
+        int my_rank, size, x, y, loop;
+        int value=0, mean=0;
+        unsigned matrix[X][Y];
+        starpu_data_handle_t data_handles[X][Y];
+
+	int ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	starpu_mpi_initialize_extended(&my_rank, &size);
+        parse_args(argc, argv);
+
+        for(x = 0; x < X; x++)
+	{
+                for (y = 0; y < Y; y++)
+		{
+                        matrix[x][y] = (my_rank+1)*10 + value;
+                        value++;
+                        mean += matrix[x][y];
+                }
+        }
+        mean /= value;
+
+        for(x = 0; x < X; x++)
+	{
+                for (y = 0; y < Y; y++)
+		{
+                        int mpi_rank = my_distrib(x, y, size);
+                        if (mpi_rank == my_rank)
+			{
+                                //fprintf(stderr, "[%d] Owning data[%d][%d]\n", my_rank, x, y);
+                                starpu_variable_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
+                        }
+			else if (my_rank == my_distrib(x+1, y, size) || my_rank == my_distrib(x-1, y, size)
+			      || my_rank == my_distrib(x, y+1, size) || my_rank == my_distrib(x, y-1, size))
+			{
+                                /* I don't own that index, but will need it for my computations */
+                                //fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", my_rank, x, y);
+                                starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(unsigned));
+                        }
+                        else
+			{
+                                /* I know it's useless to allocate anything for this */
+                                data_handles[x][y] = NULL;
+                        }
+                        if (data_handles[x][y])
+			{
+                                starpu_data_set_rank(data_handles[x][y], mpi_rank);
+                                starpu_data_set_tag(data_handles[x][y], (y*X)+x);
+			}
+                }
+        }
+
+        for(loop=0 ; loop<niter; loop++)
+	{
+                for (x = 1; x < X-1; x++)
+		{
+                        for (y = 1; y < Y-1; y++)
+			{
+                                starpu_mpi_insert_task(MPI_COMM_WORLD, &stencil5_cl, STARPU_RW, data_handles[x][y],
+                                                       STARPU_R, data_handles[x-1][y], STARPU_R, data_handles[x+1][y],
+                                                       STARPU_R, data_handles[x][y-1], STARPU_R, data_handles[x][y+1],
+                                                       0);
+                        }
+                }
+        }
+        fprintf(stderr, "Waiting ...\n");
+        starpu_task_wait_for_all();
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+        if (display)
+	{
+                fprintf(stdout, "[%d] mean=%d\n", my_rank, mean);
+                for(x = 0; x < X; x++)
+		{
+                        fprintf(stdout, "[%d] ", my_rank);
+                        for (y = 0; y < Y; y++)
+			{
+                                fprintf(stdout, "%3u ", matrix[x][y]);
+                        }
+                        fprintf(stdout, "\n");
+                }
+        }
+
+	return 0;
+}

+ 70 - 0
mpi/include/starpu_mpi.h

@@ -0,0 +1,70 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_H__
+#define __STARPU_MPI_H__
+
+#include <starpu.h>
+
+#if defined(STARPU_USE_MPI)
+
+#include <mpi.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void *starpu_mpi_req;
+
+int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *req, int dest, int mpi_tag, MPI_Comm comm);
+int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *req, int source, int mpi_tag, MPI_Comm comm);
+int starpu_mpi_send(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm);
+int starpu_mpi_recv(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, MPI_Status *status);
+int starpu_mpi_isend_detached(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg);
+int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg);
+int starpu_mpi_wait(starpu_mpi_req *req, MPI_Status *status);
+int starpu_mpi_test(starpu_mpi_req *req, int *flag, MPI_Status *status);
+int starpu_mpi_barrier(MPI_Comm comm);
+int starpu_mpi_initialize(void);
+int starpu_mpi_initialize_extended(int *rank, int *world_size);
+int starpu_mpi_shutdown(void);
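+
+/* Typical usage sketch (illustrative: assumes data_handle is already
+ * registered, two ranks, and an arbitrary tag value of 42):
+ *
+ *   int rank, size;
+ *   starpu_init(NULL);
+ *   starpu_mpi_initialize_extended(&rank, &size);
+ *   if (rank == 0)
+ *     starpu_mpi_send(data_handle, 1, 42, MPI_COMM_WORLD);
+ *   else if (rank == 1)
+ *     starpu_mpi_recv(data_handle, 0, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+ *   starpu_mpi_shutdown();
+ *   starpu_shutdown();
+ */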
+
+int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...);
+void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle, int node);
+void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg);
+void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle);
+
+int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm);
+int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm);
+
+/* Some helper functions */
+
+/* When the transfer is completed, the tag is unlocked */
+int starpu_mpi_isend_detached_unlock_tag(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm, starpu_tag_t tag);
+int starpu_mpi_irecv_detached_unlock_tag(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, starpu_tag_t tag);
+
+/* Asynchronously send (resp. receive) an array of buffers, and unlock the
+ * tag once all of them have been transmitted. */
+int starpu_mpi_isend_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *dest, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag);
+int starpu_mpi_irecv_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *source, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // STARPU_USE_MPI
+#endif // __STARPU_MPI_H__

+ 29 - 0
mpi/libstarpumpi.pc.in

@@ -0,0 +1,29 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2011  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: starpumpi
+Description: offers MPI support for heterogeneous multicore architectures
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ -DSTARPU_USE_DEPRECATED_API
+Libs: -L${libdir} -lstarpumpi-@STARPU_EFFECTIVE_VERSION@
+Libs.private: @LDFLAGS@ @LIBS@
+Requires: libstarpu
+Requires.private:

+ 51 - 0
mpi/src/Makefile.am

@@ -0,0 +1,51 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2012  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+CC=$(MPICC)
+CCLD=$(MPICC)
+
+BUILT_SOURCES =
+
+CLEANFILES = *.gcno *.gcda *.linkinfo
+
+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS)
+LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/src/ -I$(top_builddir)/src -I$(top_builddir)/include -I$(top_srcdir)/mpi/include -I$(top_srcdir)/mpi/src
+AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS)
+
+lib_LTLIBRARIES = libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+
+libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_LIBADD = $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la
+libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_LDFLAGS = $(ldflags) -no-undefined					\
+  -version-info $(LIBSTARPUMPI_INTERFACE_CURRENT):$(LIBSTARPUMPI_INTERFACE_REVISION):$(LIBSTARPUMPI_INTERFACE_AGE) \
+  $(MPICC_LDFLAGS) $(FXT_LDFLAGS)
+noinst_HEADERS =					\
+	starpu_mpi_private.h				\
+	starpu_mpi_fxt.h				\
+	starpu_mpi_stats.h				\
+	starpu_mpi_datatype.h
+
+libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_SOURCES =	\
+	starpu_mpi.c					\
+	starpu_mpi_helper.c				\
+	starpu_mpi_datatype.c				\
+	starpu_mpi_insert_task.c			\
+	starpu_mpi_collective.c				\
+	starpu_mpi_stats.c
+
+
+showcheck:
+	-cat /dev/null

+ 867 - 0
mpi/src/starpu_mpi.c

@@ -0,0 +1,867 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdlib.h>
+#include <starpu_mpi.h>
+#include <starpu_mpi_datatype.h>
+//#define STARPU_MPI_VERBOSE	1
+#include <starpu_mpi_private.h>
+#include <starpu_profiling.h>
+#include <starpu_mpi_stats.h>
+
+/* TODO find a better way to select the polling method (perhaps during the
+ * configuration) */
+//#define USE_STARPU_ACTIVITY	1
+
+static void submit_mpi_req(void *arg);
+static void handle_request_termination(struct _starpu_mpi_req *req);
+
+/* The list of requests that have been newly submitted by the application */
+static struct _starpu_mpi_req_list *new_requests;
+
+/* The list of detached requests that have already been submitted to MPI */
+static struct _starpu_mpi_req_list *detached_requests;
+static pthread_mutex_t detached_requests_mutex;
+
+/* Condition to wake up progression thread */
+static pthread_cond_t cond_progression;
+/* Condition to wake up waiting for all current MPI requests to finish */
+static pthread_cond_t cond_finished;
+static pthread_mutex_t mutex;
+static pthread_t progress_thread;
+static int running = 0;
+
+/* Count requests posted by the application and not yet submitted to MPI, i.e. pushed into the new_requests list */
+static pthread_mutex_t mutex_posted_requests;
+static int posted_requests = 0, newer_requests, barrier_running = 0;
+
+#define INC_POSTED_REQUESTS(value) do { _STARPU_PTHREAD_MUTEX_LOCK(&mutex_posted_requests); posted_requests += value; _STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_posted_requests); } while(0)
+
+/*
+ *	Isend
+ */
+
+static void starpu_mpi_isend_func(struct _starpu_mpi_req *req)
+{
+	int count;
+
+        _STARPU_MPI_LOG_IN();
+
+	req->needs_unpacking = starpu_mpi_handle_to_datatype(req->data_handle, &req->datatype, &count);
+	if (req->needs_unpacking)
+		starpu_handle_pack_data(req->data_handle, &req->ptr);
+	else
+		req->ptr = starpu_handle_get_local_ptr(req->data_handle);
+	STARPU_ASSERT(req->ptr);
+
+        _STARPU_MPI_DEBUG("post MPI isend tag %d dst %d ptr %p datatype %p count %d req %p\n", req->mpi_tag, req->srcdst, req->ptr, req->datatype, count, &req->request);
+
+	_starpu_mpi_comm_amounts_inc(req->comm, req->srcdst, req->datatype, count);
+
+        req->ret = MPI_Isend(req->ptr, count, req->datatype, req->srcdst, req->mpi_tag, req->comm, &req->request);
+        STARPU_ASSERT(req->ret == MPI_SUCCESS);
+
+	TRACE_MPI_ISEND(req->srcdst, req->mpi_tag, 0);
+
+	/* somebody is perhaps waiting for the MPI request to be posted */
+	_STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
+	req->submitted = 1;
+	_STARPU_PTHREAD_COND_BROADCAST(&req->req_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
+        _STARPU_MPI_LOG_OUT();
+}
+
+static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t data_handle,
+							int dest, int mpi_tag, MPI_Comm comm,
+							unsigned detached, void (*callback)(void *), void *arg)
+{
+	struct _starpu_mpi_req *req = calloc(1, sizeof(struct _starpu_mpi_req));
+	STARPU_ASSERT(req);
+
+        _STARPU_MPI_LOG_IN();
+
+        INC_POSTED_REQUESTS(1);
+
+	/* Initialize the request structure */
+	req->submitted = 0;
+	req->completed = 0;
+	_STARPU_PTHREAD_MUTEX_INIT(&req->req_mutex, NULL);
+	_STARPU_PTHREAD_COND_INIT(&req->req_cond, NULL);
+
+	req->request_type = SEND_REQ;
+
+	req->data_handle = data_handle;
+	req->srcdst = dest;
+	req->mpi_tag = mpi_tag;
+	req->comm = comm;
+	req->func = starpu_mpi_isend_func;
+
+	req->detached = detached;
+	req->callback = callback;
+	req->callback_arg = arg;
+
+	/* Asynchronously request StarPU to fetch the data in main memory: when
+	 * it is available in main memory, submit_mpi_req(req) is called and
+	 * the request is actually submitted  */
+	starpu_data_acquire_cb(data_handle, STARPU_R, submit_mpi_req, (void *)req);
+
+        _STARPU_MPI_LOG_OUT();
+	return req;
+}
+
+int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, int mpi_tag, MPI_Comm comm)
+{
+        _STARPU_MPI_LOG_IN();
+	STARPU_ASSERT(public_req);
+
+	struct _starpu_mpi_req *req;
+	req = _starpu_mpi_isend_common(data_handle, dest, mpi_tag, comm, 0, NULL, NULL);
+
+	STARPU_ASSERT(req);
+	*public_req = req;
+
+        _STARPU_MPI_LOG_OUT();
+	return 0;
+}
+
+/*
+ *	Isend (detached)
+ */
+
+int starpu_mpi_isend_detached(starpu_data_handle_t data_handle,
+				int dest, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
+{
+        _STARPU_MPI_LOG_IN();
+	_starpu_mpi_isend_common(data_handle, dest, mpi_tag, comm, 1, callback, arg);
+
+        _STARPU_MPI_LOG_OUT();
+	return 0;
+}
+
+/*
+ *	Irecv
+ */
+
+static void starpu_mpi_irecv_func(struct _starpu_mpi_req *req)
+{
+	int count;
+
+        _STARPU_MPI_LOG_IN();
+
+	req->needs_unpacking = starpu_mpi_handle_to_datatype(req->data_handle, &req->datatype, &count);
+	if (req->needs_unpacking == 1)
+		req->ptr = malloc(count);
+	else
+		req->ptr = starpu_handle_get_local_ptr(req->data_handle);
+	STARPU_ASSERT(req->ptr);
+
+	_STARPU_MPI_DEBUG("post MPI irecv tag %d src %d data %p ptr %p req %p datatype %p\n", req->mpi_tag, req->srcdst, req->data_handle, req->ptr, &req->request, req->datatype);
+
+        req->ret = MPI_Irecv(req->ptr, count, req->datatype, req->srcdst, req->mpi_tag, req->comm, &req->request);
+        STARPU_ASSERT(req->ret == MPI_SUCCESS);
+
+	/* somebody is perhaps waiting for the MPI request to be posted */
+	_STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
+	req->submitted = 1;
+	_STARPU_PTHREAD_COND_BROADCAST(&req->req_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
+        _STARPU_MPI_LOG_OUT();
+}
+
+static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, unsigned detached, void (*callback)(void *), void *arg)
+{
+        _STARPU_MPI_LOG_IN();
+	struct _starpu_mpi_req *req = calloc(1, sizeof(struct _starpu_mpi_req));
+	STARPU_ASSERT(req);
+
+        INC_POSTED_REQUESTS(1);
+
+	/* Initialize the request structure */
+	req->submitted = 0;
+	_STARPU_PTHREAD_MUTEX_INIT(&req->req_mutex, NULL);
+	_STARPU_PTHREAD_COND_INIT(&req->req_cond, NULL);
+
+	req->request_type = RECV_REQ;
+
+	req->data_handle = data_handle;
+	req->srcdst = source;
+	req->mpi_tag = mpi_tag;
+	req->comm = comm;
+
+	req->detached = detached;
+	req->callback = callback;
+	req->callback_arg = arg;
+
+	req->func = starpu_mpi_irecv_func;
+
+	/* Asynchronously request StarPU to fetch the data in main memory: when
+	 * it is available in main memory, submit_mpi_req(req) is called and
+	 * the request is actually submitted  */
+	starpu_data_acquire_cb(data_handle, STARPU_W, submit_mpi_req, (void *)req);
+
+        _STARPU_MPI_LOG_OUT();
+	return req;
+}
+
+int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int source, int mpi_tag, MPI_Comm comm)
+{
+        _STARPU_MPI_LOG_IN();
+	STARPU_ASSERT(public_req);
+
+	struct _starpu_mpi_req *req;
+	req = _starpu_mpi_irecv_common(data_handle, source, mpi_tag, comm, 0, NULL, NULL);
+
+	STARPU_ASSERT(req);
+	*public_req = req;
+
+        _STARPU_MPI_LOG_OUT();
+	return 0;
+}
+
+/*
+ *	Irecv (detached)
+ */
+
+int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
+{
+        _STARPU_MPI_LOG_IN();
+	_starpu_mpi_irecv_common(data_handle, source, mpi_tag, comm, 1, callback, arg);
+
+        _STARPU_MPI_LOG_OUT();
+	return 0;
+}
+
+
+/*
+ *	Recv
+ */
+
+int starpu_mpi_recv(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, MPI_Status *status)
+{
+	starpu_mpi_req req;
+
+        _STARPU_MPI_LOG_IN();
+	starpu_mpi_irecv(data_handle, &req, source, mpi_tag, comm);
+	starpu_mpi_wait(&req, status);
+
+        _STARPU_MPI_LOG_OUT();
+	return 0;
+}
+
+/*
+ *	Send
+ */
+
+int starpu_mpi_send(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm)
+{
+	starpu_mpi_req req;
+	MPI_Status status;
+
+        _STARPU_MPI_LOG_IN();
+	memset(&status, 0, sizeof(MPI_Status));
+
+	starpu_mpi_isend(data_handle, &req, dest, mpi_tag, comm);
+	starpu_mpi_wait(&req, &status);
+
+        _STARPU_MPI_LOG_OUT();
+	return 0;
+}
+
+/*
+ *	Wait
+ */
+
+static void starpu_mpi_wait_func(struct _starpu_mpi_req *waiting_req)
+{
+        _STARPU_MPI_LOG_IN();
+	/* Which MPI request are we waiting for? */
+	struct _starpu_mpi_req *req = waiting_req->other_request;
+
+	req->ret = MPI_Wait(&req->request, waiting_req->status);
+        STARPU_ASSERT(req->ret == MPI_SUCCESS);
+
+	handle_request_termination(req);
+        _STARPU_MPI_LOG_OUT();
+}
+
+int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
+{
+        _STARPU_MPI_LOG_IN();
+	int ret;
+	struct _starpu_mpi_req *waiting_req = calloc(1, sizeof(struct _starpu_mpi_req));
+	STARPU_ASSERT(waiting_req);
+	struct _starpu_mpi_req *req = *public_req;
+
+        INC_POSTED_REQUESTS(1);
+
+	/* We cannot try to complete a MPI request that was not actually posted
+	 * to MPI yet. */
+	_STARPU_PTHREAD_MUTEX_LOCK(&(req->req_mutex));
+	while (!(req->submitted))
+		_STARPU_PTHREAD_COND_WAIT(&(req->req_cond), &(req->req_mutex));
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&(req->req_mutex));
+
+	/* Initialize the request structure */
+	_STARPU_PTHREAD_MUTEX_INIT(&(waiting_req->req_mutex), NULL);
+	_STARPU_PTHREAD_COND_INIT(&(waiting_req->req_cond), NULL);
+	waiting_req->status = status;
+	waiting_req->other_request = req;
+	waiting_req->func = starpu_mpi_wait_func;
+	waiting_req->request_type = WAIT_REQ;
+
+	submit_mpi_req(waiting_req);
+
+	/* We wait for the MPI request to finish */
+	_STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
+	while (!req->completed)
+		_STARPU_PTHREAD_COND_WAIT(&req->req_cond, &req->req_mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
+
+	ret = req->ret;
+
+	/* The internal request structure was automatically allocated */
+	*public_req = NULL;
+	free(req);
+
+        //free(waiting_req);
+        _STARPU_MPI_LOG_OUT();
+	return ret;
+}
+
+/*
+ * 	Test
+ */
+
+static void starpu_mpi_test_func(struct _starpu_mpi_req *testing_req)
+{
+        _STARPU_MPI_LOG_IN();
+	/* Which MPI request are we testing? */
+	struct _starpu_mpi_req *req = testing_req->other_request;
+
+        _STARPU_MPI_DEBUG("Test request %p - mpitag %d - TYPE %s %d\n", &req->request, req->mpi_tag, (req->request_type == RECV_REQ)?"recv : source":"send : dest", req->srcdst);
+	req->ret = MPI_Test(&req->request, testing_req->flag, testing_req->status);
+        STARPU_ASSERT(req->ret == MPI_SUCCESS);
+
+	if (*testing_req->flag)
+	{
+		testing_req->ret = req->ret;
+		handle_request_termination(req);
+	}
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&testing_req->req_mutex);
+	testing_req->completed = 1;
+	_STARPU_PTHREAD_COND_SIGNAL(&testing_req->req_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&testing_req->req_mutex);
+        _STARPU_MPI_LOG_OUT();
+}
+
+int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
+{
+        _STARPU_MPI_LOG_IN();
+	int ret = 0;
+
+	STARPU_ASSERT(public_req);
+
+	struct _starpu_mpi_req *req = *public_req;
+
+	STARPU_ASSERT(!req->detached);
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
+	unsigned submitted = req->submitted;
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
+
+	if (submitted)
+	{
+		struct _starpu_mpi_req *testing_req = calloc(1, sizeof(struct _starpu_mpi_req));
+                STARPU_ASSERT(testing_req);
+                //		memset(testing_req, 0, sizeof(struct _starpu_mpi_req));
+
+		/* Initialize the request structure */
+		_STARPU_PTHREAD_MUTEX_INIT(&(testing_req->req_mutex), NULL);
+		_STARPU_PTHREAD_COND_INIT(&(testing_req->req_cond), NULL);
+		testing_req->flag = flag;
+		testing_req->status = status;
+		testing_req->other_request = req;
+		testing_req->func = starpu_mpi_test_func;
+		testing_req->completed = 0;
+                testing_req->request_type = TEST_REQ;
+
+                INC_POSTED_REQUESTS(1);
+                submit_mpi_req(testing_req);
+
+		/* We wait for the test request to finish */
+		_STARPU_PTHREAD_MUTEX_LOCK(&(testing_req->req_mutex));
+		while (!(testing_req->completed))
+                        _STARPU_PTHREAD_COND_WAIT(&(testing_req->req_cond), &(testing_req->req_mutex));
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&(testing_req->req_mutex));
+
+		ret = testing_req->ret;
+
+		if (*(testing_req->flag))
+		{
+			/* The request was completed so we free the internal
+			 * request structure which was automatically allocated
+			 * */
+			*public_req = NULL;
+			free(req);
+		}
+	}
+	else
+	{
+		*flag = 0;
+	}
+
+        _STARPU_MPI_LOG_OUT();
+	return ret;
+}
+
+/*
+ *	Barrier
+ */
+
+static void starpu_mpi_barrier_func(struct _starpu_mpi_req *barrier_req)
+{
+        _STARPU_MPI_LOG_IN();
+
+	barrier_req->ret = MPI_Barrier(barrier_req->comm);
+        STARPU_ASSERT(barrier_req->ret == MPI_SUCCESS);
+
+	handle_request_termination(barrier_req);
+        _STARPU_MPI_LOG_OUT();
+}
+
+int starpu_mpi_barrier(MPI_Comm comm)
+{
+        _STARPU_MPI_LOG_IN();
+	int ret;
+	struct _starpu_mpi_req *barrier_req = calloc(1, sizeof(struct _starpu_mpi_req));
+	STARPU_ASSERT(barrier_req);
+
+	/* First wait for *both* all tasks and MPI requests to finish, in case
+	 * some tasks generate MPI requests, MPI requests generate tasks, etc.
+	 */
+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	STARPU_ASSERT_MSG(!barrier_running, "Concurrent starpu_mpi_barrier is not implemented, even on different communicators");
+	barrier_running = 1;
+	do {
+		while (posted_requests)
+			/* Wait for all current MPI requests to finish */
+			_STARPU_PTHREAD_COND_WAIT(&cond_finished, &mutex);
+		/* No current request, clear flag */
+		newer_requests = 0;
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+		/* Now wait for all tasks */
+		starpu_task_wait_for_all();
+		_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+		/* Check newer_requests again, in case some MPI requests
+		 * triggered by tasks completed and triggered tasks between
+		 * wait_for_all finished and we take the lock */
+	} while (posted_requests || newer_requests);
+	barrier_running = 0;
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+
+	/* Initialize the request structure */
+	_STARPU_PTHREAD_MUTEX_INIT(&(barrier_req->req_mutex), NULL);
+	_STARPU_PTHREAD_COND_INIT(&(barrier_req->req_cond), NULL);
+	barrier_req->func = starpu_mpi_barrier_func;
+	barrier_req->request_type = BARRIER_REQ;
+	barrier_req->comm = comm;
+
+        INC_POSTED_REQUESTS(1);
+	submit_mpi_req(barrier_req);
+
+	/* We wait for the MPI request to finish */
+	_STARPU_PTHREAD_MUTEX_LOCK(&barrier_req->req_mutex);
+	while (!barrier_req->completed)
+		_STARPU_PTHREAD_COND_WAIT(&barrier_req->req_cond, &barrier_req->req_mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&barrier_req->req_mutex);
+
+	ret = barrier_req->ret;
+
+        //free(waiting_req);
+        _STARPU_MPI_LOG_OUT();
+	return ret;
+}
+
+/*
+ *	Requests
+ */
+
+#ifdef STARPU_MPI_VERBOSE
+static char *starpu_mpi_request_type(unsigned request_type)
+{
+        switch (request_type)
+                {
+                case SEND_REQ: return "send";
+                case RECV_REQ: return "recv";
+                case WAIT_REQ: return "wait";
+                case TEST_REQ: return "test";
+                case BARRIER_REQ: return "barrier";
+                default: return "unknown request type";
+                }
+}
+#endif
+
+static void handle_request_termination(struct _starpu_mpi_req *req)
+{
+        _STARPU_MPI_LOG_IN();
+
+	_STARPU_MPI_DEBUG("complete MPI (%s %d) data %p req %p - tag %d\n", starpu_mpi_request_type(req->request_type), req->srcdst, req->data_handle, &req->request, req->mpi_tag);
+        if (req->request_type != BARRIER_REQ) {
+		if (req->needs_unpacking)
+			starpu_handle_unpack_data(req->data_handle, req->ptr);
+		else
+			MPI_Type_free(&req->datatype);
+                starpu_data_release(req->data_handle);
+        }
+
+	if (req->request_type == RECV_REQ)
+	{
+		TRACE_MPI_IRECV_END(req->srcdst, req->mpi_tag);
+	}
+
+	/* Execute the specified callback, if any */
+	if (req->callback)
+		req->callback(req->callback_arg);
+
+	/* tell anyone potentially waiting on the request that it is
+	 * terminated now */
+	_STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
+	req->completed = 1;
+	_STARPU_PTHREAD_COND_BROADCAST(&req->req_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
+        _STARPU_MPI_LOG_OUT();
+}
+
+static void submit_mpi_req(void *arg)
+{
+        _STARPU_MPI_LOG_IN();
+	struct _starpu_mpi_req *req = arg;
+
+        INC_POSTED_REQUESTS(-1);
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	_starpu_mpi_req_list_push_front(new_requests, req);
+	newer_requests = 1;
+        _STARPU_MPI_DEBUG("Pushing new request type %d\n", req->request_type);
+	_STARPU_PTHREAD_COND_BROADCAST(&cond_progression);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+        _STARPU_MPI_LOG_OUT();
+}
+
+/*
+ *	Scheduler hook
+ */
+
+#ifdef USE_STARPU_ACTIVITY
+static unsigned progression_hook_func(void *arg __attribute__((unused)))
+{
+	unsigned may_block = 1;
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	if (!_starpu_mpi_req_list_empty(detached_requests))
+	{
+		_STARPU_PTHREAD_COND_SIGNAL(&cond_progression);
+		may_block = 0;
+	}
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+
+	return may_block;
+}
+#endif
+
+/*
+ *	Progression loop
+ */
+
+static void test_detached_requests(void)
+{
+        _STARPU_MPI_LOG_IN();
+	int flag;
+	MPI_Status status;
+	struct _starpu_mpi_req *req, *next_req;
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
+
+	for (req = _starpu_mpi_req_list_begin(detached_requests);
+		req != _starpu_mpi_req_list_end(detached_requests);
+		req = next_req)
+	{
+		next_req = _starpu_mpi_req_list_next(req);
+
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
+
+                //_STARPU_MPI_DEBUG("Test detached request %p - mpitag %d - TYPE %s %d\n", &req->request, req->mpi_tag, (req->request_type == RECV_REQ)?"recv : source":"send : dest", req->srcdst);
+		req->ret = MPI_Test(&req->request, &flag, &status);
+		STARPU_ASSERT(req->ret == MPI_SUCCESS);
+
+		if (flag)
+		{
+			handle_request_termination(req);
+		}
+
+		_STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
+
+		if (flag)
+			_starpu_mpi_req_list_erase(detached_requests, req);
+
+#ifdef STARPU_DEVEL
+#warning TODO fix memleak
+#endif
+		/* Detached requests are automatically allocated by the lib */
+		//if (req->detached)
+		//	free(req);
+	}
+
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
+        _STARPU_MPI_LOG_OUT();
+}
+
+static void handle_new_request(struct _starpu_mpi_req *req)
+{
+        _STARPU_MPI_LOG_IN();
+	STARPU_ASSERT(req);
+
+	/* submit the request to MPI */
+        _STARPU_MPI_DEBUG("Handling new request type %d\n", req->request_type);
+	req->func(req);
+
+	if (req->detached)
+	{
+		_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+		_starpu_mpi_req_list_push_front(detached_requests, req);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+
+		starpu_wake_all_blocked_workers();
+
+		/* put the submitted request into the list of pending requests
+		 * so that it can be handled by the progression mechanisms */
+		_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+		_STARPU_PTHREAD_COND_SIGNAL(&cond_progression);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+	}
+        _STARPU_MPI_LOG_OUT();
+}
+
+static void *progress_thread_func(void *arg)
+{
+        int initialize_mpi = *((int *) arg);
+
+        _STARPU_DEBUG("Initialize mpi: %d\n", initialize_mpi);
+
+        if (initialize_mpi) {
+#ifdef STARPU_DEVEL
+#warning get real argc and argv from the application
+#endif
+                int argc = 0;
+                char **argv = NULL;
+                int thread_support;
+                _STARPU_DEBUG("Calling MPI_Init_thread\n");
+                if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS) {
+                        fprintf(stderr,"MPI_Init_thread failed\n");
+                        exit(1);
+                }
+                if (thread_support == MPI_THREAD_FUNNELED)
+                        fprintf(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n");
+                if (thread_support < MPI_THREAD_FUNNELED)
+                        fprintf(stderr,"Warning: MPI does not have thread support!\n");
+        }
+
+	/* notify the main thread that the progression thread is ready */
+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	running = 1;
+	_STARPU_PTHREAD_COND_SIGNAL(&cond_progression);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	while (running || posted_requests || !(_starpu_mpi_req_list_empty(new_requests)) || !(_starpu_mpi_req_list_empty(detached_requests))) {
+		/* shall we block? */
+		unsigned block = _starpu_mpi_req_list_empty(new_requests);
+
+#ifndef USE_STARPU_ACTIVITY
+		block = block && _starpu_mpi_req_list_empty(detached_requests);
+#endif
+
+		if (block)
+		{
+                        _STARPU_MPI_DEBUG("NO MORE REQUESTS TO HANDLE\n");
+			if (barrier_running)
+				/* Tell mpi_barrier */
+				_STARPU_PTHREAD_COND_SIGNAL(&cond_finished);
+			_STARPU_PTHREAD_COND_WAIT(&cond_progression, &mutex);
+		}
+
+		/* test whether some "detached requests" have terminated */
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+		test_detached_requests();
+		_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+
+		/* get one request */
+		struct _starpu_mpi_req *req;
+		while (!_starpu_mpi_req_list_empty(new_requests))
+		{
+			req = _starpu_mpi_req_list_pop_back(new_requests);
+
+			/* handling a request is likely to block for a while
+			 * (on a sync_data_with_mem call), we want to let the
+			 * application submit requests in the meantime, so we
+			 * release the lock.  */
+			_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+			handle_new_request(req);
+			_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+		}
+	}
+
+	STARPU_ASSERT(_starpu_mpi_req_list_empty(detached_requests));
+	STARPU_ASSERT(_starpu_mpi_req_list_empty(new_requests));
+        STARPU_ASSERT(posted_requests == 0);
+
+        if (initialize_mpi) {
+                _STARPU_MPI_DEBUG("Calling MPI_Finalize()\n");
+                MPI_Finalize();
+        }
+
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+
+	return NULL;
+}
+
+/*
+ *	(De)Initialization methods
+ */
+
+#ifdef USE_STARPU_ACTIVITY
+static int hookid = - 1;
+#endif
+
+static void _starpu_mpi_add_sync_point_in_fxt(void)
+{
+#ifdef STARPU_USE_FXT
+	int rank;
+	int worldsize;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &worldsize);
+
+	int barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
+	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
+
+	/* We generate a "unique" key so that we can make sure that different
+	 * FxT traces come from the same MPI run. */
+	int random_number;
+
+	/* XXX perhaps we don't want to generate a new seed if the application
+	 * specified some reproducible behaviour? */
+	if (rank == 0)
+	{
+		srand(time(NULL));
+		random_number = rand();
+	}
+
+	MPI_Bcast(&random_number, 1, MPI_INT, 0, MPI_COMM_WORLD);
+
+	TRACE_MPI_BARRIER(rank, worldsize, random_number);
+
+        _STARPU_MPI_DEBUG("unique key %x\n", random_number);
+#endif
+}
+
+static
+int _starpu_mpi_initialize(int initialize_mpi, int *rank, int *world_size)
+{
+#ifndef STARPU_MPI_CACHE
+	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --disable-mpi-cache\n");
+#endif
+
+	_STARPU_PTHREAD_MUTEX_INIT(&mutex, NULL);
+	_STARPU_PTHREAD_COND_INIT(&cond_progression, NULL);
+	_STARPU_PTHREAD_COND_INIT(&cond_finished, NULL);
+	new_requests = _starpu_mpi_req_list_new();
+
+	_STARPU_PTHREAD_MUTEX_INIT(&detached_requests_mutex, NULL);
+	detached_requests = _starpu_mpi_req_list_new();
+
+        _STARPU_PTHREAD_MUTEX_INIT(&mutex_posted_requests, NULL);
+
+	_STARPU_PTHREAD_CREATE(&progress_thread, NULL,
+			       progress_thread_func, (void *)&initialize_mpi);
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	while (!running)
+		_STARPU_PTHREAD_COND_WAIT(&cond_progression, &mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+
+        if (rank && world_size) {
+                _STARPU_DEBUG("Calling MPI_Comm_rank\n");
+                MPI_Comm_rank(MPI_COMM_WORLD, rank);
+                MPI_Comm_size(MPI_COMM_WORLD, world_size);
+        }
+
+#ifdef STARPU_USE_FXT
+	int prank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &prank);
+	starpu_set_profiling_id(prank);
+#endif //STARPU_USE_FXT
+
+#ifdef USE_STARPU_ACTIVITY
+	hookid = starpu_progression_hook_register(progression_hook_func, NULL);
+	STARPU_ASSERT(hookid >= 0);
+#endif
+
+	_starpu_mpi_add_sync_point_in_fxt();
+	_starpu_mpi_comm_amounts_init(MPI_COMM_WORLD);
+	return 0;
+}
+
+int starpu_mpi_initialize(void)
+{
+        return _starpu_mpi_initialize(0, NULL, NULL);
+}
+
+int starpu_mpi_initialize_extended(int *rank, int *world_size)
+{
+        return _starpu_mpi_initialize(1, rank, world_size);
+}
+
+int starpu_mpi_shutdown(void)
+{
+	void *value;
+	int rank;
+
+	/* We need to get the rank before MPI_Finalize is called, to pass it to _starpu_mpi_comm_amounts_display() */
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+	/* kill the progression thread */
+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	running = 0;
+	_STARPU_PTHREAD_COND_BROADCAST(&cond_progression);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+
+	pthread_join(progress_thread, &value);
+
+#ifdef USE_STARPU_ACTIVITY
+	starpu_progression_hook_deregister(hookid);
+#endif
+
+	/* free the request queues */
+	_starpu_mpi_req_list_delete(detached_requests);
+	_starpu_mpi_req_list_delete(new_requests);
+
+	_starpu_mpi_comm_amounts_display(rank);
+	_starpu_mpi_comm_amounts_free();
+
+	return 0;
+}
+

+ 78 - 0
mpi/src/starpu_mpi_collective.c

@@ -0,0 +1,78 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <mpi.h>
+#include <starpu.h>
+#include <starpu_mpi.h>
+
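+/* For each non-NULL handle, the root exchanges the data with its owner node
+ * (as set with starpu_data_set_rank()) through detached, i.e. non-blocking,
+ * transfers, using the tag set with starpu_data_set_tag(). */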
+int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm)
+{
+	int rank;
+	int x;
+
+	MPI_Comm_rank(comm, &rank);
+
+	for(x = 0; x < count ;  x++)
+	{
+		if (data_handles[x])
+		{
+			int owner = starpu_data_get_rank(data_handles[x]);
+			int mpi_tag = starpu_data_get_tag(data_handles[x]);
+			STARPU_ASSERT(mpi_tag >= 0);
+			if ((rank == root) && (owner != root))
+			{
+				//fprintf(stderr, "[%d] Sending data[%d] to %d\n", rank, x, owner);
+				starpu_mpi_isend_detached(data_handles[x], owner, mpi_tag, comm, NULL, NULL);
+			}
+			if ((rank != root) && (owner == rank))
+			{
+				//fprintf(stderr, "[%d] Receiving data[%d] from %d\n", rank, x, root);
+				starpu_mpi_irecv_detached(data_handles[x], root, mpi_tag, comm, NULL, NULL);
+			}
+		}
+	}
+	return 0;
+}
+
+int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm)
+{
+	int rank;
+	int x;
+
+	MPI_Comm_rank(comm, &rank);
+
+	for(x = 0; x < count ;  x++)
+	{
+		if (data_handles[x])
+		{
+			int owner = starpu_data_get_rank(data_handles[x]);
+			int mpi_tag = starpu_data_get_tag(data_handles[x]);
+			STARPU_ASSERT(mpi_tag >= 0);
+			if ((rank == root) && (owner != root))
+			{
+				//fprintf(stderr, "[%d] Receiving data[%d] from %d\n", rank, x, owner);
+				starpu_mpi_irecv_detached(data_handles[x], owner, mpi_tag, comm, NULL, NULL);
+			}
+			if ((rank != root) && (owner == rank))
+			{
+				//fprintf(stderr, "[%d] Sending data[%d] to %d\n", rank, x, root);
+				starpu_mpi_isend_detached(data_handles[x], root, mpi_tag, comm, NULL, NULL);
+			}
+		}
+	}
+	return 0;
+}
+

+ 149 - 0
mpi/src/starpu_mpi_datatype.c

@@ -0,0 +1,149 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi_datatype.h>
+
+typedef int (*handle_to_datatype_func)(starpu_data_handle_t, MPI_Datatype *);
+
+/*
+ * 	Matrix
+ */
+
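+/* e.g. a 2x3 float matrix (nx = 2, ny = 3) stored with ld = 4 becomes an
+ * MPI vector of ny = 3 blocks of nx*elemsize = 8 contiguous bytes, with
+ * consecutive blocks starting ld*elemsize = 16 bytes apart */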
+static int handle_to_datatype_matrix(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+{
+	int ret;
+
+	unsigned nx = starpu_matrix_get_nx(data_handle);
+	unsigned ny = starpu_matrix_get_ny(data_handle);
+	unsigned ld = starpu_matrix_get_local_ld(data_handle);
+	size_t elemsize = starpu_matrix_get_elemsize(data_handle);
+
+	ret = MPI_Type_vector(ny, nx*elemsize, ld*elemsize, MPI_BYTE, datatype);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
+
+	ret = MPI_Type_commit(datatype);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
+
+	return 0;
+}
+
+/*
+ * 	Block
+ */
+
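+/* The type is built in two steps: an inner vector describes one z-slice
+ * (ny rows of nx elements, consecutive rows ldy elements apart), and an
+ * outer hvector stacks nz such slices ldz*elemsize bytes apart */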
+static int handle_to_datatype_block(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+{
+	int ret;
+
+	unsigned nx = starpu_block_get_nx(data_handle);
+	unsigned ny = starpu_block_get_ny(data_handle);
+	unsigned nz = starpu_block_get_nz(data_handle);
+	unsigned ldy = starpu_block_get_local_ldy(data_handle);
+	unsigned ldz = starpu_block_get_local_ldz(data_handle);
+	size_t elemsize = starpu_block_get_elemsize(data_handle);
+
+	MPI_Datatype datatype_2dlayer;
+	ret = MPI_Type_vector(ny, nx*elemsize, ldy*elemsize, MPI_BYTE, &datatype_2dlayer);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
+
+	ret = MPI_Type_commit(&datatype_2dlayer);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
+
+	ret = MPI_Type_hvector(nz, 1, ldz*elemsize, datatype_2dlayer, datatype);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
+
+	ret = MPI_Type_commit(datatype);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
+
+	/* The intermediate type is no longer needed once the composite type is committed */
+	ret = MPI_Type_free(&datatype_2dlayer);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
+
+	return 0;
+}
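The block case nests two types: the inner vector describes one 2-D layer, and the hvector repeats that layer nz times at a byte stride of ldz*elemsize. For a (purely illustrative) 4x4x4 block of floats inside a 16x16x16 allocation:

	MPI_Datatype layer, block;
	MPI_Type_vector(4, 4 * sizeof(float), 16 * sizeof(float), MPI_BYTE, &layer);
	MPI_Type_commit(&layer);
	MPI_Type_hvector(4, 1, 16 * 16 * sizeof(float), layer, &block);
	MPI_Type_commit(&block);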
+
+/*
+ * 	Vector
+ */
+
+static int handle_to_datatype_vector(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+{
+	int ret;
+
+	unsigned nx = starpu_vector_get_nx(data_handle);
+	size_t elemsize = starpu_vector_get_elemsize(data_handle);
+
+	ret = MPI_Type_contiguous(nx*elemsize, MPI_BYTE, datatype);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
+
+	ret = MPI_Type_commit(datatype);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
+
+	return 0;
+}
+
+/*
+ * 	Variable
+ */
+
+static int handle_to_datatype_variable(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+{
+	int ret;
+
+	size_t elemsize = starpu_variable_get_elemsize(data_handle);
+
+	ret = MPI_Type_contiguous(elemsize, MPI_BYTE, datatype);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
+
+	ret = MPI_Type_commit(datatype);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
+
+	return 0;
+}
+
+/*
+ *	Generic
+ */
+
+static handle_to_datatype_func handle_to_datatype_funcs[STARPU_MAX_INTERFACE_ID] =
+{
+	[STARPU_MATRIX_INTERFACE_ID]	= handle_to_datatype_matrix,
+	[STARPU_BLOCK_INTERFACE_ID]	= handle_to_datatype_block,
+	[STARPU_VECTOR_INTERFACE_ID]	= handle_to_datatype_vector,
+	[STARPU_CSR_INTERFACE_ID]	= NULL,
+	[STARPU_BCSR_INTERFACE_ID]	= NULL,
+	[STARPU_VARIABLE_INTERFACE_ID]	= handle_to_datatype_variable,
+	[STARPU_VOID_INTERFACE_ID]      = NULL,
+	[STARPU_MULTIFORMAT_INTERFACE_ID] = NULL,
+};
+
+int starpu_mpi_handle_to_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype, int *count)
+{
+	enum starpu_data_interface_id id = starpu_handle_get_interface_id(data_handle);
+
+	if (id <= STARPU_MULTIFORMAT_INTERFACE_ID)
+	{
+		handle_to_datatype_func func = handle_to_datatype_funcs[id];
+		STARPU_ASSERT(func);
+		func(data_handle, datatype);
+		*count = 1;
+		return 0;
+	}
+	else
+	{
+		/* The datatype is not predefined by StarPU */
+		*count = starpu_handle_get_size(data_handle);
+		*datatype = MPI_BYTE;
+		return 1;
+	}
+}
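For reference, a sketch of how this dispatcher is consumed (hypothetical excerpt; `ptr`, `dst`, `mpi_tag`, `comm` and `request` are stand-ins, with `ptr` denoting the handle's local buffer): a nonzero return flags the MPI_BYTE fallback, in which case there is no committed type to free.

	MPI_Datatype datatype;
	int count;
	int is_raw = starpu_mpi_handle_to_datatype(data_handle, &datatype, &count);
	MPI_Isend(ptr, count, datatype, dst, mpi_tag, comm, &request);
	if (!is_raw)
		MPI_Type_free(&datatype);	/* StarPU-built types were committed above */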

+ 33 - 0
mpi/src/starpu_mpi_datatype.h

@@ -0,0 +1,33 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_DATATYPE_H__
+#define __STARPU_MPI_DATATYPE_H__
+
+#include <starpu_mpi.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int starpu_mpi_handle_to_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype, int *count);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __STARPU_MPI_DATATYPE_H__

+ 45 - 0
mpi/src/starpu_mpi_fxt.h

@@ -0,0 +1,45 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_FXT_H__
+#define __STARPU_MPI_FXT_H__
+
+#include <starpu.h>
+#include <common/config.h>
+#include <common/fxt.h>
+
+#define FUT_MPI_BARRIER		0x5201
+#define FUT_MPI_ISEND		0x5202
+#define FUT_MPI_IRECV_END	0x5203
+
+#ifdef STARPU_USE_FXT
+#define TRACE_MPI_BARRIER(rank, worldsize, key)	\
+	FUT_DO_PROBE4(FUT_MPI_BARRIER, (rank), (worldsize), (key), _starpu_gettid())
+#define TRACE_MPI_ISEND(dest, mpi_tag, size)	\
+	FUT_DO_PROBE4(FUT_MPI_ISEND, (dest), (mpi_tag), (size), _starpu_gettid())
+#define TRACE_MPI_IRECV_END(src, mpi_tag)	\
+	FUT_DO_PROBE3(FUT_MPI_IRECV_END, (src), (mpi_tag), _starpu_gettid())
+#else
+#define TRACE_MPI_BARRIER(a, b, c)	do {} while(0)
+#define TRACE_MPI_ISEND(a, b, c)	do {} while(0)
+#define TRACE_MPI_IRECV_END(a, b)	do {} while(0)
+#endif
+
+
+
+#endif // __STARPU_MPI_FXT_H__

+ 104 - 0
mpi/src/starpu_mpi_helper.c

@@ -0,0 +1,104 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+
+static void starpu_mpi_unlock_tag_callback(void *arg)
+{
+	starpu_tag_t *tagptr = arg;
+
+	starpu_tag_notify_from_apps(*tagptr);
+
+	free(tagptr);
+}
+
+int starpu_mpi_isend_detached_unlock_tag(starpu_data_handle_t data_handle,
+				int dest, int mpi_tag, MPI_Comm comm, starpu_tag_t tag)
+{
+	starpu_tag_t *tagptr = malloc(sizeof(starpu_tag_t));
+	*tagptr = tag;
+	
+	return starpu_mpi_isend_detached(data_handle, dest, mpi_tag, comm,
+						starpu_mpi_unlock_tag_callback, tagptr);
+}
+
+
+int starpu_mpi_irecv_detached_unlock_tag(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, starpu_tag_t tag)
+{
+	starpu_tag_t *tagptr = malloc(sizeof(starpu_tag_t));
+	*tagptr = tag;
+	
+	return starpu_mpi_irecv_detached(data_handle, source, mpi_tag, comm,
+						starpu_mpi_unlock_tag_callback, tagptr);
+}
+
+struct arg_array {
+	int array_size;
+	starpu_tag_t tag;
+};
+
+static void starpu_mpi_array_unlock_callback(void *_arg)
+{
+	struct arg_array *arg = _arg;
+
+	int remaining = STARPU_ATOMIC_ADD(&arg->array_size, -1);
+
+	if (remaining == 0)
+	{
+		starpu_tag_notify_from_apps(arg->tag);
+		free(arg);
+	}
+}
+
+int starpu_mpi_isend_array_detached_unlock_tag(unsigned array_size,
+		starpu_data_handle_t *data_handle, int *dest, int *mpi_tag,
+		MPI_Comm *comm, starpu_tag_t tag)
+{
+	if (array_size == 0)
+	{
+		/* Nothing to post: unlock the tag right away */
+		starpu_tag_notify_from_apps(tag);
+		return 0;
+	}
+
+	struct arg_array *arg = malloc(sizeof(struct arg_array));
+
+	arg->array_size = array_size;
+	arg->tag = tag;
+
+	unsigned elem;
+	for (elem = 0; elem < array_size; elem++)
+	{
+		starpu_mpi_isend_detached(data_handle[elem], dest[elem],
+				mpi_tag[elem], comm[elem],
+				starpu_mpi_array_unlock_callback, arg);
+	}
+
+	return 0;
+}
+
+
+int starpu_mpi_irecv_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *source, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag)
+{
+	if (array_size == 0)
+	{
+		/* Nothing to post: unlock the tag right away */
+		starpu_tag_notify_from_apps(tag);
+		return 0;
+	}
+
+	struct arg_array *arg = malloc(sizeof(struct arg_array));
+
+	arg->array_size = array_size;
+	arg->tag = tag;
+
+	unsigned elem;
+	for (elem = 0; elem < array_size; elem++)
+	{
+		starpu_mpi_irecv_detached(data_handle[elem], source[elem],
+				mpi_tag[elem], comm[elem],
+				starpu_mpi_array_unlock_callback, arg);
+	}
+
+	return 0;
+}
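A sketch of the intended pattern (hypothetical tag and peer values): the detached receive completes asynchronously and then unlocks a StarPU tag, which the application can wait on or hang task dependencies off.

	starpu_tag_t tag = 0x42;	/* hypothetical application tag */
	starpu_mpi_irecv_detached_unlock_tag(handle, 0 /* source */, 7 /* mpi_tag */,
					     MPI_COMM_WORLD, tag);
	/* Either block until the data has landed ... */
	starpu_tag_wait(tag);
	/* ... or make other tags depend on it: starpu_tag_declare_deps(dep, 1, tag); */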

+ 632 - 0
mpi/src/starpu_mpi_insert_task.c

@@ -0,0 +1,632 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011-2012  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdarg.h>
+#include <mpi.h>
+
+#include <starpu.h>
+#include <starpu_data.h>
+#include <common/utils.h>
+#include <common/uthash.h>
+#include <util/starpu_insert_task_utils.h>
+#include <datawizard/coherency.h>
+
+//#define STARPU_MPI_VERBOSE 1
+#include <starpu_mpi_private.h>
+
+#ifdef STARPU_MPI_CACHE
+/* Whether we are allowed to keep copies of remote data. */
+struct _starpu_data_entry
+{
+	UT_hash_handle hh;
+	void *data;
+};
+
+static struct _starpu_data_entry **sent_data = NULL;
+static struct _starpu_data_entry **received_data = NULL;
+#endif /* STARPU_MPI_CACHE */
+
+static void _starpu_mpi_tables_init()
+{
+#ifdef STARPU_MPI_CACHE
+	if (sent_data == NULL) {
+		int nb_nodes;
+		int i;
+
+		MPI_Comm_size(MPI_COMM_WORLD, &nb_nodes);
+		_STARPU_MPI_DEBUG("Initialising htable for cache\n");
+		sent_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
+		for(i=0 ; i<nb_nodes ; i++) sent_data[i] = NULL;
+		received_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
+		for(i=0 ; i<nb_nodes ; i++) received_data[i] = NULL;
+	}
+#endif /* STARPU_MPI_CACHE */
+}
+
+static
+int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int *do_execute, int *inconsistent_execute, int *dest, size_t *size_on_nodes)
+{
+	if (data && mode & STARPU_R) {
+		struct starpu_data_interface_ops *ops;
+		int rank = starpu_data_get_rank(data);
+
+		ops = data->ops;
+		size_on_nodes[rank] += ops->get_size(data);
+	}
+
+	if (mode & STARPU_W) {
+		if (!data) {
+			/* We don't have anything allocated for this.
+			 * The application knows we won't do anything
+			 * about this task */
+			/* Yes, the app could actually not call
+			 * insert_task at all itself, this is just a
+			 * safeguard. */
+			_STARPU_MPI_DEBUG("oh oh\n");
+			_STARPU_MPI_LOG_OUT();
+			return -EINVAL;
+		}
+		int mpi_rank = starpu_data_get_rank(data);
+		if (mpi_rank == me) {
+			if (*do_execute == 0) {
+				*inconsistent_execute = 1;
+			}
+			else {
+				*do_execute = 1;
+			}
+		}
+		else if (mpi_rank != -1) {
+			if (*do_execute == 1) {
+				*inconsistent_execute = 1;
+			}
+			else {
+				*do_execute = 0;
+				*dest = mpi_rank;
+				/* That is the rank to which the data will need to be sent */
+			}
+		}
+		else {
+			_STARPU_ERROR("rank invalid\n");
+		}
+	}
+	return 0;
+}
+
+static
+void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int dest, int do_execute, MPI_Comm comm)
+{
+	if (data && mode & STARPU_R) {
+		int mpi_rank = starpu_data_get_rank(data);
+		int mpi_tag = starpu_data_get_tag(data);
+		if(mpi_rank == -1) {
+			fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank\n");
+			STARPU_ABORT();
+		}
+		if(mpi_tag == -1) {
+			fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_data_set_tag\n");
+			STARPU_ABORT();
+		}
+		/* The task needs to read this data */
+		if (do_execute && mpi_rank != me && mpi_rank != -1) {
+			/* I will have to execute but I don't have the data, receive */
+#ifdef STARPU_MPI_CACHE
+			struct _starpu_data_entry *already_received;
+			HASH_FIND_PTR(received_data[mpi_rank], &data, already_received);
+			if (already_received == NULL) {
+				struct _starpu_data_entry *entry = (struct _starpu_data_entry *)malloc(sizeof(*entry));
+				entry->data = data;
+				HASH_ADD_PTR(received_data[mpi_rank], data, entry);
+			}
+			else {
+				_STARPU_MPI_DEBUG("Do not receive data %p from node %d as it is already available\n", data, mpi_rank);
+			}
+			if (!already_received)
+#endif
+			{
+				_STARPU_MPI_DEBUG("Receive data %p from %d\n", data, mpi_rank);
+				starpu_mpi_irecv_detached(data, mpi_rank, mpi_tag, comm, NULL, NULL);
+			}
+		}
+		if (!do_execute && mpi_rank == me) {
+			/* Somebody else will execute it, and I have the data, send it. */
+#ifdef STARPU_MPI_CACHE
+			struct _starpu_data_entry *already_sent;
+			HASH_FIND_PTR(sent_data[dest], &data, already_sent);
+			if (already_sent == NULL) {
+				struct _starpu_data_entry *entry = (struct _starpu_data_entry *)malloc(sizeof(*entry));
+				entry->data = data;
+				HASH_ADD_PTR(sent_data[dest], data, entry);
+				_STARPU_MPI_DEBUG("Noting that data %p has already been sent to %d\n", data, dest);
+			}
+			else {
+				_STARPU_MPI_DEBUG("Do not send data %p to node %d as it has already been sent\n", data, dest);
+			}
+			if (!already_sent)
+#endif
+			{
+				_STARPU_MPI_DEBUG("Send data %p to %d\n", data, dest);
+				starpu_mpi_isend_detached(data, dest, mpi_tag, comm, NULL, NULL);
+			}
+		}
+	}
+}
+
+static
+void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int xrank, int dest, int do_execute, MPI_Comm comm)
+{
+	if (mode & STARPU_W) {
+		int mpi_rank = starpu_data_get_rank(data);
+		int mpi_tag = starpu_data_get_tag(data);
+		if(mpi_rank == -1) {
+			fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank\n");
+			STARPU_ABORT();
+		}
+		if(mpi_tag == -1) {
+			fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_data_set_tag\n");
+			STARPU_ABORT();
+		}
+		if (mpi_rank == me) {
+			if (xrank != -1 && me != xrank) {
+				_STARPU_MPI_DEBUG("Receive data %p back from the task %d which executed the codelet ...\n", data, dest);
+				starpu_mpi_irecv_detached(data, dest, mpi_tag, comm, NULL, NULL);
+			}
+		}
+		else if (do_execute) {
+			_STARPU_MPI_DEBUG("Send data %p back to its owner %d...\n", data, mpi_rank);
+			starpu_mpi_isend_detached(data, mpi_rank, mpi_tag, comm, NULL, NULL);
+		}
+	}
+}
+
+void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int do_execute, MPI_Comm comm)
+{
+#ifdef STARPU_MPI_CACHE
+	if (mode & STARPU_W) {
+		if (do_execute) {
+			/* Note that all copies I've sent to neighbours are now invalid */
+			int n, size;
+			MPI_Comm_size(comm, &size);
+			for(n=0 ; n<size ; n++) {
+				struct _starpu_data_entry *already_sent;
+				HASH_FIND_PTR(sent_data[n], &data, already_sent);
+				if (already_sent) {
+					_STARPU_MPI_DEBUG("Clearing send cache for data %p\n", data);
+					HASH_DEL(sent_data[n], already_sent);
+					free(already_sent);
+				}
+			}
+		}
+		else {
+			int mpi_rank = starpu_data_get_rank(data);
+			struct _starpu_data_entry *already_received;
+			HASH_FIND_PTR(received_data[mpi_rank], &data, already_received);
+			if (already_received) {
+				/* Somebody else will write to the data, so discard our cached copy if any */
+				/* TODO: starpu_mpi could just remember itself. */
+				_STARPU_MPI_DEBUG("Clearing receive cache for data %p\n", data);
+				HASH_DEL(received_data[mpi_rank], already_received);
+				free(already_received);
+				starpu_data_invalidate_submit(data);
+			}
+		}
+	}
+#else
+	/* We allocated a temporary buffer for the received data, now drop it */
+	if ((mode & STARPU_R) && do_execute) {
+		int mpi_rank = starpu_data_get_rank(data);
+		if (mpi_rank != me && mpi_rank != -1) {
+			starpu_data_invalidate_submit(data);
+		}
+	}
+#endif
+}
+
+int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
+{
+	int arg_type;
+	va_list varg_list;
+	int me, do_execute, xrank, nb_nodes;
+	size_t *size_on_nodes;
+	size_t arg_buffer_size = 0;
+	char *arg_buffer;
+	int dest=0, inconsistent_execute;
+	int current_data = 0;
+
+	_STARPU_MPI_LOG_IN();
+
+	MPI_Comm_rank(comm, &me);
+	MPI_Comm_size(comm, &nb_nodes);
+
+	size_on_nodes = (size_t *)calloc(1, nb_nodes * sizeof(size_t));
+
+	_starpu_mpi_tables_init();
+
+	/* Get the number of buffers and the size of the arguments */
+	va_start(varg_list, codelet);
+	arg_buffer_size = _starpu_insert_task_get_arg_size(varg_list);
+
+	va_start(varg_list, codelet);
+	_starpu_codelet_pack_args(arg_buffer_size, &arg_buffer, varg_list);
+
+	/* Find out whether we are to execute the task, because we own the data to be written to. */
+	inconsistent_execute = 0;
+	do_execute = -1;
+	xrank = -1;
+	va_start(varg_list, codelet);
+	while ((arg_type = va_arg(varg_list, int)) != 0) {
+		if (arg_type==STARPU_EXECUTE_ON_NODE) {
+			xrank = va_arg(varg_list, int);
+			_STARPU_MPI_DEBUG("Executing on node %d\n", xrank);
+			do_execute = 1;
+		}
+		else if (arg_type==STARPU_EXECUTE_ON_DATA) {
+			starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
+			xrank = starpu_data_get_rank(data);
+			_STARPU_MPI_DEBUG("Executing on data node %d\n", xrank);
+			STARPU_ASSERT(xrank >= 0 && xrank < nb_nodes);
+			do_execute = 1;
+		}
+		else if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type==STARPU_SCRATCH || arg_type==STARPU_REDUX) {
+			starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
+			enum starpu_access_mode mode = (enum starpu_access_mode) arg_type;
+			int ret = _starpu_mpi_find_executee_node(data, mode, me, &do_execute, &inconsistent_execute, &dest, size_on_nodes);
+			if (ret == -EINVAL)
+			{
+				free(size_on_nodes);
+				return ret;
+			}
+			current_data ++;
+		}
+		else if (arg_type == STARPU_DATA_ARRAY)
+		{
+			starpu_data_handle_t *datas = va_arg(varg_list, starpu_data_handle_t *);
+			int nb_handles = va_arg(varg_list, int);
+			int i;
+			for(i=0 ; i<nb_handles ; i++)
+			{
+				enum starpu_access_mode mode = codelet->modes[current_data];
+				int ret = _starpu_mpi_find_executee_node(datas[i], mode, me, &do_execute, &inconsistent_execute, &dest, size_on_nodes);
+				if (ret == -EINVAL)
+				{
+					free(size_on_nodes);
+					return ret;
+				}
+				current_data ++;
+			}
+		}
+		else if (arg_type==STARPU_VALUE) {
+			va_arg(varg_list, void *);
+			va_arg(varg_list, size_t);
+		}
+		else if (arg_type==STARPU_CALLBACK) {
+			va_arg(varg_list, void (*)(void *));
+		}
+		else if (arg_type==STARPU_CALLBACK_WITH_ARG) {
+			va_arg(varg_list, void (*)(void *));
+			va_arg(varg_list, void *);
+		}
+		else if (arg_type==STARPU_CALLBACK_ARG) {
+			va_arg(varg_list, void *);
+		}
+		else if (arg_type==STARPU_PRIORITY) {
+			va_arg(varg_list, int);
+		}
+		/* STARPU_EXECUTE_ON_NODE and STARPU_EXECUTE_ON_DATA are already
+		 * handled at the top of this chain */
+	}
+	va_end(varg_list);
+
+	if (do_execute == -1) {
+		int i;
+		size_t max_size = 0;
+		for(i=0 ; i<nb_nodes ; i++) {
+			if (size_on_nodes[i] > max_size)
+			{
+				max_size = size_on_nodes[i];
+				xrank = i;
+			}
+		}
+		if (xrank != -1) {
+			_STARPU_MPI_DEBUG("Node %d has the most R data\n", xrank);
+			do_execute = 1;
+		}
+	}
+	/* The per-node sizes are no longer needed, whichever path was taken */
+	free(size_on_nodes);
+
+	STARPU_ASSERT_MSG(do_execute != -1, "StarPU needs to see a W or a REDUX data which will tell it where to execute the task");
+
+	if (inconsistent_execute == 1) {
+		if (xrank == -1) {
+			_STARPU_MPI_DEBUG("Several nodes own W data; the application needs to specify the executing node, using STARPU_EXECUTE_ON_NODE or STARPU_EXECUTE_ON_DATA\n");
+			return -EINVAL;
+		}
+		else {
+			do_execute = (me == xrank);
+			dest = xrank;
+		}
+	}
+	else if (xrank != -1) {
+		do_execute = (me == xrank);
+		dest = xrank;
+	}
+
+	/* Send and receive data as requested */
+	va_start(varg_list, codelet);
+	current_data = 0;
+	while ((arg_type = va_arg(varg_list, int)) != 0) {
+		if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type==STARPU_SCRATCH || arg_type==STARPU_REDUX) {
+			starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
+			enum starpu_access_mode mode = (enum starpu_access_mode) arg_type;
+
+			_starpu_mpi_exchange_data_before_execution(data, mode, me, dest, do_execute, comm);
+			current_data ++;
+
+		}
+		else if (arg_type == STARPU_DATA_ARRAY)
+		{
+			starpu_data_handle_t *datas = va_arg(varg_list, starpu_data_handle_t *);
+			int nb_handles = va_arg(varg_list, int);
+			int i;
+
+			for(i=0 ; i<nb_handles ; i++)
+			{
+				_starpu_mpi_exchange_data_before_execution(datas[i], codelet->modes[current_data], me, dest, do_execute, comm);
+				current_data++;
+			}
+		}
+		else if (arg_type==STARPU_VALUE) {
+			va_arg(varg_list, void *);
+			va_arg(varg_list, size_t);
+		}
+		else if (arg_type==STARPU_CALLBACK) {
+			va_arg(varg_list, void (*)(void *));
+		}
+		else if (arg_type==STARPU_CALLBACK_WITH_ARG) {
+			va_arg(varg_list, void (*)(void *));
+			va_arg(varg_list, void *);
+		}
+		else if (arg_type==STARPU_CALLBACK_ARG) {
+			va_arg(varg_list, void *);
+		}
+		else if (arg_type==STARPU_PRIORITY) {
+			va_arg(varg_list, int);
+		}
+		else if (arg_type==STARPU_EXECUTE_ON_NODE) {
+			va_arg(varg_list, int);
+		}
+		else if (arg_type==STARPU_EXECUTE_ON_DATA) {
+			va_arg(varg_list, starpu_data_handle_t);
+		}
+	}
+	va_end(varg_list);
+
+	if (do_execute) {
+		_STARPU_MPI_DEBUG("Execution of the codelet %p (%s)\n", codelet, codelet->name);
+		va_start(varg_list, codelet);
+		struct starpu_task *task = starpu_task_create();
+		int ret = _starpu_insert_task_create_and_submit(arg_buffer, arg_buffer_size, codelet, &task, varg_list);
+		_STARPU_MPI_DEBUG("ret: %d\n", ret);
+		STARPU_ASSERT(ret==0);
+	}
+
+	if (inconsistent_execute) {
+		va_start(varg_list, codelet);
+		current_data = 0;
+		while ((arg_type = va_arg(varg_list, int)) != 0) {
+			if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type==STARPU_SCRATCH || arg_type==STARPU_REDUX) {
+				starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
+				enum starpu_access_mode mode = (enum starpu_access_mode) arg_type;
+
+				_starpu_mpi_exchange_data_after_execution(data, mode, me, xrank, dest, do_execute, comm);
+				current_data++;
+			}
+			else if (arg_type == STARPU_DATA_ARRAY)
+			{
+				starpu_data_handle_t *datas = va_arg(varg_list, starpu_data_handle_t *);
+				int nb_handles = va_arg(varg_list, int);
+				int i;
+
+				for(i=0 ; i<nb_handles ; i++)
+				{
+					_starpu_mpi_exchange_data_after_execution(datas[i], codelet->modes[current_data], me, xrank, dest, do_execute, comm);
+					current_data++;
+				}
+			}
+			else if (arg_type==STARPU_VALUE) {
+				va_arg(varg_list, void *);
+				va_arg(varg_list, size_t);
+			}
+			else if (arg_type==STARPU_CALLBACK) {
+				va_arg(varg_list, void (*)(void *));
+			}
+			else if (arg_type==STARPU_CALLBACK_WITH_ARG) {
+				va_arg(varg_list, void (*)(void *));
+				va_arg(varg_list, void *);
+			}
+			else if (arg_type==STARPU_CALLBACK_ARG) {
+				va_arg(varg_list, void *);
+			}
+			else if (arg_type==STARPU_PRIORITY) {
+				va_arg(varg_list, int);
+			}
+			else if (arg_type==STARPU_EXECUTE_ON_NODE) {
+				va_arg(varg_list, int);
+			}
+			else if (arg_type==STARPU_EXECUTE_ON_DATA) {
+				va_arg(varg_list, starpu_data_handle_t);
+			}
+		}
+		va_end(varg_list);
+	}
+
+	va_start(varg_list, codelet);
+	current_data = 0;
+	while ((arg_type = va_arg(varg_list, int)) != 0) {
+		if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type == STARPU_SCRATCH || arg_type == STARPU_REDUX) {
+			starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
+			enum starpu_access_mode mode = (enum starpu_access_mode) arg_type;
+
+			_starpu_mpi_clear_data_after_execution(data, mode, me, do_execute, comm);
+			current_data++;
+		}
+		else if (arg_type == STARPU_DATA_ARRAY)
+		{
+			starpu_data_handle_t *datas = va_arg(varg_list, starpu_data_handle_t *);
+			int nb_handles = va_arg(varg_list, int);
+			int i;
+
+			for(i=0 ; i<nb_handles ; i++)
+			{
+				_starpu_mpi_clear_data_after_execution(datas[i], codelet->modes[current_data], me, do_execute, comm);
+				current_data++;
+			}
+		}
+		else if (arg_type==STARPU_VALUE) {
+			va_arg(varg_list, void *);
+			va_arg(varg_list, size_t);
+		}
+		else if (arg_type==STARPU_CALLBACK) {
+			va_arg(varg_list, void (*)(void *));
+		}
+		else if (arg_type==STARPU_CALLBACK_WITH_ARG) {
+			va_arg(varg_list, void (*)(void *));
+			va_arg(varg_list, void *);
+		}
+		else if (arg_type==STARPU_CALLBACK_ARG) {
+			va_arg(varg_list, void *);
+		}
+		else if (arg_type==STARPU_PRIORITY) {
+			va_arg(varg_list, int);
+		}
+		else if (arg_type==STARPU_EXECUTE_ON_NODE) {
+			va_arg(varg_list, int);
+		}
+		else if (arg_type==STARPU_EXECUTE_ON_DATA) {
+			va_arg(varg_list, starpu_data_handle_t);
+		}
+	}
+
+	va_end(varg_list);
+	_STARPU_MPI_LOG_OUT();
+	return 0;
+}
+
+void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg)
+{
+	int me, rank, tag;
+
+	rank = starpu_data_get_rank(data_handle);
+	tag = starpu_data_get_tag(data_handle);
+	if(rank == -1) {
+		fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank\n");
+		STARPU_ABORT();
+	}
+	if(tag == -1) {
+		fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_data_set_tag\n");
+		STARPU_ABORT();
+	}
+	MPI_Comm_rank(comm, &me);
+
+	if (node == rank) return;
+
+	if (me == node)
+	{
+		starpu_mpi_irecv_detached(data_handle, rank, tag, comm, callback, arg);
+	}
+	else if (me == rank)
+	{
+		starpu_mpi_isend_detached(data_handle, node, tag, comm, NULL, NULL);
+	}
+}
+
+void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle, int node)
+{
+	int me, rank, tag;
+
+	rank = starpu_data_get_rank(data_handle);
+	tag = starpu_data_get_tag(data_handle);
+	if(rank == -1) {
+		fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank\n");
+		STARPU_ABORT();
+	}
+	if(tag == -1) {
+		fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_data_set_tag\n");
+		STARPU_ABORT();
+	}
+	MPI_Comm_rank(comm, &me);
+
+	if (node == rank) return;
+
+	if (me == node)
+	{
+		MPI_Status status;
+		starpu_mpi_recv(data_handle, rank, tag, comm, &status);
+	}
+	else if (me == rank)
+	{
+		starpu_mpi_send(data_handle, node, tag, comm);
+	}
+}
+
+void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
+{
+	int me, rank, tag, nb_nodes;
+
+	rank = starpu_data_get_rank(data_handle);
+	tag = starpu_data_get_tag(data_handle);
+	if(rank == -1) {
+		fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank\n");
+		STARPU_ABORT();
+	}
+	if(tag == -1) {
+		fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_data_set_tag\n");
+		STARPU_ABORT();
+	}
+
+	MPI_Comm_rank(comm, &me);
+	MPI_Comm_size(comm, &nb_nodes);
+
+	_STARPU_MPI_DEBUG("Doing reduction for data %p on node %d with %d nodes ...\n", data_handle, rank, nb_nodes);
+
+	// need to count how many nodes have the data in redux mode
+	if (me == rank) {
+		int i;
+
+		for(i=0 ; i<nb_nodes ; i++) {
+			if (i != rank) {
+				starpu_data_handle_t new_handle;
+
+				starpu_data_register_same(&new_handle, data_handle);
+
+				_STARPU_MPI_DEBUG("Receiving redux handle from %d in %p ...\n", i, new_handle);
+
+				starpu_mpi_irecv_detached(new_handle, i, tag, comm, NULL, NULL);
+				starpu_insert_task(data_handle->redux_cl,
+						STARPU_RW, data_handle,
+						STARPU_R, new_handle,
+						0);
+			}
+		}
+	}
+	else {
+		_STARPU_MPI_DEBUG("Sending redux handle to %d ...\n", rank);
+		starpu_mpi_isend_detached(data_handle, rank, tag, comm, NULL, NULL);
+	}
+}
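A condensed usage sketch of the entry point implemented above (hypothetical handles `hA` and `hB`, and a codelet taking one STARPU_RW and one STARPU_R buffer, as in the tests below): every node calls it with identical arguments, and each node decides locally whether to execute the task, send its data, or receive the result.

	/* All nodes make the same call; transfers are inferred from the
	 * rank/tag previously attached to each handle. */
	starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
			       STARPU_RW, hA,
			       STARPU_R, hB,
			       0);
	starpu_task_wait_for_all();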

+ 99 - 0
mpi/src/starpu_mpi_private.h

@@ -0,0 +1,99 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_PRIVATE_H__
+#define __STARPU_MPI_PRIVATE_H__
+
+#include <starpu.h>
+#include <common/config.h>
+#include "starpu_mpi.h"
+#include "starpu_mpi_fxt.h"
+#include <common/list.h>
+#include <common/utils.h>
+#include <pthread.h>
+
+//#define STARPU_MPI_VERBOSE	1
+
+#ifdef STARPU_MPI_VERBOSE
+#  define _STARPU_MPI_DEBUG(fmt, args ...) do { if (!getenv("STARPU_SILENT")) { \
+    						int _debug_rank; MPI_Comm_rank(MPI_COMM_WORLD, &_debug_rank);       \
+                                                fprintf(stderr, "%*s[%d][starpu_mpi][%s] " fmt , (_debug_rank+1)*4, "", _debug_rank, __func__ ,##args); \
                                                fflush(stderr); }} while(0)
+#else
+#  define _STARPU_MPI_DEBUG(fmt, args ...)
+#endif
+
+#ifdef STARPU_MPI_VERBOSE0
+#  define _STARPU_MPI_LOG_IN()             do { if (!getenv("STARPU_SILENT")) { \
+                                               int _debug_rank; MPI_Comm_rank(MPI_COMM_WORLD, &_debug_rank);                        \
+                                               fprintf(stderr, "%*s[%d][starpu_mpi][%s] -->\n", (_debug_rank+1)*4, "", _debug_rank, __func__ ); \
+                                               fflush(stderr); }} while(0)
+#  define _STARPU_MPI_LOG_OUT()            do { if (!getenv("STARPU_SILENT")) { \
+                                               int _debug_rank; MPI_Comm_rank(MPI_COMM_WORLD, &_debug_rank);                        \
+                                               fprintf(stderr, "%*s[%d][starpu_mpi][%s] <--\n", (_debug_rank+1)*4, "", _debug_rank, __func__ ); \
+                                               fflush(stderr); }} while(0)
+#else
+#  define _STARPU_MPI_LOG_IN()
+#  define _STARPU_MPI_LOG_OUT()
+#endif
+
+#define SEND_REQ	0
+#define RECV_REQ	1
+#define WAIT_REQ        2
+#define TEST_REQ        3
+#define BARRIER_REQ     4
+
+LIST_TYPE(_starpu_mpi_req,
+	/* description of the data at StarPU level */
+	starpu_data_handle_t data_handle;
+
+	/* description of the data to be sent/received */
+	MPI_Datatype datatype;
+	void *ptr;
+	int needs_unpacking;
+
+	/* who are we talking to ? */
+	int srcdst;
+	int mpi_tag;
+	MPI_Comm comm;
+
+	void (*func)(struct _starpu_mpi_req *);
+
+	MPI_Status *status;
+	MPI_Request request;
+	int *flag;
+
+	int ret;
+	pthread_mutex_t req_mutex;
+	pthread_cond_t req_cond;
+
+	unsigned request_type; /* one of SEND_REQ, RECV_REQ, WAIT_REQ, TEST_REQ, BARRIER_REQ */
+
+	unsigned submitted;
+	unsigned completed;
+
+	/* In the case of a Wait/Test request, we are going to post a request
+	 * to test the completion of another request */
+	struct _starpu_mpi_req *other_request;
+
+	/* in the case of detached requests */
+	unsigned detached;
+	void *callback_arg;
+	void (*callback)(void *);
+);
+
+#endif // __STARPU_MPI_PRIVATE_H__

+ 88 - 0
mpi/src/starpu_mpi_stats.c

@@ -0,0 +1,88 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi_stats.h>
+#include <common/config.h>
+#include <stdio.h>
+//#define STARPU_MPI_VERBOSE	1
+#include <starpu_mpi_private.h>
+
+/* measure the amount of data transfers between each pair of MPI nodes */
+#ifdef STARPU_COMM_STATS
+static size_t *comm_amount;
+static int world_size;
+#endif /* STARPU_COMM_STATS */
+
+void _starpu_mpi_comm_amounts_init(MPI_Comm comm)
+{
+#ifdef STARPU_COMM_STATS
+	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --enable-comm-stats, which slows down a bit\n");
+
+	MPI_Comm_size(comm, &world_size);
+	_STARPU_MPI_DEBUG("allocating for %d nodes\n", world_size);
+
+	comm_amount = (size_t *) calloc(world_size, sizeof(size_t));
+#endif /* STARPU_COMM_STATS */
+}
+
+void _starpu_mpi_comm_amounts_free(void)
+{
+#ifdef STARPU_COMM_STATS
+	free(comm_amount);
+#endif /* STARPU_COMM_STATS */
+}
+
+void _starpu_mpi_comm_amounts_inc(MPI_Comm comm  __attribute__ ((unused)),
+				  unsigned dst  __attribute__ ((unused)),
+				  MPI_Datatype datatype  __attribute__ ((unused)),
+				  int count __attribute__ ((unused)))
+{
+#ifdef STARPU_COMM_STATS
+	int src, size;
+
+	MPI_Comm_rank(comm, &src);
+	MPI_Type_size(datatype, &size);
+
+	_STARPU_MPI_DEBUG("[%d] adding %d to %d\n", src, count*size, dst);
+
+	comm_amount[dst] += count*size;
+#endif /* STARPU_COMM_STATS */
+}
+
+void _starpu_mpi_comm_amounts_display(int node)
+{
+#ifdef STARPU_COMM_STATS
+	int dst;
+	size_t sum = 0;
+
+	for (dst = 0; dst < world_size; dst++)
+	{
+		sum += comm_amount[dst];
+	}
+
+	fprintf(stderr, "\n[%d] Communication transfers stats:\nTOTAL transfers %f B\t%f MB\n", node, (float)sum, (float)sum/1024/1024);
+
+	for (dst = 0; dst < world_size; dst++)
+	{
+		if (comm_amount[dst])
+		{
+			fprintf(stderr, "\t%d -> %d\t%f B\t%f MB\n",
+				node, dst, (float)comm_amount[dst], ((float)comm_amount[dst])/(1024*1024));
+		}
+	}
+#endif /* STARPU_COMM_STATS */
+}
+

+ 24 - 0
mpi/src/starpu_mpi_stats.h

@@ -0,0 +1,24 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_STATS_H__
+#define __STARPU_MPI_STATS_H__
+
+#include <stdlib.h>
+#include <mpi.h>
+
+void _starpu_mpi_comm_amounts_init(MPI_Comm comm);
+void _starpu_mpi_comm_amounts_free(void);
+void _starpu_mpi_comm_amounts_inc(MPI_Comm comm, unsigned dst, MPI_Datatype datatype, int count);
+void _starpu_mpi_comm_amounts_display(int node);
+
+#endif // __STARPU_MPI_STATS_H__

+ 29 - 0
mpi/starpumpi-1.0.pc.in

@@ -0,0 +1,29 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2011  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: starpumpi
+Description: offers MPI support for heterogeneous multicore architectures
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@
+Libs: -L${libdir} -lstarpumpi-@STARPU_EFFECTIVE_VERSION@
+Libs.private: @LDFLAGS@ @LIBS@
+Requires: starpu-1.0
+Requires.private:

+ 1 - 0
mpi/tests/.gitignore

@@ -0,0 +1 @@
+/.deps

+ 153 - 0
mpi/tests/Makefile.am

@@ -0,0 +1,153 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2012  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+CC=$(MPICC)
+CCLD=$(MPICC)
+
+if STARPU_MPI_CHECK
+if STARPU_HAVE_AM111
+LOG_COMPILER	 	=	$(MPIEXEC) -np 2
+else
+TESTS_ENVIRONMENT 	=	$(MPIEXEC) -np 2
+endif
+TESTS			=	$(check_PROGRAMS)
+endif
+
+check_PROGRAMS =
+
+BUILT_SOURCES =
+
+CLEANFILES = *.gcno *.gcda *.linkinfo
+
+examplebindir = $(libdir)/starpu/examples/mpi
+
+examplebin_PROGRAMS =
+
+if STARPU_USE_CUDA
+# TODO define NVCCFLAGS
+NVCC ?= nvcc
+
+NVCCFLAGS += -I$(top_srcdir)/include/ -I$(top_builddir)/include
+
+.cu.cubin:
+	$(MKDIR_P) `dirname $@`
+	$(NVCC) -cubin $< -o $@ --compiler-options -fno-strict-aliasing  $(NVCCFLAGS)
+
+.cu.o:
+	$(NVCC) $< -c -o $@ --compiler-options -fno-strict-aliasing  $(NVCCFLAGS) -I$(top_srcdir)/include/  -I$(top_builddir)/include/
+endif
+
+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS)
+LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include -I$(top_srcdir)/mpi/include -I$(top_srcdir)/src -I$(top_builddir)/src
+AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS)
+
+########################
+# Unit testcases       #
+########################
+
+check_PROGRAMS +=				\
+	pingpong				\
+	mpi_test				\
+	mpi_isend				\
+	mpi_irecv				\
+	mpi_isend_detached			\
+	mpi_irecv_detached			\
+	mpi_detached_tag			\
+	ring					\
+	ring_async				\
+	ring_async_implicit			\
+	block_interface				\
+	block_interface_pinned			\
+	insert_task				\
+	insert_task_cache			\
+	insert_task_block			\
+	insert_task_owner			\
+	insert_task_owner2			\
+	insert_task_owner_data			\
+	multiple_send
+
+noinst_PROGRAMS =				\
+	pingpong				\
+	mpi_test				\
+	mpi_isend				\
+	mpi_irecv				\
+	mpi_isend_detached			\
+	mpi_irecv_detached			\
+	mpi_detached_tag			\
+	ring					\
+	ring_async				\
+	ring_async_implicit			\
+	block_interface				\
+	block_interface_pinned			\
+	insert_task				\
+	insert_task_cache			\
+	insert_task_block			\
+	insert_task_owner			\
+	insert_task_owner2			\
+	insert_task_owner_data			\
+	multiple_send
+
+mpi_isend_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+mpi_irecv_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+mpi_isend_detached_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+mpi_irecv_detached_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+mpi_detached_tag_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+pingpong_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+mpi_test_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+ring_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+ring_async_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+ring_async_implicit_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+block_interface_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+block_interface_pinned_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+insert_task_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+insert_task_cache_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+insert_task_block_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+insert_task_owner_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+insert_task_owner2_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+insert_task_owner_data_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+multiple_send_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+
+ring_SOURCES = ring.c
+ring_async_SOURCES = ring_async.c
+ring_async_implicit_SOURCES = ring_async_implicit.c
+if STARPU_USE_CUDA
+ring_SOURCES += ring_kernel.cu
+ring_async_SOURCES += ring_kernel.cu
+ring_async_implicit_SOURCES += ring_kernel.cu
+endif
+
+showcheck:
+	-cat $(TEST_LOGS) /dev/null

+ 148 - 0
mpi/tests/block_interface.c

@@ -0,0 +1,148 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <stdlib.h>
+#include "helper.h"
+
+#define NITER	2048
+
+#define BIGSIZE	128
+#define SIZE	64
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(NULL, NULL);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size < 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need at least processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	/* We only use 2 nodes for that test */
+	if (rank >= 2)
+	{
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+
+	/* Node 0 will allocate a big block and only register an inner part of
+	 * it as the block data, Node 1 will allocate a block of small size and
+	 * register it directly. Node 0 and 1 will then exchange the content of
+	 * their blocks. */
+
+	float *block;
+	starpu_data_handle_t block_handle;
+
+	if (rank == 0)
+	{
+		block = calloc(BIGSIZE*BIGSIZE*BIGSIZE, sizeof(float));
+		assert(block);
+
+		/* fill the inner block */
+		unsigned i, j, k;
+		for (k = 0; k < SIZE; k++)
+		for (j = 0; j < SIZE; j++)
+		for (i = 0; i < SIZE; i++)
+		{
+			block[i + j*BIGSIZE + k*BIGSIZE*BIGSIZE] = 1.0f;
+		}
+
+		starpu_block_data_register(&block_handle, 0,
+			(uintptr_t)block, BIGSIZE, BIGSIZE*BIGSIZE,
+			SIZE, SIZE, SIZE, sizeof(float));
+	}
+	else /* rank == 1 */
+	{
+		block = calloc(SIZE*SIZE*SIZE, sizeof(float));
+		assert(block);
+
+		starpu_block_data_register(&block_handle, 0,
+			(uintptr_t)block, SIZE, SIZE*SIZE,
+			SIZE, SIZE, SIZE, sizeof(float));
+	}
+
+	if (rank == 0)
+	{
+		ret = starpu_mpi_send(block_handle, 1, 0x42, MPI_COMM_WORLD);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_send");
+
+		MPI_Status status;
+		ret = starpu_mpi_recv(block_handle, 1, 0x1337, MPI_COMM_WORLD, &status);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_recv");
+
+		/* check the content of the block */
+		ret = starpu_data_acquire(block_handle, STARPU_R);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+
+		unsigned i, j, k;
+		for (k = 0; k < SIZE; k++)
+		for (j = 0; j < SIZE; j++)
+		for (i = 0; i < SIZE; i++)
+		{
+			assert(block[i + j*BIGSIZE + k*BIGSIZE*BIGSIZE] == 33.0f);
+		}
+		starpu_data_release(block_handle);
+
+	}
+	else /* rank == 1 */
+	{
+		MPI_Status status;
+		ret = starpu_mpi_recv(block_handle, 0, 0x42, MPI_COMM_WORLD, &status);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_recv");
+
+		/* check the content of the block and modify it */
+		ret = starpu_data_acquire(block_handle, STARPU_RW);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+
+		unsigned i, j, k;
+		for (k = 0; k < SIZE; k++)
+		for (j = 0; j < SIZE; j++)
+		for (i = 0; i < SIZE; i++)
+		{
+			assert(block[i + j*SIZE + k*SIZE*SIZE] == 1.0f);
+			block[i + j*SIZE + k*SIZE*SIZE] = 33.0f;
+		}
+		starpu_data_release(block_handle);
+
+		ret = starpu_mpi_send(block_handle, 0, 0x1337, MPI_COMM_WORLD);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_send");
+	}
+
+	FPRINTF(stdout, "Rank %d is done\n", rank);
+	fflush(stdout);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 151 - 0
mpi/tests/block_interface_pinned.c

@@ -0,0 +1,151 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <stdlib.h>
+#include "helper.h"
+
+#define NITER	2048
+
+#define BIGSIZE	64
+#define SIZE	64
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(NULL, NULL);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size < 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need at least processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	/* We only use 2 nodes for that test */
+	if (rank >= 2)
+	{
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+
+	/* Node 0 will allocate a big block and only register an inner part of
+	 * it as the block data, Node 1 will allocate a block of small size and
+	 * register it directly. Node 0 and 1 will then exchange the content of
+	 * their blocks. */
+
+	float *block;
+	starpu_data_handle_t block_handle;
+
+	if (rank == 0)
+	{
+		starpu_malloc((void **)&block,
+				BIGSIZE*BIGSIZE*BIGSIZE*sizeof(float));
+		memset(block, 0, BIGSIZE*BIGSIZE*BIGSIZE*sizeof(float));
+
+		/* fill the inner block */
+		unsigned i, j, k;
+		for (k = 0; k < SIZE; k++)
+		for (j = 0; j < SIZE; j++)
+		for (i = 0; i < SIZE; i++)
+		{
+			block[i + j*BIGSIZE + k*BIGSIZE*BIGSIZE] = 1.0f;
+		}
+
+		starpu_block_data_register(&block_handle, 0,
+			(uintptr_t)block, BIGSIZE, BIGSIZE*BIGSIZE,
+			SIZE, SIZE, SIZE, sizeof(float));
+	}
+	else /* rank == 1 */
+	{
+		starpu_malloc((void **)&block,
+			SIZE*SIZE*SIZE*sizeof(float));
+		memset(block, 0, SIZE*SIZE*SIZE*sizeof(float));
+
+		starpu_block_data_register(&block_handle, 0,
+			(uintptr_t)block, SIZE, SIZE*SIZE,
+			SIZE, SIZE, SIZE, sizeof(float));
+	}
+
+	if (rank == 0)
+	{
+		MPI_Status status;
+
+		ret = starpu_mpi_send(block_handle, 1, 0x42, MPI_COMM_WORLD);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_send");
+
+		ret = starpu_mpi_recv(block_handle, 1, 0x1337, MPI_COMM_WORLD, &status);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_recv");
+
+		/* check the content of the block */
+		starpu_data_acquire(block_handle, STARPU_R);
+		unsigned i, j, k;
+		for (k = 0; k < SIZE; k++)
+		for (j = 0; j < SIZE; j++)
+		for (i = 0; i < SIZE; i++)
+		{
+			assert(block[i + j*BIGSIZE + k*BIGSIZE*BIGSIZE] == 33.0f);
+		}
+		starpu_data_release(block_handle);
+
+	}
+	else /* rank == 1 */
+	{
+		MPI_Status status;
+
+		ret = starpu_mpi_recv(block_handle, 0, 0x42, MPI_COMM_WORLD, &status);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_recv");
+
+		/* check the content of the block and modify it */
+		ret = starpu_data_acquire(block_handle, STARPU_RW);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+
+		unsigned i, j, k;
+		for (k = 0; k < SIZE; k++)
+		for (j = 0; j < SIZE; j++)
+		for (i = 0; i < SIZE; i++)
+		{
+			assert(block[i + j*SIZE + k*SIZE*SIZE] == 1.0f);
+			block[i + j*SIZE + k*SIZE*SIZE] = 33.0f;
+		}
+		starpu_data_release(block_handle);
+
+		ret = starpu_mpi_send(block_handle, 0, 0x1337, MPI_COMM_WORLD);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_send");
+
+	}
+
+	FPRINTF(stdout, "Rank %d is done\n", rank);
+	fflush(stdout);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 22 - 0
mpi/tests/helper.h

@@ -0,0 +1,22 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <errno.h>
+
+#define STARPU_TEST_SKIPPED 77
+
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+

+ 143 - 0
mpi/tests/insert_task.c

@@ -0,0 +1,143 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	unsigned *x = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	unsigned *y = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
+
+        FPRINTF(stdout, "VALUES: %u %u\n", *x, *y);
+        *x = (*x + *y) / 2;
+}
+
+struct starpu_codelet mycodelet =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {func_cpu, NULL},
+        .nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_R}
+};
+
+#define X     4
+#define Y     5
+
+/* Returns the MPI node that owns the data at index (x, y) */
+int my_distrib(int x, int y, int nb_nodes)
+{
+        return x % nb_nodes;
+}
+
+
+int main(int argc, char **argv)
+{
+        int rank, size, x, y;
+        int value=0, ret;
+        unsigned matrix[X][Y];
+        starpu_data_handle_t data_handles[X][Y];
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize_extended(&rank, &size);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
+
+        for(x = 0; x < X; x++)
+	{
+                for (y = 0; y < Y; y++)
+		{
+                        matrix[x][y] = (rank+1)*10 + value;
+                        value++;
+                }
+        }
+#if 0
+        for(x = 0; x < X; x++) {
+                FPRINTF(stdout, "[%d] ", rank);
+                for (y = 0; y < Y; y++) {
+                        FPRINTF(stdout, "%3d ", matrix[x][y]);
+                }
+                FPRINTF(stdout, "\n");
+        }
+#endif
+
+        for(x = 0; x < X; x++)
+	{
+                for (y = 0; y < Y; y++)
+		{
+                        int mpi_rank = my_distrib(x, y, size);
+                        if (mpi_rank == rank)
+			{
+                                //FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
+                                starpu_variable_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
+                        }
+                        else if (rank == mpi_rank+1 || rank == mpi_rank-1)
+			{
+                                /* I don't own that index, but will need it for my computations */
+                                //FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
+                                starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(unsigned));
+                        }
+                        else
+			{
+                                /* I know it's useless to allocate anything for this */
+                                data_handles[x][y] = NULL;
+                        }
+                        if (data_handles[x][y])
+			{
+                                starpu_data_set_rank(data_handles[x][y], mpi_rank);
+                                starpu_data_set_tag(data_handles[x][y], (y*X)+x);
+			}
+                }
+        }
+
+        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1][1], STARPU_R, data_handles[0][1], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0][1], STARPU_R, data_handles[0][0], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+
+        FPRINTF(stderr, "Waiting ...\n");
+        starpu_task_wait_for_all();
+
+        for(x = 0; x < X; x++)
+	{
+                for (y = 0; y < Y; y++)
+		{
+                        if (data_handles[x][y])
+                                starpu_data_unregister(data_handles[x][y]);
+                }
+        }
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+#if 0
+        for(x = 0; x < X; x++)
+	{
+                FPRINTF(stdout, "[%d] ", rank);
+                for (y = 0; y < Y; y++)
+		{
+                        FPRINTF(stdout, "%3d ", matrix[x][y]);
+                }
+                FPRINTF(stdout, "\n");
+        }
+#endif
+
+	return 0;
+}

+ 165 - 0
mpi/tests/insert_task_block.c

@@ -0,0 +1,165 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	unsigned *matrix = (unsigned *)STARPU_MATRIX_GET_PTR(descr[0]);
+	int nx = (int)STARPU_MATRIX_GET_NX(descr[0]);
+	int ny = (int)STARPU_MATRIX_GET_NY(descr[0]);
+	int ld = (int)STARPU_MATRIX_GET_LD(descr[0]);
+
+        int i, j;
+        unsigned sum=0;
+
+	for (i = 0; i < nx; i++)
+	{
+		for (j = 0; j < ny; j++)
+		{
+                        sum += matrix[i+j*ld];
+                }
+        }
+	for (i = 0; i < nx; i++)
+	{
+		for (j = 0; j < ny; j++)
+		{
+                        matrix[i+j*ld] = sum; /* or sum/(nx*ny) to average */
+                }
+        }
+}
+
+struct starpu_codelet mycodelet =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {func_cpu, NULL},
+        .nbuffers = 1,
+	.modes = {STARPU_RW}
+};
+
+#define SIZE       6
+#define BLOCKS     3
+
+/* Returns the MPI node that owns the data block at index (x, y) */
+int my_distrib(int x, int y, int nb_nodes)
+{
+        return x % nb_nodes;
+}
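+/* For instance, with BLOCKS = 3 and two MPI nodes, block rows are distributed
+   cyclically: row 0 -> node 0, row 1 -> node 1, row 2 -> node 0; the y index
+   is ignored. */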
+
+
+int main(int argc, char **argv)
+{
+        int rank, size, x, y;
+        int ret, value=0;
+        unsigned matrix[SIZE*SIZE];
+        starpu_data_handle_t data_handles[SIZE][SIZE];
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize_extended(&rank, &size);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
+
+        for(x = 0; x < SIZE; x++)
+	{
+                for (y = 0; y < SIZE; y++)
+		{
+                        matrix[x+y*SIZE] = rank*100 + value;
+                        value++;
+                }
+        }
+#if 1
+        for(x = 0; x < SIZE; x++)
+	{
+                FPRINTF(stdout, "[%d] ", rank);
+                for (y = 0; y < SIZE; y++)
+		{
+                        FPRINTF(stdout, "%3u ", matrix[x+y*SIZE]);
+                }
+                FPRINTF(stdout, "\n");
+        }
+#endif
+
+        for(x = 0; x < BLOCKS; x++)
+	{
+                for (y = 0; y < BLOCKS; y++)
+		{
+                        int mpi_rank = my_distrib(x, y, size);
+                        if (mpi_rank == rank)
+			{
+                                //FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
+                                starpu_matrix_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[((SIZE/BLOCKS)*x) + ((SIZE/BLOCKS)*y) * SIZE]),
+                                                            SIZE, SIZE/BLOCKS, SIZE/BLOCKS, sizeof(unsigned));
+                        }
+                        else if (rank == mpi_rank+1 || rank == mpi_rank-1)
+			{
+                                /* I don't own that index, but will need it for my computations */
+                                //FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
+                                starpu_matrix_data_register(&data_handles[x][y], -1, (uintptr_t)&(matrix[((SIZE/BLOCKS)*x) + ((SIZE/BLOCKS)*y) * SIZE]),
+                                                            SIZE, SIZE/BLOCKS, SIZE/BLOCKS, sizeof(unsigned));
+                        }
+                        else
+			{
+                                /* I know it's useless to allocate anything for this */
+                                data_handles[x][y] = NULL;
+                        }
+                        if (data_handles[x][y])
+			{
+                                starpu_data_set_rank(data_handles[x][y], mpi_rank);
+                                starpu_data_set_tag(data_handles[x][y], (y*BLOCKS)+x);
+			}
+                }
+        }
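+        /* Each handle maps a (SIZE/BLOCKS) x (SIZE/BLOCKS) tile of the
+           column-major SIZE x SIZE array: element (i,j) of tile (x,y) lives at
+           matrix[(SIZE/BLOCKS)*x + i + ((SIZE/BLOCKS)*y + j)*SIZE], which is
+           why SIZE is passed as the leading dimension. */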
+
+        for(x = 0; x < BLOCKS; x++)
+	{
+                for (y = 0; y < BLOCKS; y++)
+		{
+                        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
+						     STARPU_RW, data_handles[x][y],
+						     0);
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+
+                }
+        }
+
+        FPRINTF(stderr, "Waiting ...\n");
+        starpu_task_wait_for_all();
+
+        for(x = 0; x < BLOCKS; x++)
+	{
+                for (y = 0; y < BLOCKS; y++)
+		{
+                        if (data_handles[x][y])
+                                starpu_data_unregister(data_handles[x][y]);
+                }
+        }
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+#if 1
+        for(x = 0; x < SIZE; x++)
+	{
+                FPRINTF(stdout, "[%d] ", rank);
+                for (y = 0; y < SIZE; y++)
+		{
+                        FPRINTF(stdout, "%3u ", matrix[x+y*SIZE]);
+                }
+                FPRINTF(stdout, "\n");
+        }
+#endif
+
+	return 0;
+}

+ 152 - 0
mpi/tests/insert_task_cache.c

@@ -0,0 +1,152 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	unsigned *x = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	unsigned *y = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
+
+        FPRINTF(stdout, "VALUES: %u %u\n", *x, *y);
+        *x = (*x + *y) / 2;
+}
+
+struct starpu_codelet mycodelet =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {func_cpu, NULL},
+        .nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_R}
+};
+
+#define X     4
+#define Y     5
+
+/* Returns the MPI node that owns the data at index (x, y) */
+int my_distrib(int x, int y, int nb_nodes)
+{
+        return x % nb_nodes;
+}
+
+
+int main(int argc, char **argv)
+{
+        int rank, size, x, y;
+        int ret,value=0;
+        unsigned matrix[X][Y];
+        starpu_data_handle_t data_handles[X][Y];
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize_extended(&rank, &size);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
+
+        for(x = 0; x < X; x++)
+	{
+                for (y = 0; y < Y; y++)
+		{
+                        matrix[x][y] = (rank+1)*10 + value;
+                        value++;
+                }
+        }
+#if 0
+        for(x = 0; x < X; x++)
+	{
+                FPRINTF(stdout, "[%d] ", rank);
+                for (y = 0; y < Y; y++)
+		{
+                        FPRINTF(stdout, "%3u ", matrix[x][y]);
+                }
+                FPRINTF(stdout, "\n");
+        }
+#endif
+
+        for(x = 0; x < X; x++)
+	{
+                for (y = 0; y < Y; y++)
+		{
+                        int mpi_rank = my_distrib(x, y, size);
+                        if (mpi_rank == rank)
+			{
+                                //FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
+                                starpu_variable_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
+                        }
+                        else if (rank == mpi_rank+1 || rank == mpi_rank-1)
+			{
+                                /* I don't own that index, but will need it for my computations */
+                                //FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
+                                starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(unsigned));
+                        }
+                        else
+			{
+                                /* I know it's useless to allocate anything for this */
+                                data_handles[x][y] = NULL;
+                        }
+                        if (data_handles[x][y])
+			{
+                                starpu_data_set_rank(data_handles[x][y], mpi_rank);
+                                starpu_data_set_tag(data_handles[x][y], (y*X)+x);
+			}
+                }
+        }
+
+	mycodelet.name = "codelet1";
+        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1][1], STARPU_R, data_handles[0][1], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+
+	mycodelet.name = "codelet2";
+        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+
+	mycodelet.name = "codelet3";
+        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0][1], STARPU_R, data_handles[0][0], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+
+	mycodelet.name = "codelet4";
+        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
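+	/* codelet2 and codelet4 are identical on purpose: if the communication
+	   cache works, data_handles[0][1] is presumably sent to the node owning
+	   data_handles[3][1] only once, the second task reusing the copy. */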
+
+        FPRINTF(stderr, "Waiting ...\n");
+        starpu_task_wait_for_all();
+
+        for(x = 0; x < X; x++)
+	{
+                for (y = 0; y < Y; y++)
+		{
+                        if (data_handles[x][y])
+                                starpu_data_unregister(data_handles[x][y]);
+                }
+        }
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+#if 0
+        for(x = 0; x < X; x++)
+	{
+                FPRINTF(stdout, "[%d] ", rank);
+                for (y = 0; y < Y; y++)
+		{
+                        FPRINTF(stdout, "%3u ", matrix[x][y]);
+                }
+                FPRINTF(stdout, "\n");
+        }
+#endif
+
+	return 0;
+}

+ 180 - 0
mpi/tests/insert_task_owner.c

@@ -0,0 +1,180 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+void func_cpu(__attribute__ ((unused)) void *descr[], void *_args)
+{
+	int node;
+	int rank;
+
+        starpu_codelet_unpack_args(_args, &node);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	FPRINTF(stderr, "Expected node: %d - Actual node: %d\n", node, rank);
+
+	assert(node == rank);
+}
+
+struct starpu_codelet mycodelet_r_w =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {func_cpu, NULL},
+        .nbuffers = 2,
+	.modes = {STARPU_R, STARPU_W}
+};
+
+struct starpu_codelet mycodelet_rw_r =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {func_cpu, NULL},
+        .nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_R}
+};
+
+struct starpu_codelet mycodelet_rw_rw =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {func_cpu, NULL},
+        .nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_RW}
+};
+
+struct starpu_codelet mycodelet_w_r =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {func_cpu, NULL},
+        .nbuffers = 2,
+	.modes = {STARPU_W, STARPU_R}
+};
+
+struct starpu_codelet mycodelet_r_r =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {func_cpu, NULL},
+        .nbuffers = 2,
+	.modes = {STARPU_R, STARPU_R}
+};
+
+int main(int argc, char **argv)
+{
+        int ret, rank, size, err, node;
+        int x0=32, x1=23;
+        starpu_data_handle_t data_handlesx0;
+        starpu_data_handle_t data_handlesx1;
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize_extended(&rank, &size);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
+
+        if (size != 2)
+	{
+		if (rank == 0) FPRINTF(stderr, "We need exactly 2 processes.\n");
+                starpu_mpi_shutdown();
+                starpu_shutdown();
+                return STARPU_TEST_SKIPPED;
+        }
+
+        if (rank == 0)
+	{
+                starpu_variable_data_register(&data_handlesx0, 0, (uintptr_t)&x0, sizeof(x0));
+                starpu_data_set_rank(data_handlesx0, rank);
+		starpu_data_set_tag(data_handlesx0, 0);
+                starpu_variable_data_register(&data_handlesx1, -1, (uintptr_t)NULL, sizeof(int));
+                starpu_data_set_rank(data_handlesx1, 1);
+		starpu_data_set_tag(data_handlesx1, 1);
+        }
+        else if (rank == 1)
+	{
+                starpu_variable_data_register(&data_handlesx1, 0, (uintptr_t)&x1, sizeof(x1));
+                starpu_data_set_rank(data_handlesx1, rank);
+		starpu_data_set_tag(data_handlesx1, 1);
+                starpu_variable_data_register(&data_handlesx0, -1, (uintptr_t)NULL, sizeof(int));
+                starpu_data_set_rank(data_handlesx0, 0);
+		starpu_data_set_tag(data_handlesx0, 0);
+        }
+
+	node = starpu_data_get_rank(data_handlesx1);
+        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_w,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_R, data_handlesx0, STARPU_W, data_handlesx1,
+				     0);
+        assert(err == 0);
+
+	node = starpu_data_get_rank(data_handlesx0);
+        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_r,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_RW, data_handlesx0, STARPU_R, data_handlesx1,
+				     0);
+        assert(err == 0);
+
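+        /* Both handles are written to but owned by different nodes, and no
+           execution node is specified, so StarPU cannot choose one: this
+           submission is expected to fail with -EINVAL. */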
+        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_rw,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1,
+				     0);
+        assert(err == -EINVAL);
+
+	node = 1;
+        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_rw,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
+				     0);
+        assert(err == 0);
+
+	node = 0;
+        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_rw,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
+				     0);
+        assert(err == 0);
+
+	node = 0;
+        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_r,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_R, data_handlesx0, STARPU_R, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
+				     0);
+        assert(err == 0);
+
+        /* The node given via STARPU_EXECUTE_ON_NODE overrides the execution
+           node, even though the access modes of the data already determine
+           which node would execute the codelet */
+	node = 0;
+        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_w,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_R, data_handlesx0, STARPU_W, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
+				     0);
+        assert(err == 0);
+
+        /* The node given via STARPU_EXECUTE_ON_NODE overrides the execution
+           node, even though the access modes of the data already determine
+           which node would execute the codelet */
+	node = 0;
+        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_w_r,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_W, data_handlesx0, STARPU_R, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
+				     0);
+        assert(err == 0);
+
+	fprintf(stderr, "Waiting ...\n");
+        starpu_task_wait_for_all();
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	return 0;
+}
+

+ 120 - 0
mpi/tests/insert_task_owner2.c

@@ -0,0 +1,120 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	int *x0 = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	int *x1 = (int *)STARPU_VARIABLE_GET_PTR(descr[1]);
+	int *x2 = (int *)STARPU_VARIABLE_GET_PTR(descr[2]);
+	int *y = (int *)STARPU_VARIABLE_GET_PTR(descr[3]);
+
+        FPRINTF(stderr, "-------> CODELET VALUES: %d %d %d %d\n", *x0, *x1, *x2, *y);
+        *y = (*x0 + *x1) * 100;
+        *x1 = 12;
+        *x2 = 24;
+        *x0 = 36;
+        FPRINTF(stderr, "-------> CODELET VALUES: %d %d %d %d\n", *x0, *x1, *x2, *y);
+}
+
+struct starpu_codelet mycodelet =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {func_cpu, NULL},
+        .nbuffers = 4,
+	.modes = {STARPU_R, STARPU_RW, STARPU_W, STARPU_W}
+};
+
+int main(int argc, char **argv)
+{
+        int rank, size, err;
+        int x[3], y=0;
+        int i, ret;
+        starpu_data_handle_t data_handles[4];
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize_extended(&rank, &size);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
+
+        if (rank == 0)
+	{
+                for(i=0 ; i<3 ; i++)
+		{
+                        x[i] = 10*(i+1);
+                        starpu_variable_data_register(&data_handles[i], 0, (uintptr_t)&x[i], sizeof(x[i]));
+                }
+                y = -1;
+                starpu_variable_data_register(&data_handles[3], -1, (uintptr_t)NULL, sizeof(int));
+        }
+        else if (rank == 1)
+	{
+                for(i=0 ; i<3 ; i++)
+		{
+                        x[i] = -1;
+                        starpu_variable_data_register(&data_handles[i], -1, (uintptr_t)NULL, sizeof(int));
+                }
+                y=200;
+                starpu_variable_data_register(&data_handles[3], 0, (uintptr_t)&y, sizeof(int));
+        }
+        else
+	{
+                for(i=0 ; i<4 ; i++)
+                        starpu_variable_data_register(&data_handles[i], -1, (uintptr_t)NULL, sizeof(int));
+	}
+        FPRINTF(stderr, "[%d][init] VALUES: %d %d %d %d\n", rank, x[0], x[1], x[2], y);
+
+	for(i=0 ; i<3 ; i++)
+	{
+		starpu_data_set_rank(data_handles[i], 0);
+		starpu_data_set_tag(data_handles[i], i);
+	}
+	starpu_data_set_rank(data_handles[3], 1);
+	starpu_data_set_tag(data_handles[3], 3);
+
+        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
+                                     STARPU_R, data_handles[0], STARPU_RW, data_handles[1],
+                                     STARPU_W, data_handles[2],
+                                     STARPU_W, data_handles[3],
+                                     STARPU_EXECUTE_ON_NODE, 1, 0);
+	STARPU_CHECK_RETURN_VALUE(err, "starpu_mpi_insert_task");
+        starpu_task_wait_for_all();
+
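+        /* Bring every value back to rank 0 and read it through the local
+           pointer; acquire/release brackets the direct access. */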
+        int *values = malloc(4 * sizeof(int));
+        for(i=0 ; i<4 ; i++)
+	{
+                starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, data_handles[i], 0, NULL, NULL);
+		if (rank == 0)
+		{
+			starpu_data_acquire(data_handles[i], STARPU_R);
+			values[i] = *((int *)starpu_handle_get_local_ptr(data_handles[i]));
+			starpu_data_release(data_handles[i]);
+		}
+        }
+        FPRINTF(stderr, "[%d][local ptr] VALUES: %d %d %d %d\n", rank, values[0], values[1], values[2], values[3]);
+        FPRINTF(stderr, "[%d][end] VALUES: %d %d %d %d\n", rank, x[0], x[1], x[2], y);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	return 0;
+}
+

+ 99 - 0
mpi/tests/insert_task_owner_data.c

@@ -0,0 +1,99 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	int *x0 = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	int *x1 = (int *)STARPU_VARIABLE_GET_PTR(descr[1]);
+
+	*x0 += 1;
+	*x1 *= *x1;
+}
+
+struct starpu_codelet mycodelet =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {func_cpu, NULL},
+        .nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_RW}
+};
+
+int main(int argc, char **argv)
+{
+        int rank, size, err;
+        int x[2];
+        int ret, i;
+        starpu_data_handle_t data_handles[2];
+	int values[2];
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize_extended(&rank, &size);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
+
+        if (rank == 0)
+	{
+		x[0] = 11;
+		starpu_variable_data_register(&data_handles[0], 0, (uintptr_t)&x[0], sizeof(x[0]));
+		starpu_variable_data_register(&data_handles[1], -1, (uintptr_t)NULL, sizeof(x[1]));
+        }
+        else if (rank == 1)
+	{
+		x[1] = 12;
+		starpu_variable_data_register(&data_handles[0], -1, (uintptr_t)NULL, sizeof(x[0]));
+		starpu_variable_data_register(&data_handles[1], 0, (uintptr_t)&x[1], sizeof(x[1]));
+        }
+	else
+	{
+		starpu_variable_data_register(&data_handles[0], -1, (uintptr_t)NULL, sizeof(x[0]));
+		starpu_variable_data_register(&data_handles[1], -1, (uintptr_t)NULL, sizeof(x[1]));
+        }
+
+	starpu_data_set_rank(data_handles[0], 0);
+	starpu_data_set_tag(data_handles[0], 0);
+	starpu_data_set_rank(data_handles[1], 1);
+	starpu_data_set_tag(data_handles[1], 1);
+
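+        /* STARPU_EXECUTE_ON_DATA selects the owner of data_handles[1], i.e.
+           rank 1, as the execution node; since both handles are accessed in
+           RW mode, rank 0's value is sent there and its update sent back. */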
+        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
+                                     STARPU_RW, data_handles[0], STARPU_RW, data_handles[1],
+                                     STARPU_EXECUTE_ON_DATA, data_handles[1],
+				     0);
+        assert(err == 0);
+        starpu_task_wait_for_all();
+
+        for(i=0 ; i<2 ; i++)
+	{
+                starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, data_handles[i], 0, NULL, NULL);
+		if (rank == 0)
+		{
+			starpu_data_acquire(data_handles[i], STARPU_R);
+			values[i] = *((int *)starpu_handle_get_local_ptr(data_handles[i]));
+			starpu_data_release(data_handles[i]);
+		}
+        }
+        FPRINTF(stderr, "[%d][local ptr] VALUES: %d %d\n", rank, values[0], values[1]);
+	ret = 0;
+	if (rank == 0 && (values[0] != 12 || values[1] != 144))
+		ret = EXIT_FAILURE;
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	return ret;
+}
+

+ 80 - 0
mpi/tests/mpi_detached_tag.c

@@ -0,0 +1,80 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#define NITER	2048
+#define SIZE	16
+
+float *tab;
+starpu_data_handle_t tab_handle;
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(NULL, NULL);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size != 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need exactly 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+
+	tab = malloc(SIZE*sizeof(float));
+
+	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
+
+	unsigned nloops = NITER;
+	unsigned loop;
+
+	int other_rank = (rank + 1)%2;
+
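+	/* Each detached transfer is bound to a StarPU tag which gets unlocked
+	   when the communication completes, so starpu_tag_wait() below blocks
+	   until the send or receive has finished. */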
+	for (loop = 0; loop < nloops; loop++)
+	{
+		starpu_tag_t tag = (starpu_tag_t)loop;
+
+		if ((loop % 2) == rank)
+		{
+			starpu_mpi_isend_detached_unlock_tag(tab_handle, other_rank, loop, MPI_COMM_WORLD, tag);
+		}
+		else
+		{
+			starpu_mpi_irecv_detached_unlock_tag(tab_handle, other_rank, loop, MPI_COMM_WORLD, tag);
+		}
+
+		starpu_tag_wait(tag);
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 79 - 0
mpi/tests/mpi_irecv.c

@@ -0,0 +1,79 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#define NITER	2048
+#define SIZE	16
+
+float *tab;
+starpu_data_handle_t tab_handle;
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(NULL, NULL);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size != 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need exactly 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+
+	tab = malloc(SIZE*sizeof(float));
+
+	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
+
+	unsigned nloops = NITER;
+	unsigned loop;
+
+	int other_rank = (rank + 1)%2;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		if ((loop % 2) == rank)
+		{
+			starpu_mpi_send(tab_handle, other_rank, loop, MPI_COMM_WORLD);
+		}
+		else
+		{
+			MPI_Status status;
+			starpu_mpi_req req;
+			starpu_mpi_irecv(tab_handle, &req, other_rank, loop, MPI_COMM_WORLD);
+			starpu_mpi_wait(&req, &status);
+		}
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 97 - 0
mpi/tests/mpi_irecv_detached.c

@@ -0,0 +1,97 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <common/utils.h>
+#include <pthread.h>
+#include "helper.h"
+
+#define NITER	2048
+#define SIZE	16
+
+float *tab;
+starpu_data_handle_t tab_handle;
+
+static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
+
+void callback(void *arg)
+{
+	unsigned *received = arg;
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	*received = 1;
+	_STARPU_PTHREAD_COND_SIGNAL(&cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+}
+
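+/* The callback runs when the detached receive completes; it merely flips a
+   flag under the mutex so that the main thread can block on the condition
+   variable until the transfer is done. */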
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(NULL, NULL);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size != 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need exactly 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+
+	tab = malloc(SIZE*sizeof(float));
+
+	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
+
+	unsigned nloops = NITER;
+	unsigned loop;
+
+	int other_rank = (rank + 1)%2;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		if (rank == 0)
+		{
+			starpu_mpi_send(tab_handle, other_rank, loop, MPI_COMM_WORLD);
+		}
+		else
+		{
+			int received = 0;
+			starpu_mpi_irecv_detached(tab_handle, other_rank, loop, MPI_COMM_WORLD, callback, &received);
+
+			_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+			while (!received)
+				_STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
+			_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+		}
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 80 - 0
mpi/tests/mpi_isend.c

@@ -0,0 +1,80 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#define NITER	2048
+#define SIZE	16
+
+float *tab;
+starpu_data_handle_t tab_handle;
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(NULL, NULL);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size != 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need exactly 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+
+	tab = malloc(SIZE*sizeof(float));
+
+	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
+
+	unsigned nloops = NITER;
+	unsigned loop;
+
+	int other_rank = (rank + 1)%2;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		if ((loop % 2) == rank)
+		{
+			MPI_Status status;
+			starpu_mpi_req req;
+			starpu_mpi_isend(tab_handle, &req, other_rank, loop, MPI_COMM_WORLD);
+			starpu_mpi_wait(&req, &status);
+		}
+		else
+		{
+			MPI_Status status;
+			starpu_mpi_recv(tab_handle, other_rank, loop, MPI_COMM_WORLD, &status);
+		}
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 98 - 0
mpi/tests/mpi_isend_detached.c

@@ -0,0 +1,98 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <common/utils.h>
+#include <pthread.h>
+#include "helper.h"
+
+#define NITER	2048
+#define SIZE	16
+
+static float *tab;
+static starpu_data_handle_t tab_handle;
+
+static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
+
+void callback(void *arg)
+{
+	unsigned *sent = arg;
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	*sent = 1;
+	_STARPU_PTHREAD_COND_SIGNAL(&cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+}
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(NULL, NULL);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size != 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need exactly 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+
+	tab = malloc(SIZE*sizeof(float));
+
+	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
+
+	unsigned nloops = NITER;
+	unsigned loop;
+
+	int other_rank = (rank + 1)%2;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		if (rank == 0)
+		{
+			int sent = 0;
+			starpu_mpi_isend_detached(tab_handle, other_rank, loop, MPI_COMM_WORLD, callback, &sent);
+
+			_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+			while (!sent)
+				_STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
+			_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+		}
+		else
+		{
+			MPI_Status status;
+			starpu_mpi_recv(tab_handle, other_rank, loop, MPI_COMM_WORLD, &status);
+		}
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 86 - 0
mpi/tests/mpi_test.c

@@ -0,0 +1,86 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#define NITER	2048
+#define SIZE	16
+
+float *tab;
+starpu_data_handle_t tab_handle;
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(NULL, NULL);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size != 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need exactly 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+
+	tab = malloc(SIZE*sizeof(float));
+
+	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
+
+	unsigned nloops = NITER;
+	unsigned loop;
+
+	int other_rank = (rank + 1)%2;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		starpu_mpi_req req;
+
+		if ((loop % 2) == rank)
+		{
+			starpu_mpi_isend(tab_handle, &req, other_rank, loop, MPI_COMM_WORLD);
+		}
+		else
+		{
+			starpu_mpi_irecv(tab_handle, &req, other_rank, loop, MPI_COMM_WORLD);
+		}
+
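+		/* Poll until the request completes: starpu_mpi_test() is the
+		   non-blocking counterpart of starpu_mpi_wait(). */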
+		int finished = 0;
+		do
+		{
+			MPI_Status status;
+			starpu_mpi_test(&req, &finished, &status);
+		}
+		while (!finished);
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 92 - 0
mpi/tests/multiple_send.c

@@ -0,0 +1,92 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#define NITER	2048
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+        unsigned send[2] = {42, 11};
+        unsigned recv[2] = {33, 33};
+        starpu_mpi_req req[2];
+        starpu_data_handle_t send_handle[2];
+        starpu_data_handle_t recv_handle[2];
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize_extended(&rank, &size);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
+
+	if (size < 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need at least 2 processes.\n");
+
+                starpu_mpi_shutdown();
+                starpu_shutdown();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	starpu_variable_data_register(&send_handle[0], 0, (uintptr_t)&send[0], sizeof(unsigned));
+	starpu_variable_data_register(&send_handle[1], 0, (uintptr_t)&send[1], sizeof(unsigned));
+	starpu_variable_data_register(&recv_handle[0], 0, (uintptr_t)&recv[0], sizeof(unsigned));
+	starpu_variable_data_register(&recv_handle[1], 0, (uintptr_t)&recv[1], sizeof(unsigned));
+
+        if (rank == 0)
+	{
+                starpu_mpi_isend(send_handle[0], &(req[0]), 1, 12, MPI_COMM_WORLD);
+                starpu_mpi_isend(send_handle[1], &(req[1]), 1, 13, MPI_COMM_WORLD);
+        }
+        else if (rank == 1)
+	{
+                starpu_mpi_irecv(recv_handle[0], &(req[0]), 0, 12, MPI_COMM_WORLD);
+                starpu_mpi_irecv(recv_handle[1], &(req[1]), 0, 13, MPI_COMM_WORLD);
+        }
+
+        if (rank == 0 || rank == 1)
+	{
+                int nb_req=2;
+                while (nb_req)
+		{
+                        int r=0;
+                        for(r=0 ; r<2 ; r++)
+			{
+                                if (req[r])
+				{
+                                        int finished = 0;
+                                        MPI_Status status;
+                                        starpu_mpi_test(&req[r], &finished, &status);
+                                        STARPU_ASSERT(finished != -1);
+                                        if (finished)
+					{
+                                                FPRINTF(stderr, "[%d] Request %d finished\n", rank, r);
+                                                req[r] = NULL;
+                                                nb_req--;
+                                        }
+                                }
+                        }
+                }
+        }
+        FPRINTF(stderr, "[%d] All requests finished\n", rank);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	return 0;
+}

+ 76 - 0
mpi/tests/pingpong.c

@@ -0,0 +1,76 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#define NITER	2048
+#define SIZE	16
+
+float *tab;
+starpu_data_handle_t tab_handle;
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(NULL, NULL);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size != 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need exactly 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+
+	tab = malloc(SIZE*sizeof(float));
+
+	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
+
+	unsigned nloops = NITER;
+	unsigned loop;
+	int other_rank = (rank + 1)%2;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		if ((loop % 2) == rank)
+		{
+			starpu_mpi_send(tab_handle, other_rank, loop, MPI_COMM_WORLD);
+		}
+		else
+		{
+			MPI_Status status;
+			starpu_mpi_recv(tab_handle, other_rank, loop, MPI_COMM_WORLD, &status);
+		}
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 129 - 0
mpi/tests/ring.c

@@ -0,0 +1,129 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#define NITER	2048
+
+unsigned token = 42;
+starpu_data_handle_t token_handle;
+
+#ifdef STARPU_USE_CUDA
+extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args);
+#endif
+
+void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	unsigned *tokenptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
+	(*tokenptr)++;
+}
+
+static struct starpu_codelet increment_cl =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {increment_cuda, NULL},
+#endif
+	.cpu_funcs = {increment_cpu, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_RW}
+};
+
+void increment_token(void)
+{
+	struct starpu_task *task = starpu_task_create();
+
+	task->cl = &increment_cl;
+	task->handles[0] = token_handle;
+	task->synchronous = 1;
+
+	int ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+}
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(NULL, NULL);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size < 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need at least 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+
+	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
+
+	unsigned nloops = NITER;
+	unsigned loop;
+
+	unsigned last_loop = nloops - 1;
+	unsigned last_rank = size - 1;
+
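+	/* The token makes nloops trips around the ring and every rank increments
+	   it once per trip, so the last rank should finally observe
+	   token == nloops*size. */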
+	for (loop = 0; loop < nloops; loop++)
+	{
+		int tag = loop*size + rank;
+
+		if (loop == 0 && rank == 0)
+		{
+			token = 0;
+			FPRINTF(stdout, "Start with token value %u\n", token);
+		}
+		else
+		{
+			MPI_Status status;
+			starpu_mpi_recv(token_handle, (rank+size-1)%size, tag, MPI_COMM_WORLD, &status);
+		}
+
+		increment_token();
+
+		if (loop == last_loop && rank == last_rank)
+		{
+			starpu_data_acquire(token_handle, STARPU_R);
+			FPRINTF(stdout, "Finished : token value %u\n", token);
+			starpu_data_release(token_handle);
+		}
+		else
+		{
+			starpu_mpi_send(token_handle, (rank+1)%size, tag+1, MPI_COMM_WORLD);
+		}
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	if (rank == last_rank)
+	{
+		STARPU_ASSERT(token == nloops*size);
+	}
+
+	return 0;
+}

+ 133 - 0
mpi/tests/ring_async.c

@@ -0,0 +1,133 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#define NITER	2048
+
+unsigned token = 42;
+starpu_data_handle_t token_handle;
+
+#ifdef STARPU_USE_CUDA
+extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args);
+#endif
+
+void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	unsigned *tokenptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
+	(*tokenptr)++;
+}
+
+static struct starpu_codelet increment_cl =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {increment_cuda, NULL},
+#endif
+	.cpu_funcs = {increment_cpu, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_RW}
+};
+
+void increment_token(void)
+{
+	struct starpu_task *task = starpu_task_create();
+
+	task->cl = &increment_cl;
+	task->handles[0] = token_handle;
+	task->synchronous = 1;
+
+	int ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+}
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(NULL, NULL);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size < 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need at least 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+
+	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
+
+	unsigned nloops = NITER;
+	unsigned loop;
+
+	unsigned last_loop = nloops - 1;
+	unsigned last_rank = size - 1;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		int tag = loop*size + rank;
+
+		if (loop == 0 && rank == 0)
+		{
+			token = 0;
+			FPRINTF(stdout, "Start with token value %u\n", token);
+		}
+		else
+		{
+			MPI_Status status;
+			starpu_mpi_req req;
+			starpu_mpi_irecv(token_handle, &req, (rank+size-1)%size, tag, MPI_COMM_WORLD);
+			starpu_mpi_wait(&req, &status);
+		}
+
+		increment_token();
+
+		if (loop == last_loop && rank == last_rank)
+		{
+			starpu_data_acquire(token_handle, STARPU_R);
+			FPRINTF(stdout, "Finished : token value %u\n", token);
+			starpu_data_release(token_handle);
+		}
+		else
+		{
+			starpu_mpi_req req;
+			MPI_Status status;
+			starpu_mpi_isend(token_handle, &req, (rank+1)%size, tag+1, MPI_COMM_WORLD);
+			starpu_mpi_wait(&req, &status);
+		}
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	if (rank == last_rank)
+	{
+		STARPU_ASSERT(token == nloops*size);
+	}
+
+	return 0;
+}

+ 133 - 0
mpi/tests/ring_async_implicit.c

@@ -0,0 +1,133 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#define NITER	2048
+
+unsigned token = 42;
+starpu_data_handle_t token_handle;
+
+#ifdef STARPU_USE_CUDA
+extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args);
+#endif
+
+void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	unsigned *tokenptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
+	(*tokenptr)++;
+}
+
+static struct starpu_codelet increment_cl =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {increment_cuda, NULL},
+#endif
+	.cpu_funcs = {increment_cpu, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_RW}
+};
+
+void increment_token(void)
+{
+	struct starpu_task *task = starpu_task_create();
+
+	task->cl = &increment_cl;
+	task->handles[0] = token_handle;
+
+	int ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+}
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	/* MPI_Init is not called here: starpu_mpi_initialize_extended() below
+	   initializes MPI and provides the rank and size. */
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize_extended(&rank, &size);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
+
+	if (size < 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need at least 2 processes.\n");
+
+		starpu_mpi_shutdown();
+		starpu_shutdown();
+		return STARPU_TEST_SKIPPED;
+	}
+
+
+	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
+
+	unsigned nloops = NITER;
+	unsigned loop;
+
+	unsigned last_loop = nloops - 1;
+	unsigned last_rank = size - 1;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		int tag = loop*size + rank;
+
+		if (loop == 0 && rank == 0)
+		{
+			token = 0;
+			FPRINTF(stdout, "Start with token value %u\n", token);
+		}
+		else
+		{
+			starpu_mpi_irecv_detached(token_handle, (rank+size-1)%size, tag, MPI_COMM_WORLD, NULL, NULL);
+		}
+
+		increment_token();
+
+		if (loop == last_loop && rank == last_rank)
+		{
+			starpu_data_acquire(token_handle, STARPU_R);
+			FPRINTF(stdout, "Finished : token value %u\n", token);
+			starpu_data_release(token_handle);
+		}
+		else
+		{
+			starpu_mpi_isend_detached(token_handle, (rank+1)%size, tag+1, MPI_COMM_WORLD, NULL, NULL);
+		}
+	}
+
+	starpu_task_wait_for_all();
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+
+	if (rank == last_rank)
+	{
+                FPRINTF(stderr, "[%d] token = %u == %u * %d ?\n", rank, token, nloops, size);
+                STARPU_ASSERT(token == nloops*size);
+	}
+
+	return 0;
+}

+ 32 - 0
mpi/tests/ring_kernel.cu

@@ -0,0 +1,32 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
+static __global__ void cuda_incrementer(unsigned *token)
+{
+	(*token)++;
+}
+
+extern "C" void increment_cuda(void *descr[], void *_args)
+{
+	(void) _args;
+	unsigned *tokenptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
+
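+	/* Launch on the worker's dedicated stream and wait for completion: the
+	   codelet must not return before the increment is visible in device
+	   memory. */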
+	cuda_incrementer<<<1,1, 0, starpu_cuda_get_local_stream()>>>(tokenptr);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}