Andra Hugo committed 12 years ago
commit 1967c51c43
71 changed files with 9388 additions and 0 deletions
  1. mpi/.gitignore  +1 -0
  2. mpi/Makefile.am  +29 -0
  3. mpi/examples/Makefile.am  +206 -0
  4. mpi/examples/cholesky/mpi_cholesky.c  +251 -0
  5. mpi/examples/cholesky/mpi_cholesky.h  +106 -0
  6. mpi/examples/cholesky/mpi_cholesky_codelets.c  +174 -0
  7. mpi/examples/cholesky/mpi_cholesky_codelets.h  +27 -0
  8. mpi/examples/cholesky/mpi_cholesky_distributed.c  +117 -0
  9. mpi/examples/cholesky/mpi_cholesky_kernels.c  +236 -0
  10. mpi/examples/cholesky/mpi_cholesky_models.c  +40 -0
  11. mpi/examples/cholesky/mpi_cholesky_models.h  +27 -0
  12. mpi/examples/complex/mpi_complex.c  +75 -0
  13. mpi/examples/mpi_lu/mpi_lu-double.h  +42 -0
  14. mpi/examples/mpi_lu/mpi_lu-float.h  +42 -0
  15. mpi/examples/mpi_lu/pdlu.c  +19 -0
  16. mpi/examples/mpi_lu/pdlu_kernels.c  +19 -0
  17. mpi/examples/mpi_lu/plu_example.c  +577 -0
  18. mpi/examples/mpi_lu/plu_example_double.c  +19 -0
  19. mpi/examples/mpi_lu/plu_example_float.c  +19 -0
  20. mpi/examples/mpi_lu/plu_solve.c  +394 -0
  21. mpi/examples/mpi_lu/plu_solve_double.c  +19 -0
  22. mpi/examples/mpi_lu/plu_solve_float.c  +19 -0
  23. mpi/examples/mpi_lu/pslu.c  +19 -0
  24. mpi/examples/mpi_lu/pslu_kernels.c  +19 -0
  25. mpi/examples/mpi_lu/pxlu.c  +870 -0
  26. mpi/examples/mpi_lu/pxlu.h  +65 -0
  27. mpi/examples/mpi_lu/pxlu_kernels.c  +444 -0
  28. mpi/examples/mpi_lu/pxlu_kernels.h  +32 -0
  29. mpi/examples/mpi_lu/slu_kernels.c  +19 -0
  30. mpi/examples/perf.sh  +106 -0
  31. mpi/examples/reduction/mpi_reduction.c  +156 -0
  32. mpi/examples/reduction/mpi_reduction_kernels.c  +66 -0
  33. mpi/examples/scatter_gather/mpi_scatter_gather.c  +228 -0
  34. mpi/examples/stencil/stencil5.c  +159 -0
  35. mpi/include/starpu_mpi.h  +70 -0
  36. mpi/libstarpumpi.pc.in  +29 -0
  37. mpi/src/Makefile.am  +51 -0
  38. mpi/src/starpu_mpi.c  +867 -0
  39. mpi/src/starpu_mpi_collective.c  +78 -0
  40. mpi/src/starpu_mpi_datatype.c  +149 -0
  41. mpi/src/starpu_mpi_datatype.h  +33 -0
  42. mpi/src/starpu_mpi_fxt.h  +45 -0
  43. mpi/src/starpu_mpi_helper.c  +104 -0
  44. mpi/src/starpu_mpi_insert_task.c  +632 -0
  45. mpi/src/starpu_mpi_private.h  +99 -0
  46. mpi/src/starpu_mpi_stats.c  +88 -0
  47. mpi/src/starpu_mpi_stats.h  +24 -0
  48. mpi/starpumpi-1.0.pc.in  +29 -0
  49. mpi/tests/.gitignore  +1 -0
  50. mpi/tests/Makefile.am  +153 -0
  51. mpi/tests/block_interface.c  +148 -0
  52. mpi/tests/block_interface_pinned.c  +151 -0
  53. mpi/tests/helper.h  +22 -0
  54. mpi/tests/insert_task.c  +143 -0
  55. mpi/tests/insert_task_block.c  +165 -0
  56. mpi/tests/insert_task_cache.c  +152 -0
  57. mpi/tests/insert_task_owner.c  +180 -0
  58. mpi/tests/insert_task_owner2.c  +120 -0
  59. mpi/tests/insert_task_owner_data.c  +99 -0
  60. mpi/tests/mpi_detached_tag.c  +80 -0
  61. mpi/tests/mpi_irecv.c  +79 -0
  62. mpi/tests/mpi_irecv_detached.c  +97 -0
  63. mpi/tests/mpi_isend.c  +80 -0
  64. mpi/tests/mpi_isend_detached.c  +98 -0
  65. mpi/tests/mpi_test.c  +86 -0
  66. mpi/tests/multiple_send.c  +92 -0
  67. mpi/tests/pingpong.c  +76 -0
  68. mpi/tests/ring.c  +129 -0
  69. mpi/tests/ring_async.c  +133 -0
  70. mpi/tests/ring_async_implicit.c  +133 -0
  71. mpi/tests/ring_kernel.cu  +32 -0

+ 1 - 0
mpi/.gitignore

@@ -0,0 +1 @@
+/.deps

+ 29 - 0
mpi/Makefile.am

@@ -0,0 +1,29 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2012  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+SUBDIRS=src tests examples
+
+pkgconfigdir = $(libdir)/pkgconfig
+pkgconfig_DATA = libstarpumpi.pc starpumpi-1.0.pc
+
+versincludedir = $(includedir)/starpu/$(STARPU_EFFECTIVE_VERSION)
+versinclude_HEADERS = 					\
+	include/starpu_mpi.h
+
+showcheck:
+	for i in $(SUBDIRS) ; do \
+		make -C $$i showcheck ; \
+	done

+ 206 - 0
mpi/examples/Makefile.am

@@ -0,0 +1,206 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2012  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+CC=$(MPICC)
+CCLD=$(MPICC)
+
+if STARPU_MPI_CHECK
+if STARPU_HAVE_AM111
+LOG_COMPILER	 	=	$(MPIEXEC) -np 2
+else
+TESTS_ENVIRONMENT 	=	$(MPIEXEC) -np 2
+endif
+TESTS			=	$(check_PROGRAMS)
+endif
+
+check_PROGRAMS =
+
+BUILT_SOURCES =
+
+CLEANFILES = *.gcno *.gcda *.linkinfo
+
+EXTRA_DIST = 					\
+	mpi_lu/mpi_lu-float.h		\
+	mpi_lu/mpi_lu-double.h		\
+	mpi_lu/plu_example.c		\
+	mpi_lu/plu_solve.c		\
+	mpi_lu/pxlu.h			\
+	mpi_lu/pxlu.c			\
+	mpi_lu/pxlu_kernels.h		\
+	mpi_lu/pxlu_kernels.c		\
+	cholesky/mpi_cholesky.h	\
+	cholesky/mpi_cholesky_models.h \
+	cholesky/mpi_cholesky_codelets.h \
+	../tests/helper.h
+
+examplebindir = $(libdir)/starpu/mpi
+
+examplebin_PROGRAMS =
+
+if STARPU_USE_CUDA
+# TODO define NVCCFLAGS
+NVCC ?= nvcc
+
+NVCCFLAGS += -I$(top_srcdir)/include/ -I$(top_builddir)/include
+
+.cu.cubin:
+	$(MKDIR_P) `dirname $@`
+	$(NVCC) -cubin $< -o $@ --compiler-options -fno-strict-aliasing  $(NVCCFLAGS)
+
+.cu.o:
+	$(NVCC) $< -c -o $@ --compiler-options -fno-strict-aliasing  $(NVCCFLAGS) -I$(top_srcdir)/include/  -I$(top_builddir)/include/
+endif
+
+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS)
+LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include -I$(top_srcdir)/mpi/include
+AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS)
+
+###################
+# Stencil example #
+###################
+if BUILD_EXAMPLES
+examplebin_PROGRAMS +=				\
+	stencil/stencil5
+
+stencil_stencil5_LDADD =		\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+
+check_PROGRAMS	+=	\
+	stencil/stencil5
+
+##################
+# MPI LU example #
+##################
+
+if !NO_BLAS_LIB
+
+examplebin_PROGRAMS += 			\
+	mpi_lu/plu_example_float	\
+	mpi_lu/plu_example_double
+
+mpi_lu_plu_example_float_LDADD =	\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
+	$(STARPU_LIBNUMA_LDFLAGS)				\
+	$(STARPU_BLAS_LDFLAGS)
+
+mpi_lu_plu_example_float_SOURCES =	\
+	mpi_lu/plu_example_float.c	\
+	mpi_lu/plu_solve_float.c	\
+	mpi_lu/pslu_kernels.c		\
+	mpi_lu/pslu.c			\
+	$(top_srcdir)/examples/common/blas.c
+
+mpi_lu_plu_example_double_LDADD =	\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
+	$(STARPU_LIBNUMA_LDFLAGS)				\
+	$(STARPU_BLAS_LDFLAGS)
+
+mpi_lu_plu_example_double_SOURCES =	\
+	mpi_lu/plu_example_double.c	\
+	mpi_lu/plu_solve_double.c  	\
+	mpi_lu/pdlu_kernels.c	    	\
+	mpi_lu/pdlu.c		    	\
+	$(top_srcdir)/examples/common/blas.c
+endif
+
+########################
+# MPI Cholesky example #
+########################
+
+if !NO_BLAS_LIB
+examplebin_PROGRAMS +=		\
+	cholesky/mpi_cholesky			\
+	cholesky/mpi_cholesky_distributed
+
+cholesky_mpi_cholesky_SOURCES	=		\
+	cholesky/mpi_cholesky.c		\
+	cholesky/mpi_cholesky_models.c		\
+	cholesky/mpi_cholesky_kernels.c	\
+	cholesky/mpi_cholesky_codelets.c	\
+	$(top_srcdir)/examples/common/blas.c
+
+cholesky_mpi_cholesky_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
+	$(STARPU_BLAS_LDFLAGS)
+
+cholesky_mpi_cholesky_distributed_SOURCES =	\
+	cholesky/mpi_cholesky_distributed.c	\
+	cholesky/mpi_cholesky_models.c		\
+	cholesky/mpi_cholesky_kernels.c	\
+	cholesky/mpi_cholesky_codelets.c	\
+	$(top_srcdir)/examples/common/blas.c
+
+cholesky_mpi_cholesky_distributed_LDADD =	\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
+	$(STARPU_BLAS_LDFLAGS)
+
+check_PROGRAMS +=					\
+	cholesky/mpi_cholesky			\
+	cholesky/mpi_cholesky_distributed
+endif
+
+########################
+# Scatter Gather       #
+########################
+
+examplebin_PROGRAMS +=		\
+	scatter_gather/mpi_scatter_gather
+
+scatter_gather_mpi_scatter_gather_LDADD =	\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+
+check_PROGRAMS +=		\
+	scatter_gather/mpi_scatter_gather
+
+###################
+# Reduction       #
+###################
+
+examplebin_PROGRAMS +=		\
+	reduction/mpi_reduction
+
+reduction_mpi_reduction_SOURCES =		\
+	reduction/mpi_reduction.c		\
+	reduction/mpi_reduction_kernels.c
+
+reduction_mpi_reduction_LDADD =	\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+
+check_PROGRAMS +=		\
+	reduction/mpi_reduction
+
+###################
+# complex example #
+###################
+
+examplebin_PROGRAMS +=				\
+	complex/mpi_complex
+
+complex_mpi_complex_SOURCES =		\
+	complex/mpi_complex.c		\
+	$(top_srcdir)/examples/interface/complex_interface.c
+
+complex_mpi_complex_LDADD =		\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+
+check_PROGRAMS	+=	\
+	complex/mpi_complex
+endif
+
+
+showcheck:
+	-cat $(TEST_LOGS) /dev/null
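
Note: with STARPU_MPI_CHECK enabled, each check program above is launched through $(MPIEXEC) -np 2 (via LOG_COMPILER on recent automake, TESTS_ENVIRONMENT otherwise), so "make check" effectively runs, for example:

	mpiexec -np 2 cholesky/mpi_cholesky

assuming MPIEXEC resolves to mpiexec on the build machine.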

+ 251 - 0
mpi/examples/cholesky/mpi_cholesky.c

@@ -0,0 +1,251 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "mpi_cholesky.h"
+#include "mpi_cholesky_models.h"
+#include "mpi_cholesky_codelets.h"
+
+/* Returns the rank of the MPI node that owns block (x, y) */
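+/* (e.g. with dblockx = dblocky = 2, block (x,y) goes to rank (x%2) + 2*(y%2),
+ *  i.e. a 2x2 grid over 4 nodes) */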
+int my_distrib(int x, int y, int nb_nodes)
+{
+	//return (x+y) % nb_nodes;
+	return (x%dblockx)+(y%dblocky)*dblockx;
+}
+
+int main(int argc, char **argv)
+{
+	/* create a simple symmetric positive definite matrix example
+	 *
+	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
+	 * */
+
+	float ***bmat;
+	int rank, nodes, ret;
+
+	parse_args(argc, argv);
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_mpi_initialize_extended(&rank, &nodes);
+	starpu_helper_cublas_init();
+
+	if (dblockx == -1 || dblocky == -1)
+	{
+	     int factor;
+	     dblockx = nodes;
+	     dblocky = 1;
+	     for(factor=sqrt(nodes) ; factor>1 ; factor--)
+	     {
+		  if (nodes % factor == 0)
+		  {
+		       dblockx = nodes/factor;
+		       dblocky = factor;
+		       break;
+		  }
+	     }
+	}
+
+	unsigned i,j,x,y;
+	bmat = malloc(nblocks * sizeof(float *));
+	for(x=0 ; x<nblocks ; x++)
+	{
+		bmat[x] = malloc(nblocks * sizeof(float *));
+		for(y=0 ; y<nblocks ; y++)
+		{
+			starpu_malloc((void **)&bmat[x][y], BLOCKSIZE*BLOCKSIZE*sizeof(float));
+			for (i = 0; i < BLOCKSIZE; i++)
+			{
+				for (j = 0; j < BLOCKSIZE; j++)
+				{
+					bmat[x][y][j +i*BLOCKSIZE] = (1.0f/(1.0f+(i+(x*BLOCKSIZE)+j+(y*BLOCKSIZE)))) + ((i+(x*BLOCKSIZE) == j+(y*BLOCKSIZE))?1.0f*size:0.0f);
+					//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
+				}
+			}
+		}
+	}
+
+
+	if (display)
+	{
+		printf("[%d] Input :\n", rank);
+
+		for(y=0 ; y<nblocks ; y++)
+		{
+			for(x=0 ; x<nblocks ; x++)
+			{
+				printf("Block %u,%u :\n", x, y);
+				for (j = 0; j < BLOCKSIZE; j++)
+				{
+					for (i = 0; i < BLOCKSIZE; i++)
+					{
+						if (i <= j)
+						{
+							printf("%2.2f\t", bmat[y][x][j +i*BLOCKSIZE]);
+						}
+						else
+						{
+							printf(".\t");
+						}
+					}
+					printf("\n");
+				}
+			}
+		}
+	}
+
+	double timing, flops;
+	dw_cholesky(bmat, size, size/nblocks, nblocks, rank, nodes, &timing, &flops);
+
+	starpu_mpi_shutdown();
+
+	if (display)
+	{
+		printf("[%d] Results :\n", rank);
+		for(y=0 ; y<nblocks ; y++)
+		{
+			for(x=0 ; x<nblocks ; x++)
+			{
+				printf("Block %u,%u :\n", x, y);
+				for (j = 0; j < BLOCKSIZE; j++)
+				{
+					for (i = 0; i < BLOCKSIZE; i++)
+					{
+						if (i <= j)
+						{
+							printf("%2.2f\t", bmat[y][x][j +i*BLOCKSIZE]);
+						}
+						else
+						{
+							printf(".\t");
+						}
+					}
+					printf("\n");
+				}
+			}
+		}
+	}
+
+	float *rmat = malloc(size*size*sizeof(float));
+	for(x=0 ; x<nblocks ; x++)
+	{
+		for(y=0 ; y<nblocks ; y++)
+		{
+			for (i = 0; i < BLOCKSIZE; i++)
+			{
+				for (j = 0; j < BLOCKSIZE; j++)
+				{
+					rmat[j+(y*BLOCKSIZE)+(i+(x*BLOCKSIZE))*size] = bmat[x][y][j +i*BLOCKSIZE];
+				}
+			}
+		}
+	}
+
+	fprintf(stderr, "[%d] compute explicit LLt ...\n", rank);
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i > j)
+			{
+				rmat[j+i*size] = 0.0f; // debug
+			}
+		}
+	}
+	float *test_mat = malloc(size*size*sizeof(float));
+	STARPU_ASSERT(test_mat);
+
+	SSYRK("L", "N", size, size, 1.0f,
+			rmat, size, 0.0f, test_mat, size);
+
+	fprintf(stderr, "[%d] comparing results ...\n", rank);
+	if (display)
+	{
+		for (j = 0; j < size; j++)
+		{
+			for (i = 0; i < size; i++)
+			{
+				if (i <= j)
+				{
+					printf("%2.2f\t", test_mat[j +i*size]);
+				}
+				else
+				{
+					printf(".\t");
+				}
+			}
+			printf("\n");
+		}
+	}
+
+	int correctness = 1;
+	for(x = 0; x < nblocks ;  x++)
+	{
+		for (y = 0; y < nblocks; y++)
+		{
+			int mpi_rank = my_distrib(x, y, nodes);
+			if (mpi_rank == rank)
+			{
+				for (i = (size/nblocks)*x ; i < (size/nblocks)*x+(size/nblocks); i++)
+				{
+					for (j = (size/nblocks)*y ; j < (size/nblocks)*y+(size/nblocks); j++)
+					{
+						if (i <= j)
+						{
+							float orig = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
+							float err = fabsf(test_mat[j +i*size] - orig);
+							if (err > 0.00001)
+							{
+								fprintf(stderr, "[%d] Error[%u, %u] --> %2.2f != %2.2f (err %2.2f)\n", rank, i, j, test_mat[j +i*size], orig, err);
+								correctness = 0;
+								flops = 0;
+								break;
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+
+	for(x=0 ; x<nblocks ; x++)
+	{
+		for(y=0 ; y<nblocks ; y++)
+		{
+			starpu_free((void *)bmat[x][y]);
+		}
+		free(bmat[x]);
+	}
+	free(bmat);
+	free(rmat);
+	free(test_mat);
+
+	starpu_helper_cublas_shutdown();
+	starpu_shutdown();
+
+	assert(correctness);
+
+	if (rank == 0)
+	{
+		fprintf(stdout, "Computation time (in ms): %2.2f\n", timing/1000);
+		fprintf(stdout, "Synthetic GFlops : %2.2f\n", (flops/timing/1000.0f));
+	}
+
+	return 0;
+}
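
Note on the verification step above: after the factorization, bmat holds the Cholesky factor, so the code gathers it into rmat, zeroes the opposite triangle, and uses SSYRK to form L*transpose(L). Entry (i,j) of that product should match the original matrix a(i,j) = 1/(1+i+j) + size*delta(i,j) (Hilbert plus a diagonal shift to make it positive definite), up to the 1e-5 tolerance used in the comparison loop.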

+ 106 - 0
mpi/examples/cholesky/mpi_cholesky.h

@@ -0,0 +1,106 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __MPI_CHOLESKY_H__
+#define __MPI_CHOLESKY_H__
+
+#include <string.h>
+#include <math.h>
+#include <sys/time.h>
+#ifdef STARPU_USE_CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cublas.h>
+#endif
+
+#include <common/blas.h>
+#include <starpu.h>
+
+#define BLOCKSIZE	(size/nblocks)
+
+static unsigned size = 4*1024;
+static unsigned nblocks = 16;
+static unsigned nbigblocks = 2;
+static unsigned noprio = 0;
+static unsigned display = 0;
+static int dblockx = -1;
+static int dblocky = -1;
+
+void chol_cpu_codelet_update_u11(void **, void *);
+void chol_cpu_codelet_update_u21(void **, void *);
+void chol_cpu_codelet_update_u22(void **, void *);
+
+#ifdef STARPU_USE_CUDA
+void chol_cublas_codelet_update_u11(void *descr[], void *_args);
+void chol_cublas_codelet_update_u21(void *descr[], void *_args);
+void chol_cublas_codelet_update_u22(void *descr[], void *_args);
+#endif
+
+static void __attribute__((unused)) parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-size") == 0)
+		{
+		        char *argptr;
+			size = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-dblockx") == 0)
+		{
+		        char *argptr;
+			dblockx = strtol(argv[++i], &argptr, 10);
+		}
+		
+		if (strcmp(argv[i], "-dblocky") == 0)
+		{
+		        char *argptr;
+			dblocky = strtol(argv[++i], &argptr, 10);
+		}
+	
+		if (strcmp(argv[i], "-nblocks") == 0)
+		{
+		        char *argptr;
+			nblocks = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-nbigblocks") == 0)
+		{
+		        char *argptr;
+			nbigblocks = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-no-prio") == 0)
+		{
+			noprio = 1;
+		}
+
+		if (strcmp(argv[i], "-display") == 0)
+		{
+			display = 1;
+		}
+
+		if (strcmp(argv[i], "-h") == 0)
+		{
+			printf("usage : %s [-display] [-size size] [-nblocks nblocks] [-dblockx x] [-dblocky y] [-nbigblocks n] [-no-prio]\n", argv[0]);
+		}
+	}
+	if (nblocks > size) nblocks = size;
+}
+
+#endif // __MPI_CHOLESKY_H__

+ 174 - 0
mpi/examples/cholesky/mpi_cholesky_codelets.c

@@ -0,0 +1,174 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "mpi_cholesky.h"
+#include "mpi_cholesky_models.h"
+#include "mpi_cholesky_codelets.h"
+
+/*
+ *	Create the codelets
+ */
+
+static struct starpu_codelet cl11 =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
+#endif
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
+	.model = &chol_model_11
+};
+
+static struct starpu_codelet cl21 =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
+#endif
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_RW},
+	.model = &chol_model_21
+};
+
+static struct starpu_codelet cl22 =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
+#endif
+	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_R, STARPU_RW},
+	.model = &chol_model_22
+};
+
+extern int my_distrib(int x, int y, int nb_nodes);
+
+/*
+ *	code to bootstrap the factorization
+ *	and construct the DAG
+ */
+void dw_cholesky(float ***matA, unsigned size, unsigned ld, unsigned nblocks, int rank, int nodes, double *timing, double *flops)
+{
+	struct timeval start;
+	struct timeval end;
+	starpu_data_handle_t **data_handles;
+	int x, y;
+
+	/* create all the DAG nodes */
+	unsigned i,j,k;
+
+	data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
+	for(x=0 ; x<nblocks ; x++) data_handles[x] = malloc(nblocks*sizeof(starpu_data_handle_t));
+
+	for(x = 0; x < nblocks ;  x++)
+	{
+		for (y = 0; y < nblocks; y++)
+		{
+			int mpi_rank = my_distrib(x, y, nodes);
+			if (mpi_rank == rank)
+			{
+				//fprintf(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
+				starpu_matrix_data_register(&data_handles[x][y], 0, (uintptr_t)matA[x][y],
+						ld, size/nblocks, size/nblocks, sizeof(float));
+			}
+			/* TODO: use a better test to register only what is needed */
+			else
+			{
+				/* I don't own that index, but will need it for my computations */
+				//fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
+				starpu_matrix_data_register(&data_handles[x][y], -1, (uintptr_t)NULL,
+						ld, size/nblocks, size/nblocks, sizeof(float));
+			}
+			if (data_handles[x][y])
+			{
+				starpu_data_set_rank(data_handles[x][y], mpi_rank);
+				starpu_data_set_tag(data_handles[x][y], (y*nblocks)+x);
+			}
+		}
+	}
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+	gettimeofday(&start, NULL);
+
+	for (k = 0; k < nblocks; k++)
+	{
+		int prio = STARPU_DEFAULT_PRIO;
+		if (!noprio) prio = STARPU_MAX_PRIO;
+
+		starpu_mpi_insert_task(MPI_COMM_WORLD, &cl11,
+				STARPU_PRIORITY, prio,
+				STARPU_RW, data_handles[k][k],
+				0);
+
+		for (j = k+1; j<nblocks; j++)
+		{
+			prio = STARPU_DEFAULT_PRIO;
+			if (!noprio && (j == k+1)) prio = STARPU_MAX_PRIO;
+			starpu_mpi_insert_task(MPI_COMM_WORLD, &cl21,
+					STARPU_PRIORITY, prio,
+					STARPU_R, data_handles[k][k],
+					STARPU_RW, data_handles[k][j],
+					0);
+
+			for (i = k+1; i<nblocks; i++)
+			{
+				if (i <= j)
+				{
+					prio = STARPU_DEFAULT_PRIO;
+					if (!noprio && (i == k+1) && (j == k+1)) prio = STARPU_MAX_PRIO;
+					starpu_mpi_insert_task(MPI_COMM_WORLD, &cl22,
+							STARPU_PRIORITY, prio,
+							STARPU_R, data_handles[k][i],
+							STARPU_R, data_handles[k][j],
+							STARPU_RW, data_handles[i][j],
+							0);
+				}
+			}
+		}
+	}
+
+	starpu_task_wait_for_all();
+
+	for(x = 0; x < nblocks ;  x++)
+	{
+		for (y = 0; y < nblocks; y++)
+		{
+			if (data_handles[x][y])
+				starpu_data_unregister(data_handles[x][y]);
+		}
+		free(data_handles[x]);
+	}
+	free(data_handles);
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+	gettimeofday(&end, NULL);
+
+	/* Write the results through the output parameters: main() reads *timing
+	 * and *flops after dw_cholesky() returns (a local 'timing' here would
+	 * shadow the parameter and leave them uninitialized) */
+	*timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	*flops = (1.0f*size*size*size)/3.0f;
+
+	if (rank == 0)
+	{
+		fprintf(stdout, "Computation time (in ms): %2.2f\n", *timing/1000);
+		fprintf(stdout, "Synthetic GFlops : %2.2f\n", (*flops/(*timing)/1000.0f));
+	}
+}
+
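
Note: the insert-task loop above builds the classical right-looking tile Cholesky DAG. As a sequential sketch (with hypothetical kernel names standing in for the cl11/cl21/cl22 codelets):

	for (k = 0; k < nblocks; k++) {
		POTRF(A[k][k]);                            /* cl11: factor the diagonal tile */
		for (j = k+1; j < nblocks; j++) {
			TRSM(A[k][k], A[k][j]);            /* cl21: triangular solve of a panel tile */
			for (i = k+1; i <= j; i++)
				UPDATE(A[k][i], A[k][j], A[i][j]); /* cl22: trailing-matrix update */
		}
	}

Every node runs the same starpu_mpi_insert_task calls; StarPU-MPI uses the rank attached to each handle (starpu_data_set_rank above) to decide which node executes each task and to generate the required transfers automatically.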

+ 27 - 0
mpi/examples/cholesky/mpi_cholesky_codelets.h

@@ -0,0 +1,27 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __MPI_CHOLESKY_CODELETS_H__
+#define __MPI_CHOLESKY_CODELETS_H__
+
+/*
+ *	code to bootstrap the factorization
+ *	and construct the DAG
+ */
+void dw_cholesky(float ***matA, unsigned size, unsigned ld, unsigned nblocks, int rank, int nodes, double *timing, double *flops);
+
+#endif /* __MPI_CHOLESKY_CODELETS_H__ */

+ 117 - 0
mpi/examples/cholesky/mpi_cholesky_distributed.c

@@ -0,0 +1,117 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "mpi_cholesky.h"
+#include "mpi_cholesky_models.h"
+#include "mpi_cholesky_codelets.h"
+
+/* Returns the rank of the MPI node that owns block (x, y) */
+int my_distrib(int x, int y, int nb_nodes)
+{
+	//return (x+y) % nb_nodes;
+	return (x%dblockx)+(y%dblocky)*dblockx;
+}
+
+int main(int argc, char **argv)
+{
+	/* create a simple symmetric positive definite matrix example
+	 *
+	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
+	 * */
+
+	float ***bmat;
+	int rank, nodes, ret;
+
+	parse_args(argc, argv);
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	starpu_mpi_initialize_extended(&rank, &nodes);
+	starpu_helper_cublas_init();
+
+	if (dblockx == -1 || dblocky == -1)
+	{
+	     int factor;
+	     dblockx = nodes;
+	     dblocky = 1;
+	     for(factor=sqrt(nodes) ; factor>1 ; factor--)
+	     {
+		  if (nodes % factor == 0)
+		  {
+		       dblockx = nodes/factor;
+		       dblocky = factor;
+		       break;
+		  }
+	     }
+	}
+
+	unsigned i,j,x,y;
+	bmat = malloc(nblocks * sizeof(float *));
+	for(x=0 ; x<nblocks ; x++)
+	{
+		bmat[x] = malloc(nblocks * sizeof(float *));
+		for(y=0 ; y<nblocks ; y++)
+		{
+			int mpi_rank = my_distrib(x, y, nodes);
+			if (mpi_rank == rank)
+			{
+				starpu_malloc((void **)&bmat[x][y], BLOCKSIZE*BLOCKSIZE*sizeof(float));
+				for (i = 0; i < BLOCKSIZE; i++)
+				{
+					for (j = 0; j < BLOCKSIZE; j++)
+					{
+						bmat[x][y][j +i*BLOCKSIZE] = (1.0f/(1.0f+(i+(x*BLOCKSIZE)+j+(y*BLOCKSIZE)))) + ((i+(x*BLOCKSIZE) == j+(y*BLOCKSIZE))?1.0f*size:0.0f);
+						//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
+					}
+				}
+			}
+		}
+	}
+
+	double timing, flops;
+	dw_cholesky(bmat, size, size/nblocks, nblocks, rank, nodes, &timing, &flops);
+
+	starpu_mpi_shutdown();
+
+	if (rank == 0)
+	{
+		fprintf(stdout, "Computation time (in ms): %2.2f\n", timing/1000);
+		fprintf(stdout, "Synthetic GFlops : %2.2f\n", (flops/timing/1000.0f));
+	}
+
+
+	for(x=0 ; x<nblocks ; x++)
+	{
+		for(y=0 ; y<nblocks ; y++)
+		{
+			int mpi_rank = my_distrib(x, y, nodes);
+			if (mpi_rank == rank)
+			{
+				starpu_free((void *)bmat[x][y]);
+			}
+		}
+		free(bmat[x]);
+	}
+	free(bmat);
+
+	starpu_helper_cublas_shutdown();
+	starpu_shutdown();
+
+	return 0;
+}

+ 236 - 0
mpi/examples/cholesky/mpi_cholesky_kernels.c

@@ -0,0 +1,236 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "mpi_cholesky.h"
+#include "common/blas.h"
+#ifdef STARPU_USE_CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cublas.h>
+#ifdef STARPU_HAVE_MAGMA
+#include "magma.h"
+#include "magma_lapack.h"
+#endif
+#endif
+
+/*
+ *   U22
+ */
+
+static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, __attribute__((unused)) void *_args)
+{
+	//printf("22\n");
+	float *left 	= (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+	float *right 	= (float *)STARPU_MATRIX_GET_PTR(descr[1]);
+	float *center 	= (float *)STARPU_MATRIX_GET_PTR(descr[2]);
+
+	unsigned dx = STARPU_MATRIX_GET_NY(descr[2]);
+	unsigned dy = STARPU_MATRIX_GET_NX(descr[2]);
+	unsigned dz = STARPU_MATRIX_GET_NY(descr[0]);
+
+	unsigned ld21 = STARPU_MATRIX_GET_LD(descr[0]);
+	unsigned ld12 = STARPU_MATRIX_GET_LD(descr[1]);
+	unsigned ld22 = STARPU_MATRIX_GET_LD(descr[2]);
+
+#ifdef STARPU_USE_CUDA
+	cublasStatus st;
+#endif
+
+	switch (s)
+	{
+		case 0:
+			SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21,
+				right, ld12, 1.0f, center, ld22);
+			break;
+#ifdef STARPU_USE_CUDA
+		case 1:
+			cublasSgemm('n', 't', dy, dx, dz,
+					-1.0f, left, ld21, right, ld12,
+					 1.0f, center, ld22);
+			st = cublasGetError();
+			if (STARPU_UNLIKELY(st != CUBLAS_STATUS_SUCCESS))
+				STARPU_CUBLAS_REPORT_ERROR(st);
+
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+}
+
+void chol_cpu_codelet_update_u22(void *descr[], void *_args)
+{
+	chol_common_cpu_codelet_update_u22(descr, 0, _args);
+}
+
+#ifdef STARPU_USE_CUDA
+void chol_cublas_codelet_update_u22(void *descr[], void *_args)
+{
+	chol_common_cpu_codelet_update_u22(descr, 1, _args);
+}
+#endif// STARPU_USE_CUDA
+
+/*
+ * U21
+ */
+
+static inline void chol_common_codelet_update_u21(void *descr[], int s, __attribute__((unused)) void *_args)
+{
+//	printf("21\n");
+	float *sub11;
+	float *sub21;
+
+	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+	sub21 = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
+
+	unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]);
+	unsigned ld21 = STARPU_MATRIX_GET_LD(descr[1]);
+
+	unsigned nx21 = STARPU_MATRIX_GET_NY(descr[1]);
+	unsigned ny21 = STARPU_MATRIX_GET_NX(descr[1]);
+
+	switch (s)
+	{
+		case 0:
+			STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
+			break;
+#ifdef STARPU_USE_CUDA
+		case 1:
+			cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+}
+
+void chol_cpu_codelet_update_u21(void *descr[], void *_args)
+{
+	 chol_common_codelet_update_u21(descr, 0, _args);
+}
+
+#ifdef STARPU_USE_CUDA
+void chol_cublas_codelet_update_u21(void *descr[], void *_args)
+{
+	chol_common_codelet_update_u21(descr, 1, _args);
+}
+#endif
+
+/*
+ *	U11
+ */
+
+static inline void chol_common_codelet_update_u11(void *descr[], int s, __attribute__((unused)) void *_args)
+{
+//	printf("11\n");
+	float *sub11;
+
+	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+
+	unsigned nx = STARPU_MATRIX_GET_NY(descr[0]);
+	unsigned ld = STARPU_MATRIX_GET_LD(descr[0]);
+
+	unsigned z;
+
+	switch (s)
+	{
+		case 0:
+
+			/*
+			 *	- alpha 11 <- lambda 11 = sqrt(alpha11)
+			 *	- alpha 21 <- l 21	= alpha 21 / lambda 11
+			 *	- A22 <- A22 - l21 trans(l21)
+			 */
+
+			for (z = 0; z < nx; z++)
+			{
+				float lambda11;
+				lambda11 = sqrt(sub11[z+z*ld]);
+				sub11[z+z*ld] = lambda11;
+
+				STARPU_ASSERT(lambda11 != 0.0f);
+
+				SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
+
+				SSYR("L", nx - z - 1, -1.0f,
+							&sub11[(z+1)+z*ld], 1,
+							&sub11[(z+1)+(z+1)*ld], ld);
+			}
+			break;
+#ifdef STARPU_USE_CUDA
+		case 1:
+#ifdef STARPU_HAVE_MAGMA
+			{
+				int ret;
+				int info;
+				ret = magma_spotrf_gpu('L', nx, sub11, ld, &info);
+				if (ret != MAGMA_SUCCESS)
+				{
+					fprintf(stderr, "Error in Magma: %d\n", ret);
+					STARPU_ABORT();
+				}
+				cudaError_t cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
+				STARPU_ASSERT(!cures);
+			}
+#else
+			for (z = 0; z < nx; z++)
+			{
+				float lambda11;
+				cudaMemcpyAsync(&lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+				cudaStreamSynchronize(starpu_cuda_get_local_stream());
+
+				STARPU_ASSERT(lambda11 != 0.0f);
+
+				lambda11 = sqrt(lambda11);
+
+				cublasSetVector(1, sizeof(float), &lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float));
+
+				cublasSscal(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
+
+				cublasSsyr('U', nx - z - 1, -1.0f,
+							&sub11[(z+1)+z*ld], 1,
+							&sub11[(z+1)+(z+1)*ld], ld);
+			}
+
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+#endif
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+}
+
+
+void chol_cpu_codelet_update_u11(void *descr[], void *_args)
+{
+	chol_common_codelet_update_u11(descr, 0, _args);
+}
+
+#ifdef STARPU_USE_CUDA
+void chol_cublas_codelet_update_u11(void *descr[], void *_args)
+{
+	chol_common_codelet_update_u11(descr, 1, _args);
+}
+#endif// STARPU_USE_CUDA

+ 40 - 0
mpi/examples/cholesky/mpi_cholesky_models.c

@@ -0,0 +1,40 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_cholesky_models.h"
+
+/*
+ *	Number of flops of Gemm
+ */
+
+struct starpu_perfmodel chol_model_11 =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "chol_model_11"
+};
+
+struct starpu_perfmodel chol_model_21 =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "chol_model_21"
+};
+
+struct starpu_perfmodel chol_model_22 =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "chol_model_22"
+};

+ 27 - 0
mpi/examples/cholesky/mpi_cholesky_models.h

@@ -0,0 +1,27 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DW_CHOLESKY_MODELS_H__
+#define __DW_CHOLESKY_MODELS_H__
+
+#include <starpu.h>
+
+extern struct starpu_perfmodel chol_model_11;
+extern struct starpu_perfmodel chol_model_21;
+extern struct starpu_perfmodel chol_model_22;
+
+#endif // __DW_CHOLESKY_MODELS_H__

+ 75 - 0
mpi/examples/complex/mpi_complex.c

@@ -0,0 +1,75 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <interface/complex_interface.h>
+#include <interface/complex_codelet.h>
+
+int main(int argc, char **argv)
+{
+	int rank, nodes;
+	int ret;
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	starpu_mpi_initialize_extended(&rank, &nodes);
+
+	if (nodes < 2)
+	{
+		fprintf(stderr, "This program needs at least 2 nodes\n");
+		ret = 77;
+	}
+	else
+	{
+		if (rank == 0)
+		{
+			double real[2] = {4.0, 2.0};
+			double imaginary[2] = {7.0, 9.0};
+			starpu_data_handle_t handle;
+
+			double real2[2] = {14.0, 12.0};
+			double imaginary2[2] = {17.0, 19.0};
+			starpu_data_handle_t handle2;
+			MPI_Status status;
+
+			starpu_complex_data_register(&handle, 0, real, imaginary, 2);
+			starpu_insert_task(&cl_display, STARPU_R, handle, 0);
+			starpu_mpi_send(handle, 1, 10, MPI_COMM_WORLD);
+
+			starpu_complex_data_register(&handle2, -1, real2, imaginary2, 2);
+			starpu_mpi_recv(handle2, 1, 11, MPI_COMM_WORLD, &status);
+			starpu_insert_task(&cl_display, STARPU_R, handle2, 0);
+			starpu_insert_task(&cl_compare, STARPU_R, handle, STARPU_R, handle2, 0);
+		}
+		else if (rank == 1)
+		{
+			double real[2] = {0.0, 0.0};
+			double imaginary[2] = {0.0, 0.0};
+			starpu_data_handle_t handle;
+			MPI_Status status;
+
+			starpu_complex_data_register(&handle, 0, real, imaginary, 2);
+			starpu_mpi_recv(handle, 0, 10, MPI_COMM_WORLD, &status);
+			starpu_insert_task(&cl_display, STARPU_R, handle, 0);
+			starpu_mpi_send(handle, 0, 11, MPI_COMM_WORLD);
+		}
+	}
+	starpu_task_wait_for_all();
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	return ret;
+}

+ 42 - 0
mpi/examples/mpi_lu/mpi_lu-double.h

@@ -0,0 +1,42 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#define TYPE double
+#define MPI_TYPE	MPI_DOUBLE
+
+#define STARPU_PLU(name)       starpu_pdlu_##name
+
+#define CUBLAS_GEMM	cublasDgemm
+#define CUBLAS_TRSM	cublasDtrsm
+#define CUBLAS_SCAL	cublasDscal
+#define CUBLAS_GER	cublasDger
+#define CUBLAS_SWAP	cublasDswap
+#define CUBLAS_IAMAX	cublasIdamax
+
+#define CPU_GEMM	DGEMM
+#define CPU_GEMV	DGEMV
+#define CPU_TRSM	DTRSM
+#define CPU_SCAL	DSCAL
+#define CPU_GER		DGER
+#define CPU_SWAP	DSWAP
+
+#define CPU_TRMM	DTRMM
+#define CPU_AXPY	DAXPY
+#define CPU_ASUM	DASUM
+#define CPU_IAMAX	IDAMAX
+
+#define PIVOT_THRESHHOLD	10e-10

+ 42 - 0
mpi/examples/mpi_lu/mpi_lu-float.h

@@ -0,0 +1,42 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#define TYPE float
+#define MPI_TYPE	MPI_FLOAT
+
+#define STARPU_PLU(name)       starpu_pslu_##name
+
+#define CUBLAS_GEMM	cublasSgemm
+#define CUBLAS_TRSM	cublasStrsm
+#define CUBLAS_SCAL	cublasSscal
+#define CUBLAS_GER	cublasSger
+#define CUBLAS_SWAP	cublasSswap
+#define CUBLAS_IAMAX	cublasIsamax
+
+#define CPU_GEMM	SGEMM
+#define CPU_GEMV	SGEMV
+#define CPU_TRSM	STRSM
+#define CPU_SCAL	SSCAL
+#define CPU_GER		SGER
+#define CPU_SWAP	SSWAP
+
+#define CPU_TRMM	STRMM
+#define CPU_AXPY	SAXPY
+#define CPU_ASUM	SASUM
+#define CPU_IAMAX	ISAMAX
+
+#define PIVOT_THRESHHOLD	10e-5

+ 19 - 0
mpi/examples/mpi_lu/pdlu.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-double.h"
+#include "pxlu.c"

+ 19 - 0
mpi/examples/mpi_lu/pdlu_kernels.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-double.h"
+#include "pxlu_kernels.c"

+ 577 - 0
mpi/examples/mpi_lu/plu_example.c

@@ -0,0 +1,577 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include <math.h>
+#include <starpu.h>
+
+#include "pxlu.h"
+//#include "pxlu_kernels.h"
+
+#ifdef STARPU_HAVE_LIBNUMA
+#include <numaif.h>
+#endif
+
+static unsigned long size = 16384;
+static unsigned nblocks = 16;
+static unsigned check = 0;
+static unsigned p = 1;
+static unsigned q = 1;
+static unsigned display = 0;
+
+#ifdef STARPU_HAVE_LIBNUMA
+static unsigned numa = 0;
+#endif
+
+static size_t allocated_memory = 0;
+static size_t allocated_memory_extra = 0;
+
+static starpu_data_handle_t *dataA_handles;
+static TYPE **dataA;
+
+/* In order to implement the distributed LU decomposition, we allocate
+ * temporary buffers */
+#ifdef SINGLE_TMP11
+static starpu_data_handle_t tmp_11_block_handle;
+static TYPE *tmp_11_block;
+#else
+static starpu_data_handle_t *tmp_11_block_handles;
+static TYPE **tmp_11_block;
+#endif
+#ifdef SINGLE_TMP1221
+static starpu_data_handle_t *tmp_12_block_handles;
+static TYPE **tmp_12_block;
+static starpu_data_handle_t *tmp_21_block_handles;
+static TYPE **tmp_21_block;
+#else
+static starpu_data_handle_t *(tmp_12_block_handles[2]);
+static TYPE **(tmp_12_block[2]);
+static starpu_data_handle_t *(tmp_21_block_handles[2]);
+static TYPE **(tmp_21_block[2]);
+#endif
+
+int get_block_rank(unsigned i, unsigned j);
+
+static void parse_args(int rank, int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++) {
+		if (strcmp(argv[i], "-size") == 0) {
+			char *argptr;
+			size = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-nblocks") == 0) {
+			char *argptr;
+			nblocks = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-check") == 0) {
+			check = 1;
+		}
+
+		if (strcmp(argv[i], "-display") == 0) {
+			display = 1;
+		}
+
+		if (strcmp(argv[i], "-numa") == 0) {
+#ifdef STARPU_HAVE_LIBNUMA
+			numa = 1;
+#else
+			if (rank == 0)
+				fprintf(stderr, "Warning: libnuma is not available\n");
+#endif
+		}
+
+		if (strcmp(argv[i], "-p") == 0) {
+			char *argptr;
+			p = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-q") == 0) {
+			char *argptr;
+			q = strtol(argv[++i], &argptr, 10);
+		}
+	}
+}
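+
+/* Typical invocation (main() below asserts p*q == number of MPI ranks), e.g.:
+ *   mpiexec -np 4 ./plu_example_double -size 16384 -nblocks 16 -p 2 -q 2 */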
+
+unsigned STARPU_PLU(display_flag)(void)
+{
+	return display;
+}
+
+static void fill_block_with_random(TYPE *blockptr, unsigned size, unsigned nblocks)
+{
+	const unsigned block_size = (size/nblocks);
+
+	unsigned i, j;
+	for (i = 0; i < block_size; i++)
+	for (j = 0; j < block_size; j++)
+	{
+		blockptr[j+i*block_size] = (TYPE)starpu_drand48();
+	}
+}
+
+#ifdef SINGLE_TMP11
+starpu_data_handle_t STARPU_PLU(get_tmp_11_block_handle)(void)
+{
+	return tmp_11_block_handle;
+}
+#else
+starpu_data_handle_t STARPU_PLU(get_tmp_11_block_handle)(unsigned k)
+{
+	return tmp_11_block_handles[k];
+}
+#endif
+
+#ifdef SINGLE_TMP1221
+starpu_data_handle_t STARPU_PLU(get_tmp_12_block_handle)(unsigned j)
+{
+	return tmp_12_block_handles[j];
+}
+
+starpu_data_handle_t STARPU_PLU(get_tmp_21_block_handle)(unsigned i)
+{
+	return tmp_21_block_handles[i];
+}
+#else
+starpu_data_handle_t STARPU_PLU(get_tmp_12_block_handle)(unsigned j, unsigned k)
+{
+	return tmp_12_block_handles[k%2][j];
+}
+
+starpu_data_handle_t STARPU_PLU(get_tmp_21_block_handle)(unsigned i, unsigned k)
+{
+	return tmp_21_block_handles[k%2][i];
+}
+#endif
+
+static unsigned tmp_11_block_is_needed(int rank, unsigned nblocks, unsigned k)
+{
+	return 1;
+}
+
+static unsigned tmp_12_block_is_needed(int rank, unsigned nblocks, unsigned j)
+{
+	unsigned i;
+	for (i = 1; i < nblocks; i++)
+	{
+		if (get_block_rank(i, j) == rank)
+			return 1;
+	}
+
+	return 0;
+}
+
+static unsigned tmp_21_block_is_needed(int rank, unsigned nblocks, unsigned i)
+{
+	unsigned j;
+	for (j = 1; j < nblocks; j++)
+	{
+		if (get_block_rank(i, j) == rank)
+			return 1;
+	}
+
+	return 0;
+}
+
+static void init_matrix(int rank)
+{
+#ifdef STARPU_HAVE_LIBNUMA
+	if (numa)
+	{
+		fprintf(stderr, "Using INTERLEAVE policy\n");
+		unsigned long nodemask = ((1<<0)|(1<<1));
+		int ret = set_mempolicy(MPOL_INTERLEAVE, &nodemask, 3);
+		if (ret)
+			perror("set_mempolicy failed");
+	}
+#endif
+
+	/* Allocate a grid of data handles, not all of them have to be allocated later on */
+	dataA_handles = calloc(nblocks*nblocks, sizeof(starpu_data_handle_t));
+	dataA = calloc(nblocks*nblocks, sizeof(TYPE *));
+	allocated_memory_extra += nblocks*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *));
+
+	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
+
+	/* Allocate all the blocks that belong to this mpi node */
+	unsigned long i,j;
+	for (j = 0; j < nblocks; j++)
+	{
+		for (i = 0; i < nblocks; i++)
+		{
+			TYPE **blockptr = &dataA[j+i*nblocks];
+//			starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i];
+			starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i];
+
+			if (get_block_rank(i, j) == rank)
+			{
+				/* This block should be handled by the current MPI process */
+				/* Allocate and fill it */
+				starpu_malloc((void **)blockptr, blocksize);
+				allocated_memory += blocksize;
+
+				//fprintf(stderr, "Rank %d : fill block (i = %d, j = %d)\n", rank, i, j);
+				fill_block_with_random(*blockptr, size, nblocks);
+				//fprintf(stderr, "Rank %d : fill block (i = %d, j = %d)\n", rank, i, j);
+				if (i == j)
+				{
+					unsigned tmp;
+					for (tmp = 0; tmp < size/nblocks; tmp++)
+					{
+						(*blockptr)[tmp*((size/nblocks)+1)] += (TYPE)10*nblocks;
+					}
+				}
+
+				/* Register it to StarPU */
+				starpu_matrix_data_register(handleptr, 0,
+					(uintptr_t)*blockptr, size/nblocks,
+					size/nblocks, size/nblocks, sizeof(TYPE));
+			}
+			else {
+				*blockptr = STARPU_POISON_PTR;
+				*handleptr = STARPU_POISON_PTR;
+			}
+		}
+	}
+
+	/* Allocate the temporary buffers required for the distributed algorithm */
+
+	unsigned k;
+
+	/* tmp buffer 11 */
+#ifdef SINGLE_TMP11
+	starpu_malloc((void **)&tmp_11_block, blocksize);
+	allocated_memory_extra += blocksize;
+	starpu_matrix_data_register(&tmp_11_block_handle, 0, (uintptr_t)tmp_11_block,
+			size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
+#else
+	tmp_11_block_handles = calloc(nblocks, sizeof(starpu_data_handle_t));
+	tmp_11_block = calloc(nblocks, sizeof(TYPE *));
+	allocated_memory_extra += nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *));
+
+	for (k = 0; k < nblocks; k++)
+	{
+		if (tmp_11_block_is_needed(rank, nblocks, k))
+		{
+			starpu_malloc((void **)&tmp_11_block[k], blocksize);
+			allocated_memory_extra += blocksize;
+			STARPU_ASSERT(tmp_11_block[k]);
+
+			starpu_matrix_data_register(&tmp_11_block_handles[k], 0,
+				(uintptr_t)tmp_11_block[k],
+				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
+		}
+	}
+#endif
+
+	/* tmp buffers 12 and 21 */
+#ifdef SINGLE_TMP1221
+	tmp_12_block_handles = calloc(nblocks, sizeof(starpu_data_handle_t));
+	tmp_21_block_handles = calloc(nblocks, sizeof(starpu_data_handle_t));
+	tmp_12_block = calloc(nblocks, sizeof(TYPE *));
+	tmp_21_block = calloc(nblocks, sizeof(TYPE *));
+
+	allocated_memory_extra += 2*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *));
+#else
+	for (i = 0; i < 2; i++) {
+		tmp_12_block_handles[i] = calloc(nblocks, sizeof(starpu_data_handle_t));
+		tmp_21_block_handles[i] = calloc(nblocks, sizeof(starpu_data_handle_t));
+		tmp_12_block[i] = calloc(nblocks, sizeof(TYPE *));
+		tmp_21_block[i] = calloc(nblocks, sizeof(TYPE *));
+
+		allocated_memory_extra += 2*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *));
+	}
+#endif
+	
+	for (k = 0; k < nblocks; k++)
+	{
+#ifdef SINGLE_TMP1221
+		if (tmp_12_block_is_needed(rank, nblocks, k))
+		{
+			starpu_malloc((void **)&tmp_12_block[k], blocksize);
+			allocated_memory_extra += blocksize;
+			STARPU_ASSERT(tmp_12_block[k]);
+
+			starpu_matrix_data_register(&tmp_12_block_handles[k], 0,
+				(uintptr_t)tmp_12_block[k],
+				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
+		}
+
+		if (tmp_21_block_is_needed(rank, nblocks, k))
+		{
+			starpu_malloc((void **)&tmp_21_block[k], blocksize);
+			allocated_memory_extra += blocksize;
+			STARPU_ASSERT(tmp_21_block[k]);
+
+			starpu_matrix_data_register(&tmp_21_block_handles[k], 0,
+				(uintptr_t)tmp_21_block[k],
+				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
+		}
+#else
+	for (i = 0; i < 2; i++) {
+		if (tmp_12_block_is_needed(rank, nblocks, k))
+		{
+			starpu_malloc((void **)&tmp_12_block[i][k], blocksize);
+			allocated_memory_extra += blocksize;
+			STARPU_ASSERT(tmp_12_block[i][k]);
+	
+			starpu_matrix_data_register(&tmp_12_block_handles[i][k], 0,
+				(uintptr_t)tmp_12_block[i][k],
+				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
+		}
+
+		if (tmp_21_block_is_needed(rank, nblocks, k))
+		{
+			starpu_malloc((void **)&tmp_21_block[i][k], blocksize);
+			allocated_memory_extra += blocksize;
+			STARPU_ASSERT(tmp_21_block[i][k]);
+	
+			starpu_matrix_data_register(&tmp_21_block_handles[i][k], 0,
+				(uintptr_t)tmp_21_block[i][k],
+				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
+		}
+	}
+#endif
+	}
+
+	//display_all_blocks(nblocks, size/nblocks);
+}
+
+TYPE *STARPU_PLU(get_block)(unsigned i, unsigned j)
+{
+	return dataA[j+i*nblocks];
+}
+
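+/* Illustration (hypothetical p = q = 2, i.e. 4 MPI ranks): get_block_rank
+ * lays the blocks out as
+ *          j=0 j=1 j=2 j=3
+ *   i=0     0   2   0   2
+ *   i=1     1   3   1   3
+ *   i=2     0   2   0   2
+ *   i=3     1   3   1   3
+ */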
+int get_block_rank(unsigned i, unsigned j)
+{
+	/* Take a 2D block cyclic distribution */
+	/* NB: p (resp. q) is for "direction" i (resp. j) */
+	return (j % q) * p + (i % p);
+}
+
+starpu_data_handle_t STARPU_PLU(get_block_handle)(unsigned i, unsigned j)
+{
+	return dataA_handles[j+i*nblocks];
+}
+
+static void display_grid(int rank, unsigned nblocks)
+{
+	if (!display)
+		return;
+
+	//if (rank == 0)
+	{
+		fprintf(stderr, "2D grid layout (Rank %d): \n", rank);
+		
+		unsigned i, j;
+		for (j = 0; j < nblocks; j++)
+		{
+			for (i = 0; i < nblocks; i++)
+			{
+				TYPE *blockptr = STARPU_PLU(get_block)(i, j);
+				starpu_data_handle_t handle = STARPU_PLU(get_block_handle)(i, j);
+
+				fprintf(stderr, "%d (data %p handle %p)", get_block_rank(i, j), blockptr, handle);
+			}
+			fprintf(stderr, "\n");
+		}
+	}
+}
+
+int main(int argc, char **argv)
+{
+	int rank = 0; /* parse_args() reads rank before starpu_mpi_initialize_extended() fills it in */
+	int world_size;
+
+#if 0
+	/*
+	 *	Initialization
+	 */
+	int thread_support;
+	if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS) {
+		fprintf(stderr,"MPI_Init_thread failed\n");
+		exit(1);
+	}
+	if (thread_support == MPI_THREAD_FUNNELED)
+		fprintf(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n");
+	if (thread_support < MPI_THREAD_FUNNELED)
+		fprintf(stderr,"Warning: MPI does not have thread support!\n");
+	
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+#endif
+
+	starpu_srand48((long int)time(NULL));
+
+	parse_args(rank, argc, argv);
+
+	int ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	/* We disable sequential consistency in this example */
+	starpu_data_set_default_sequential_consistency_flag(0);
+
+	starpu_mpi_initialize_extended(&rank, &world_size);
+
+	STARPU_ASSERT(p*q == world_size);
+
+	starpu_helper_cublas_init();
+
+	int barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
+	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
+
+	/*
+	 * 	Problem Init
+	 */
+
+	init_matrix(rank);
+
+	fprintf(stderr, "Rank %d: allocated (%d + %d) MB = %d MB\n", rank,
+			(int)(allocated_memory/(1024*1024)),
+			(int)(allocated_memory_extra/(1024*1024)),
+			(int)((allocated_memory+allocated_memory_extra)/(1024*1024)));
+
+	display_grid(rank, nblocks);
+
+	TYPE *a_r = NULL;
+//	STARPU_PLU(display_data_content)(a_r, size);
+
+	TYPE *x, *y;
+
+	if (check)
+	{
+		x = calloc(size, sizeof(TYPE));
+		STARPU_ASSERT(x);
+
+		y = calloc(size, sizeof(TYPE));
+		STARPU_ASSERT(y);
+
+		if (rank == 0)
+		{
+			unsigned ind;
+			for (ind = 0; ind < size; ind++)
+				x[ind] = (TYPE)starpu_drand48();
+		}
+
+		a_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);
+
+		if (rank == 0)
+			STARPU_PLU(display_data_content)(a_r, size);
+
+//		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
+	}
+
+	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
+	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
+
+	double timing = STARPU_PLU(plu_main)(nblocks, rank, world_size);
+
+	/*
+	 * 	Report performance
+	 */
+
+	int reduce_ret;
+	double min_timing = timing;
+	double max_timing = timing;
+	double sum_timing = timing;
+
+	reduce_ret = MPI_Reduce(&timing, &min_timing, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);
+
+	reduce_ret = MPI_Reduce(&timing, &max_timing, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);
+
+	reduce_ret = MPI_Reduce(&timing, &sum_timing, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);
+
+	if (rank == 0)
+	{
+		fprintf(stderr, "Computation took: %f ms\n", max_timing/1000);
+		fprintf(stderr, "\tMIN : %f ms\n", min_timing/1000);
+		fprintf(stderr, "\tMAX : %f ms\n", max_timing/1000);
+		fprintf(stderr, "\tAVG : %f ms\n", sum_timing/(world_size*1000));
+
+		unsigned n = size;
+		double flop = (2.0f*n*n*n)/3.0f;
+		fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/max_timing/1000.0f));
+	}
+
+	/*
+	 *	Test Result Correctness
+	 */
+
+	if (check)
+	{
+		/*
+		 *	Compute || A - LU ||
+		 */
+
+		STARPU_PLU(compute_lu_matrix)(size, nblocks, a_r);
+
+#if 0
+		/*
+		 *	Compute || Ax - LUx ||
+		 */
+
+		unsigned ind;
+
+		TYPE *y2 = calloc(size, sizeof(TYPE));
+		STARPU_ASSERT(y2);
+		
+		if (rank == 0)
+		{
+			for (ind = 0; ind < size; ind++)
+			{
+				y2[ind] = (TYPE)0.0;
+			}
+		}
+
+		STARPU_PLU(compute_lux)(size, x, y2, nblocks, rank);
+
+		/* Compute y2 = y2 - y */
+	        CPU_AXPY(size, -1.0, y, 1, y2, 1);
+	
+	        TYPE err = CPU_ASUM(size, y2, 1);
+	        int max = CPU_IAMAX(size, y2, 1);
+	
+	        fprintf(stderr, "(A - LU)X Avg error : %e\n", err/(size*size));
+	        fprintf(stderr, "(A - LU)X Max error : %e\n", y2[max]);
+#endif
+	}
+
+	/*
+	 * 	Termination
+	 */
+
+	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
+	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
+
+	starpu_helper_cublas_shutdown();
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+#if 0
+	MPI_Finalize();
+#endif
+
+	return 0;
+}

+ 19 - 0
mpi/examples/mpi_lu/plu_example_double.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-double.h"
+#include "plu_example.c"

+ 19 - 0
mpi/examples/mpi_lu/plu_example_float.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-float.h"
+#include "plu_example.c"

+ 394 - 0
mpi/examples/mpi_lu/plu_solve.c

@@ -0,0 +1,394 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <math.h>
+#include "pxlu.h"
+
+/*
+ *	Various useful functions
+ */
+
+static double frobenius_norm(TYPE *v, unsigned n)
+{
+        double sum2 = 0.0;
+
+        /* compute sqrt(Sum(|x|^2)) */
+
+        unsigned i,j;
+        for (j = 0; j < n; j++)
+        for (i = 0; i < n; i++)
+        {
+                double a = fabs((double)v[i+n*j]);
+                sum2 += a*a;
+        }
+
+        return sqrt(sum2);
+}
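+
+/* For instance (hypothetical values): for the 2x2 block {1 2; 3 4} the
+ * function above returns sqrt(1 + 4 + 9 + 16) = sqrt(30) ~= 5.477, i.e. the
+ * Frobenius norm sqrt(Sum |a_ij|^2) used for the residual check below. */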
+
+void STARPU_PLU(display_data_content)(TYPE *data, unsigned blocksize)
+{
+	if (!STARPU_PLU(display_flag)())
+		return;
+
+	fprintf(stderr, "DISPLAY BLOCK\n");
+
+	unsigned i, j;
+	for (j = 0; j < blocksize; j++)
+	{
+		for (i = 0; i < blocksize; i++)
+		{
+			fprintf(stderr, "%f ", data[j+i*blocksize]);
+		}
+		fprintf(stderr, "\n");
+	}
+
+	fprintf(stderr, "****\n");
+}
+
+void STARPU_PLU(extract_upper)(unsigned block_size, TYPE *inblock, TYPE *outblock)
+{
+	unsigned li, lj;
+	for (lj = 0; lj < block_size; lj++)
+	{
+		/* Upper block diag is 1 */
+		outblock[lj*(block_size + 1)] = (TYPE)1.0;
+
+		for (li = lj + 1; li < block_size; li++)
+		{
+			outblock[lj + li*block_size] = inblock[lj + li*block_size];
+		}
+	}
+}
+
+void STARPU_PLU(extract_lower)(unsigned block_size, TYPE *inblock, TYPE *outblock)
+{
+	unsigned li, lj;
+	for (lj = 0; lj < block_size; lj++)
+	{
+		for (li = 0; li <= lj; li++)
+		{
+			outblock[lj + li*block_size] = inblock[lj + li*block_size];
+		}
+	}
+}
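+
+/* Illustration (hypothetical 2x2 block, column-major): for inblock {a c; b d}
+ * the two helpers above split the in-place LU factors as
+ *	extract_upper -> {1 c; 0 1}	(the unit diagonal goes to U)
+ *	extract_lower -> {a 0; b d}	(the pivots stay in L)
+ * matching the convention used by compute_lu_matrix() below. */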
+
+/*
+ *	Compute Ax = y
+ */
+
+static void STARPU_PLU(compute_ax_block)(unsigned block_size, TYPE *block_data, TYPE *sub_x, TYPE *sub_y)
+{
+	fprintf(stderr, "block data %p sub x %p sub y %p\n", block_data, sub_x, sub_y);
+	CPU_GEMV("N", block_size, block_size, 1.0, block_data, block_size, sub_x, 1, 1.0, sub_y, 1);
+}
+
+static void STARPU_PLU(compute_ax_block_upper)(unsigned size, unsigned nblocks,
+				 TYPE *block_data, TYPE *sub_x, TYPE *sub_y)
+{
+	unsigned block_size = size/nblocks;
+
+	/* Take a copy of the upper part of the diagonal block */
+	TYPE *upper_block_copy = calloc((block_size)*(block_size), sizeof(TYPE));
+	STARPU_PLU(extract_upper)(block_size, block_data, upper_block_copy);
+		
+	STARPU_PLU(compute_ax_block)(block_size, upper_block_copy, sub_x, sub_y);
+	
+	free(upper_block_copy);
+}
+
+static void STARPU_PLU(compute_ax_block_lower)(unsigned size, unsigned nblocks,
+				 TYPE *block_data, TYPE *sub_x, TYPE *sub_y)
+{
+	unsigned block_size = size/nblocks;
+
+	/* Take a copy of the lower part of the diagonal block */
+	TYPE *lower_block_copy = calloc((block_size)*(block_size), sizeof(TYPE));
+	STARPU_PLU(extract_lower)(block_size, block_data, lower_block_copy);
+
+	STARPU_PLU(compute_ax_block)(size/nblocks, lower_block_copy, sub_x, sub_y);
+	
+	free(lower_block_copy);
+}
+
+void STARPU_PLU(compute_lux)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, int rank)
+{
+	/* Create temporary buffers where all MPI processes are going to
+	 * compute Ui x = yi where Ui is the matrix containing the blocks of U
+	 * assigned to process i, and 0 everywhere else. We then have y as the
+	 * sum of all yi. */
+	TYPE *yi = calloc(size, sizeof(TYPE));
+
+	fprintf(stderr, "Compute LUx\n");
+
+	unsigned block_size = size/nblocks;
+
+	/* Compute UiX = Yi */
+	unsigned long i,j;
+	for (j = 0; j < nblocks; j++)
+	{
+		if (get_block_rank(j, j) == rank)
+		{
+			TYPE *block_data = STARPU_PLU(get_block)(j, j);
+			TYPE *sub_x = &x[j*(block_size)];
+			TYPE *sub_yi = &yi[j*(block_size)];
+
+			STARPU_PLU(compute_ax_block_upper)(size, nblocks, block_data, sub_x, sub_yi);
+		}
+
+		for (i = j + 1; i < nblocks; i++)
+		{
+			if (get_block_rank(i, j) == rank)
+			{
+				/* That block belongs to the current MPI process */
+				TYPE *block_data = STARPU_PLU(get_block)(i, j);
+				TYPE *sub_x = &x[i*(block_size)];
+				TYPE *sub_yi = &yi[j*(block_size)];
+
+				STARPU_PLU(compute_ax_block)(size/nblocks, block_data, sub_x, sub_yi);
+			}
+		}
+	}
+
+	/* Grab Sum Yi in X */
+	MPI_Reduce(yi, x, size, MPI_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
+	memset(yi, 0, size*sizeof(TYPE));
+
+//	unsigned ind;
+//	if (rank == 0)
+//	{
+//		fprintf(stderr, "INTERMEDIATE\n");
+//		for (ind = 0; ind < STARPU_MIN(10, size); ind++)
+//		{
+//			fprintf(stderr, "x[%d] = %f\n", ind, (float)x[ind]);
+//		}
+//		fprintf(stderr, "****\n");
+//	}
+
+	/* Everyone needs x */
+	int bcst_ret;
+	bcst_ret = MPI_Bcast(x, size, MPI_TYPE, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(bcst_ret == MPI_SUCCESS);
+
+	/* Compute LiX = Yi (with X = UX) */
+	for (j = 0; j < nblocks; j++)
+	{
+		if (j > 0)
+		for (i = 0; i < j; i++)
+		{
+			if (get_block_rank(i, j) == rank)
+			{
+				/* That block belongs to the current MPI process */
+				TYPE *block_data = STARPU_PLU(get_block)(i, j);
+				TYPE *sub_x = &x[i*(block_size)];
+				TYPE *sub_yi = &yi[j*(block_size)];
+
+				STARPU_PLU(compute_ax_block)(size/nblocks, block_data, sub_x, sub_yi);
+			}
+		}
+
+		if (get_block_rank(j, j) == rank)
+		{
+			TYPE *block_data = STARPU_PLU(get_block)(j, j);
+			TYPE *sub_x = &x[j*(block_size)];
+			TYPE *sub_yi = &yi[j*(block_size)];
+
+			STARPU_PLU(compute_ax_block_lower)(size, nblocks, block_data, sub_x, sub_yi);
+		}
+	}
+
+	/* Grab Sum Yi in Y */
+	MPI_Reduce(yi, y, size, MPI_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
+
+	free(yi);
+}
+
+
+
+/*
+ *	Allocate a contiguous matrix on node 0 and fill it with the whole
+ *	content of the matrix distributed across all nodes.
+ */
+
+TYPE *STARPU_PLU(reconstruct_matrix)(unsigned size, unsigned nblocks)
+{
+//	fprintf(stderr, "RECONSTRUCT MATRIX size %d nblocks %d\n", size, nblocks);
+
+	TYPE *bigmatrix = calloc(size*size, sizeof(TYPE));
+
+	unsigned block_size = size/nblocks;
+
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+	unsigned bi, bj;
+	for (bj = 0; bj < nblocks; bj++)
+	for (bi = 0; bi < nblocks; bi++)
+	{
+		TYPE *block;
+
+		int block_rank = get_block_rank(bi, bj);
+		
+		if (block_rank == 0)
+		{
+			block = STARPU_PLU(get_block)(bi, bj);
+		}
+		else {
+			MPI_Status status;
+
+			if (rank == 0)
+			{
+				block = calloc(block_size*block_size, sizeof(TYPE));
+
+				int ret = MPI_Recv(block, block_size*block_size, MPI_TYPE, block_rank, 0, MPI_COMM_WORLD, &status);
+				STARPU_ASSERT(ret == MPI_SUCCESS);
+			}
+			else if (rank == block_rank) {
+				block = STARPU_PLU(get_block)(bi, bj);
+				int ret = MPI_Send(block, block_size*block_size, MPI_TYPE, 0, 0, MPI_COMM_WORLD);
+				STARPU_ASSERT(ret == MPI_SUCCESS);
+			}
+		}
+
+		if (rank == 0)
+		{
+			unsigned j, i;
+			for (j = 0; j < block_size; j++)
+			for (i = 0; i < block_size; i++)
+			{
+				bigmatrix[(j + bj*block_size)+(i+bi*block_size)*size] =
+									block[j+i*block_size];
+			}
+
+			if (get_block_rank(bi, bj) != 0)
+				free(block);
+		}
+	}
+
+	return bigmatrix;
+}
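+
+/* Typical use (mirroring main() in plu_example.c): every rank must call this
+ * collectively, since non-root owners send their blocks from inside, and
+ * only rank 0 gets the assembled matrix:
+ *
+ *	TYPE *a_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);
+ *	if (rank == 0)
+ *		STARPU_PLU(display_data_content)(a_r, size);
+ */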
+
+/* x and y must be valid (at least) on 0 */
+void STARPU_PLU(compute_ax)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, int rank)
+{
+	unsigned block_size = size/nblocks;
+
+	/* Send x to everyone */
+	int bcst_ret;
+	bcst_ret = MPI_Bcast(x, size, MPI_TYPE, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(bcst_ret == MPI_SUCCESS);
+
+	/* Create temporary buffers where all MPI processes are going to
+	 * compute Ai x = yi where Ai is the matrix containing the blocks of A
+	 * affected to process i, and 0 everywhere else. We then have y as the
+	 * sum of all yi. */
+	TYPE *yi = calloc(size, sizeof(TYPE));
+
+	/* Compute Aix = yi */
+	unsigned long i,j;
+	for (j = 0; j < nblocks; j++)
+	{
+		for (i = 0; i < nblocks; i++)
+		{
+			if (get_block_rank(i, j) == rank)
+			{
+				/* That block belongs to the current MPI process */
+				TYPE *block_data = STARPU_PLU(get_block)(i, j);
+				TYPE *sub_x = &x[i*block_size];
+				TYPE *sub_yi = &yi[j*block_size];
+
+				STARPU_PLU(compute_ax_block)(block_size, block_data, sub_x, sub_yi);
+			}
+		}
+	}
+
+	/* Compute the Sum of all yi = y */
+	MPI_Reduce(yi, y, size, MPI_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
+
+	fprintf(stderr, "RANK %d - compute_ax done, y[0] = %f\n", rank, y[0]);
+
+	free(yi);
+}
+
+void STARPU_PLU(compute_lu_matrix)(unsigned size, unsigned nblocks, TYPE *Asaved)
+{
+	TYPE *all_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);
+
+	unsigned display = STARPU_PLU(display_flag)();
+
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+	if (rank == 0)
+	{
+	        TYPE *L = malloc((size_t)size*size*sizeof(TYPE));
+	        TYPE *U = malloc((size_t)size*size*sizeof(TYPE));
+	
+	        memset(L, 0, size*size*sizeof(TYPE));
+	        memset(U, 0, size*size*sizeof(TYPE));
+	
+	        /* only keep the lower part */
+		unsigned i, j;
+	        for (j = 0; j < size; j++)
+	        {
+	                for (i = 0; i < j; i++)
+	                {
+	                        L[j+i*size] = all_r[j+i*size];
+	                }
+	
+	                /* diag i = j */
+	                L[j+j*size] = all_r[j+j*size];
+	                U[j+j*size] = 1.0;
+	
+	                for (i = j+1; i < size; i++)
+	                {
+	                        U[j+i*size] = all_r[j+i*size];
+	                }
+	        }
+	
+		STARPU_PLU(display_data_content)(L, size);
+		STARPU_PLU(display_data_content)(U, size);
+	
+	        /* now A_err = L, compute L*U */
+	        CPU_TRMM("R", "U", "N", "U", size, size, 1.0f, U, size, L, size);
+	
+		if (display)
+			fprintf(stderr, "\nLU\n");
+
+		STARPU_PLU(display_data_content)(L, size);
+	
+	        /* compute "LU - A" in L*/
+	        CPU_AXPY(size*size, -1.0, Asaved, 1, L, 1);
+	
+	        TYPE err = CPU_ASUM(size*size, L, 1);
+	        int max = CPU_IAMAX(size*size, L, 1);
+	
+		if (display)
+			fprintf(stderr, "DISPLAY ERROR\n");
+
+		STARPU_PLU(display_data_content)(L, size);
+	
+	        fprintf(stderr, "(A - LU) Avg error : %e\n", err/(size*size));
+	        fprintf(stderr, "(A - LU) Max error : %e\n", L[max]);
+	
+		double residual = frobenius_norm(L, size);
+		double matnorm = frobenius_norm(Asaved, size);
+	
+		fprintf(stderr, "||A-LU|| / (||A||*N) : %e\n", residual/(matnorm*size));
+	}
+}
+

+ 19 - 0
mpi/examples/mpi_lu/plu_solve_double.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-double.h"
+#include "plu_solve.c"

+ 19 - 0
mpi/examples/mpi_lu/plu_solve_float.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-float.h"
+#include "plu_solve.c"

+ 19 - 0
mpi/examples/mpi_lu/pslu.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-float.h"
+#include "pxlu.c"

+ 19 - 0
mpi/examples/mpi_lu/pslu_kernels.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-float.h"
+#include "pxlu_kernels.c"

+ 870 - 0
mpi/examples/mpi_lu/pxlu.c

@@ -0,0 +1,870 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "pxlu.h"
+#include "pxlu_kernels.h"
+#include <sys/time.h>
+
+#define MPI_TAG11(k)	((1U << 16) | (k))
+#define MPI_TAG12(k, j)	((2U << 16) | (k)<<8 | (j))
+#define MPI_TAG21(k, i)	((3U << 16) | (i)<<8 | (k))
+
+// 11 21
+// 12 22
+
+#define TAG11(k)	((starpu_tag_t)( (1ULL<<50) | (unsigned long long)(k)))
+#define TAG12(k,j)	((starpu_tag_t)(((2ULL<<50) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(j))))
+#define TAG21(k,i)	((starpu_tag_t)(((3ULL<<50) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(i))))
+#define TAG22(k,i,j)	((starpu_tag_t)(((4ULL<<50) | ((unsigned long long)(k)<<32) 	\
+					| ((unsigned long long)(i)<<16)	\
+					| (unsigned long long)(j))))
+#define TAG11_SAVE(k)	((starpu_tag_t)( (5ULL<<50) | (unsigned long long)(k)))
+#define TAG12_SAVE(k,j)	((starpu_tag_t)(((6ULL<<50) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(j))))
+#define TAG21_SAVE(k,i)	((starpu_tag_t)(((7ULL<<50) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(i))))
+
+#define TAG11_SAVE_PARTIAL(k)	((starpu_tag_t)( (8ULL<<50) | (unsigned long long)(k)))
+#define TAG12_SAVE_PARTIAL(k,j)	((starpu_tag_t)(((9ULL<<50) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(j))))
+#define TAG21_SAVE_PARTIAL(k,i)	((starpu_tag_t)(((10ULL<<50) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(i))))
+
+#define STARPU_TAG_INIT	((starpu_tag_t)(11ULL<<50))
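+
+/* For instance (hypothetical indices), TAG22(1, 2, 3) packs the task type in
+ * the topmost bits and the loop indices below it:
+ *	TAG22(1, 2, 3) == (4ULL<<50) | (1ULL<<32) | (2ULL<<16) | 3ULL
+ * so application tags stay unique as long as i and j fit in 16 bits; the
+ * narrower MPI_TAG* encodings likewise assume block indices below 256. */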
+
+//#define VERBOSE_INIT	1
+
+//#define DEBUG	1
+
+static unsigned no_prio = 0;
+
+static unsigned nblocks = 0;
+static int rank = -1;
+static int world_size = -1;
+
+struct callback_arg {
+	unsigned i, j, k;
+};
+
+/*
+ *	Various
+ */
+
+static struct debug_info *create_debug_info(unsigned i, unsigned j, unsigned k)
+{
+	struct debug_info *info = malloc(sizeof(struct debug_info));
+
+	info->i = i;
+	info->j = j;
+	info->k = k;
+
+	return info;
+}
+
+static struct starpu_task *create_task(starpu_tag_t id)
+{
+	struct starpu_task *task = starpu_task_create();
+		task->cl_arg = NULL;
+
+	task->use_tag = 1;
+	task->tag_id = id;
+
+	return task;
+}
+
+/* Send handle to every node appearing in the mask, and unlock tag once the
+ * transfers are done. */
+static void send_data_to_mask(starpu_data_handle_t handle, int *rank_mask, int mpi_tag, starpu_tag_t tag)
+{
+	unsigned cnt = 0;
+
+	STARPU_ASSERT(handle != STARPU_POISON_PTR);
+
+	int rank_array[world_size];
+	MPI_Comm comm_array[world_size];
+	int mpi_tag_array[world_size];
+	starpu_data_handle_t handle_array[world_size];
+
+	unsigned r;
+	for (r = 0; r < world_size; r++)
+	{
+		if (rank_mask[r]) {
+			rank_array[cnt] = r;
+
+			comm_array[cnt] = MPI_COMM_WORLD;
+			mpi_tag_array[cnt] = mpi_tag;
+			handle_array[cnt] = handle;
+			cnt++;
+		}
+	}
+
+	if (cnt == 0)
+	{
+		/* In case there is no message to send, we release the tag at
+		 * once */
+		starpu_tag_notify_from_apps(tag);
+	}
+	else {
+		starpu_mpi_isend_array_detached_unlock_tag(cnt, handle_array,
+				rank_array, mpi_tag_array, comm_array, tag);
+	}
+}
+
+/* Initiate a receive request once all dependencies are fulfilled and unlock
+ * tag 'unlocked_tag' once it's done. */
+
+struct recv_when_done_callback_arg {
+	int source;
+	int mpi_tag;
+	starpu_data_handle_t handle;
+	starpu_tag_t unlocked_tag;
+};
+
+static void callback_receive_when_done(void *_arg)
+{
+	struct recv_when_done_callback_arg *arg = _arg;
+
+	starpu_mpi_irecv_detached_unlock_tag(arg->handle, arg->source,
+			arg->mpi_tag, MPI_COMM_WORLD, arg->unlocked_tag);
+
+	free(arg);
+}
+
+static void receive_when_deps_are_done(unsigned ndeps, starpu_tag_t *deps_tags,
+				int source, int mpi_tag,
+				starpu_data_handle_t handle,
+				starpu_tag_t partial_tag,
+				starpu_tag_t unlocked_tag)
+{
+	STARPU_ASSERT(handle != STARPU_POISON_PTR);
+
+	struct recv_when_done_callback_arg *arg =
+		malloc(sizeof(struct recv_when_done_callback_arg));
+	
+	arg->source = source;
+	arg->mpi_tag = mpi_tag;
+	arg->handle = handle;
+	arg->unlocked_tag = unlocked_tag;
+
+	if (ndeps == 0)
+	{
+		callback_receive_when_done(arg);
+		return;
+	}
+
+	starpu_create_sync_task(partial_tag, ndeps, deps_tags,
+					callback_receive_when_done, arg);
+}
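+
+/* The resulting dependency chain is:
+ *	deps_tags[0..ndeps-1] -> sync task (partial_tag) -> detached irecv -> unlocked_tag
+ * so the temporary receive buffer cannot be overwritten while a previous
+ * iteration is still reading it. */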
+
+/*
+ *	Task 11 (diagonal factorization)
+ */
+
+static void create_task_11_recv(unsigned k)
+{
+	/* The current node is not computing that task, so we receive the block
+	 * with MPI */
+
+	/* We don't issue a MPI receive request until everyone using the
+	 * temporary buffer is done : 11_(k-1) can be used by 12_(k-1)j and
+	 * 21_(k-1)i with i,j >= k */
+	unsigned ndeps = 0;
+	starpu_tag_t tag_array[2*nblocks];
+	
+#ifdef SINGLE_TMP11
+	unsigned i, j;
+	if (k > 0)
+	for (i = (k-1)+1; i < nblocks; i++)
+	{
+		if (rank == get_block_rank(i, k-1))
+			tag_array[ndeps++] = TAG21(k-1, i);
+	}
+
+	if (k > 0)
+	for (j = (k-1)+1; j < nblocks; j++)
+	{
+		if (rank == get_block_rank(k-1, j))
+			tag_array[ndeps++] = TAG12(k-1, j);
+	}
+#endif
+	
+	int source = get_block_rank(k, k);
+#ifdef SINGLE_TMP11
+	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_11_block_handle)();
+#else
+	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_11_block_handle)(k);
+#endif
+	int mpi_tag = MPI_TAG11(k);
+	starpu_tag_t partial_tag = TAG11_SAVE_PARTIAL(k);
+	starpu_tag_t unlocked_tag = TAG11_SAVE(k);
+
+//	fprintf(stderr, "NODE %d - 11 (%d) - recv when done ndeps %d - tag array %lx\n", rank, k, ndeps, tag_array[0]);
+	receive_when_deps_are_done(ndeps, tag_array, source, mpi_tag, block_handle, partial_tag, unlocked_tag);
+}
+
+static void find_nodes_using_11(unsigned k, int *rank_mask)
+{
+	memset(rank_mask, 0, world_size*sizeof(int));
+
+	/* Block 11_k is used to compute 21_ki and 12_kj with i,j > k */
+	unsigned i;
+	for (i = k+1; i < nblocks; i++)
+	{
+		int r = get_block_rank(i, k);
+		rank_mask[r] = 1;
+	}
+
+	unsigned j;
+	for (j = k+1; j < nblocks; j++)
+	{
+		int r = get_block_rank(k, j);
+		rank_mask[r] = 1;
+	}
+}
+
+static void callback_task_11_real(void *_arg)
+{
+	struct callback_arg *arg = _arg;
+
+	unsigned k = arg->k;
+
+	/* Find all the nodes potentially requiring this block */
+	int rank_mask[world_size];
+	find_nodes_using_11(k, rank_mask);
+	rank_mask[rank] = 0;
+
+	/* Send the block to those nodes */
+	starpu_data_handle_t block_handle = STARPU_PLU(get_block_handle)(k, k);
+	starpu_tag_t tag = TAG11_SAVE(k);
+	int mpi_tag = MPI_TAG11(k);
+	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
+	
+	free(arg);
+}
+
+static void create_task_11_real(unsigned k)
+{
+	struct starpu_task *task = create_task(TAG11(k));
+
+	task->cl = &STARPU_PLU(cl11);
+
+	task->cl_arg = create_debug_info(k, k, k);
+
+	/* which sub-data is manipulated ? */
+	task->handles[0] = STARPU_PLU(get_block_handle)(k, k);
+
+	struct callback_arg *arg = malloc(sizeof(struct callback_arg));
+		arg->k = k;
+
+	task->callback_func = callback_task_11_real;
+	task->callback_arg = arg;
+
+	/* this is an important task */
+	if (!no_prio)
+		task->priority = STARPU_MAX_PRIO;
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
+	}
+	else {
+		starpu_tag_declare_deps(TAG11(k), 1, STARPU_TAG_INIT);
+	}
+
+	int ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+}
+
+static void create_task_11(unsigned k)
+{
+	if (get_block_rank(k, k) == rank)
+	{
+#ifdef VERBOSE_INIT
+		fprintf(stderr, "CREATE real task 11(%d) (TAG11_SAVE(%d) = %lx) on node %d\n", k, k, TAG11_SAVE(k), rank);
+#endif
+		create_task_11_real(k);
+	}
+	else {
+		/* We don't handle the task, but perhaps we have to generate MPI transfers. */
+		int rank_mask[world_size];
+		find_nodes_using_11(k, rank_mask);
+		
+		if (rank_mask[rank])
+		{
+#ifdef VERBOSE_INIT
+			fprintf(stderr, "create RECV task 11(%d) on node %d\n", k, rank);
+#endif
+			create_task_11_recv(k);
+		}
+		else {
+#ifdef VERBOSE_INIT
+			fprintf(stderr, "Node %d needs not 11(%d)\n", rank, k);
+#endif
+		}
+	}
+}
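+
+/* To summarize the cases above: the owner of block (k,k) submits the real
+ * factorization task, every rank that later consumes 11_k posts a receive,
+ * and the remaining ranks have nothing to do for step k. */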
+
+
+
+/*
+ *	Task 12 (Update lower left (TRSM))
+ */
+
+static void create_task_12_recv(unsigned k, unsigned j)
+{
+	unsigned i;
+
+	/* The current node is not computing that task, so we receive the block
+	 * with MPI */
+
+	/* We don't issue a MPI receive request until everyone using the
+	 * temporary buffer is done : 12_(k-1)j can be used by 22_(k-1)ij with
+	 * i >= k */
+	unsigned ndeps = 0;
+	starpu_tag_t tag_array[nblocks];
+	
+#ifdef SINGLE_TMP1221
+	if (k > 0)
+	for (i = (k-1)+1; i < nblocks; i++)
+#else
+	if (k > 1)
+	for (i = (k-2)+1; i < nblocks; i++)
+#endif
+	{
+		if (rank == get_block_rank(i, j))
+#ifdef SINGLE_TMP1221
+			tag_array[ndeps++] = TAG22(k-1, i, j);
+#else
+			tag_array[ndeps++] = TAG22(k-2, i, j);
+#endif
+	}
+	
+	int source = get_block_rank(k, j);
+#ifdef SINGLE_TMP1221
+	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_12_block_handle)(j);
+#else
+	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_12_block_handle)(j,k);
+#endif
+	int mpi_tag = MPI_TAG12(k, j);
+	starpu_tag_t partial_tag = TAG12_SAVE_PARTIAL(k, j);
+	starpu_tag_t unlocked_tag = TAG12_SAVE(k, j);
+
+	receive_when_deps_are_done(ndeps, tag_array, source, mpi_tag, block_handle, partial_tag, unlocked_tag);
+}
+
+static void find_nodes_using_12(unsigned k, unsigned j, int *rank_mask)
+{
+	memset(rank_mask, 0, world_size*sizeof(int));
+
+	/* Block 12_kj is used to compute 22_kij with i > k */
+	unsigned i;
+	for (i = k+1; i < nblocks; i++)
+	{
+		int r = get_block_rank(i, j);
+		rank_mask[r] = 1;
+	}
+}
+
+static void callback_task_12_real(void *_arg)
+{
+	struct callback_arg *arg = _arg;
+
+	unsigned k = arg->k;
+	unsigned j = arg->j;
+
+	/* Find all the nodes potentially requiring this block */
+	int rank_mask[world_size];
+	find_nodes_using_12(k, j, rank_mask);
+	rank_mask[rank] = 0;
+
+	/* Send the block to those nodes */
+	starpu_data_handle_t block_handle = STARPU_PLU(get_block_handle)(k, j);
+	starpu_tag_t tag = TAG12_SAVE(k, j);
+	int mpi_tag = MPI_TAG12(k, j);
+	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
+	
+	free(arg);
+}
+
+static void create_task_12_real(unsigned k, unsigned j)
+{
+	struct starpu_task *task = create_task(TAG12(k, j));
+	
+#warning temporary fix :/
+//	task->cl = &STARPU_PLU(cl12);
+	task->cl = &STARPU_PLU(cl21);
+
+	task->cl_arg = create_debug_info(j, j, k);
+
+	unsigned diag_block_is_local = (get_block_rank(k, k) == rank);
+
+	starpu_tag_t tag_11_dep; 
+
+	/* which sub-data is manipulated ? */
+	starpu_data_handle_t diag_block;
+	if (diag_block_is_local)
+	{
+		diag_block = STARPU_PLU(get_block_handle)(k, k);
+		tag_11_dep = TAG11(k);
+	}
+	else 
+	{
+#ifdef SINGLE_TMP11
+		diag_block = STARPU_PLU(get_tmp_11_block_handle)();
+#else
+		diag_block = STARPU_PLU(get_tmp_11_block_handle)(k);
+#endif
+		tag_11_dep = TAG11_SAVE(k);
+	}
+
+	task->handles[0] = diag_block; 
+	task->handles[1] = STARPU_PLU(get_block_handle)(k, j); 
+
+	STARPU_ASSERT(get_block_rank(k, j) == rank);
+
+	STARPU_ASSERT(task->handles[0] != STARPU_POISON_PTR);
+	STARPU_ASSERT(task->handles[1] != STARPU_POISON_PTR);
+
+	struct callback_arg *arg = malloc(sizeof(struct callback_arg));
+		arg->j = j;
+		arg->k = k;
+
+	task->callback_func = callback_task_12_real;
+	task->callback_arg = arg;
+
+	if (!no_prio && (j == k+1)) {
+		task->priority = STARPU_MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG12(k, j), 2, tag_11_dep, TAG22(k-1, k, j));
+	}
+	else {
+		starpu_tag_declare_deps(TAG12(k, j), 1, tag_11_dep);
+	}
+
+	int ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+}
+
+static void create_task_12(unsigned k, unsigned j)
+{
+	if (get_block_rank(k, j) == rank)
+	{
+#ifdef VERBOSE_INIT
+		fprintf(stderr, "CREATE real task 12(k = %d, j = %d) on node %d\n", k, j, rank);
+#endif
+		create_task_12_real(k, j);
+	}
+	else {
+		/* We don't handle the task, but perhaps we have to generate MPI transfers. */
+		int rank_mask[world_size];
+		find_nodes_using_12(k, j, rank_mask);
+		
+		if (rank_mask[rank])
+		{
+#ifdef VERBOSE_INIT
+			fprintf(stderr, "create RECV task 12(k = %d, j = %d) on node %d\n", k, j, rank);
+#endif
+			create_task_12_recv(k, j);
+		}
+		else {
+#ifdef VERBOSE_INIT
+			fprintf(stderr, "Node %d needs not 12(k=%d, i=%d)\n", rank, k, j);
+#endif
+		}
+	}
+}
+
+/*
+ *	Task 21 (Update upper right (TRSM))
+ */
+
+static void create_task_21_recv(unsigned k, unsigned i)
+{
+	unsigned j;
+
+	/* The current node is not computing that task, so we receive the block
+	 * with MPI */
+
+	/* We don't issue a MPI receive request until everyone using the
+	 * temporary buffer is done : 21_(k-1)i can be used by 22_(k-1)ij with
+	 * j >= k */
+	unsigned ndeps = 0;
+	starpu_tag_t tag_array[nblocks];
+	
+#ifdef SINGLE_TMP1221
+	if (k > 0)
+	for (j = (k-1)+1; j < nblocks; j++)
+#else
+	if (k > 1)
+	for (j = (k-2)+1; j < nblocks; j++)
+#endif
+	{
+		if (rank == get_block_rank(i, j))
+#ifdef SINGLE_TMP1221
+			tag_array[ndeps++] = TAG22(k-1, i, j);
+#else
+			tag_array[ndeps++] = TAG22(k-2, i, j);
+#endif
+	}
+
+	int source = get_block_rank(i, k);
+#ifdef SINGLE_TMP1221
+	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_21_block_handle)(i);
+#else
+	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_21_block_handle)(i, k);
+#endif
+	int mpi_tag = MPI_TAG21(k, i);
+	starpu_tag_t partial_tag = TAG21_SAVE_PARTIAL(k, i);
+	starpu_tag_t unlocked_tag = TAG21_SAVE(k, i);
+
+//	fprintf(stderr, "NODE %d - 21 (%d, %d) - recv when done ndeps %d - tag array %lx\n", rank, k, i, ndeps, tag_array[0]);
+	receive_when_deps_are_done(ndeps, tag_array, source, mpi_tag, block_handle, partial_tag, unlocked_tag);
+}
+
+static void find_nodes_using_21(unsigned k, unsigned i, int *rank_mask)
+{
+	memset(rank_mask, 0, world_size*sizeof(int));
+
+	/* Block 21_ki is used to compute 22_kij with j > k */
+	unsigned j;
+	for (j = k+1; j < nblocks; j++)
+	{
+		int r = get_block_rank(i, j);
+		rank_mask[r] = 1;
+	}
+}
+
+static void callback_task_21_real(void *_arg)
+{
+	struct callback_arg *arg = _arg;
+
+	unsigned k = arg->k;
+	unsigned i = arg->i;
+
+	/* Find all the nodes potentially requiring this block */
+	int rank_mask[world_size];
+	find_nodes_using_21(k, i, rank_mask);
+	rank_mask[rank] = 0;
+
+	/* Send the block to those nodes */
+	starpu_data_handle_t block_handle = STARPU_PLU(get_block_handle)(i, k);
+	starpu_tag_t tag = TAG21_SAVE(k, i);
+	int mpi_tag = MPI_TAG21(k, i);
+	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
+	
+	free(arg);
+}
+
+static void create_task_21_real(unsigned k, unsigned i)
+{
+	struct starpu_task *task = create_task(TAG21(k, i));
+
+#warning temporary fix 
+//	task->cl = &STARPU_PLU(cl21);
+	task->cl = &STARPU_PLU(cl12);
+
+	task->cl_arg = create_debug_info(i, i, k);
+
+	unsigned diag_block_is_local = (get_block_rank(k, k) == rank);
+
+	starpu_tag_t tag_11_dep; 
+	
+	/* which sub-data is manipulated ? */
+	starpu_data_handle_t diag_block;
+	if (diag_block_is_local)
+	{
+		diag_block = STARPU_PLU(get_block_handle)(k, k);
+		tag_11_dep = TAG11(k);
+	}
+	else 
+	{
+#ifdef SINGLE_TMP11
+		diag_block = STARPU_PLU(get_tmp_11_block_handle)();
+#else
+		diag_block = STARPU_PLU(get_tmp_11_block_handle)(k);
+#endif
+		tag_11_dep = TAG11_SAVE(k);
+	}
+
+	task->handles[0] = diag_block; 
+	task->handles[1] = STARPU_PLU(get_block_handle)(i, k);
+
+	STARPU_ASSERT(task->handles[0] != STARPU_POISON_PTR);
+	STARPU_ASSERT(task->handles[1] != STARPU_POISON_PTR);
+
+	struct callback_arg *arg = malloc(sizeof(struct callback_arg));
+		arg->i = i;
+		arg->k = k;
+
+	task->callback_func = callback_task_21_real;
+	task->callback_arg = arg;
+
+	if (!no_prio && (i == k+1)) {
+		task->priority = STARPU_MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG21(k, i), 2, tag_11_dep, TAG22(k-1, i, k));
+	}
+	else {
+		starpu_tag_declare_deps(TAG21(k, i), 1, tag_11_dep);
+	}
+
+	int ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+}
+
+static void create_task_21(unsigned k, unsigned i)
+{
+	if (get_block_rank(i, k) == rank)
+	{
+#ifdef VERBOSE_INIT
+		fprintf(stderr, "CREATE real task 21(k = %d, i = %d) on node %d\n", k, i, rank);
+#endif
+		create_task_21_real(k, i);
+	}
+	else {
+		/* We don't handle the task, but perhaps we have to generate MPI transfers. */
+		int rank_mask[world_size];
+		find_nodes_using_21(k, i, rank_mask);
+		
+		if (rank_mask[rank])
+		{
+#ifdef VERBOSE_INIT
+			fprintf(stderr, "create RECV task 21(k = %d, i = %d) on node %d\n", k, i, rank);
+#endif
+			create_task_21_recv(k, i);
+		}
+		else {
+#ifdef VERBOSE_INIT
+			fprintf(stderr, "Node %d needs not 21(k=%d, i=%d)\n", rank, k,i);
+#endif
+		}
+	}
+}
+
+/*
+ *	Task 22 (GEMM)
+ */
+
+static void create_task_22_real(unsigned k, unsigned i, unsigned j)
+{
+//	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j));
+
+	struct starpu_task *task = create_task(TAG22(k, i, j));
+
+	task->cl = &STARPU_PLU(cl22);
+
+	task->cl_arg = create_debug_info(i, j, k);
+
+	/* which sub-data is manipulated ? */
+
+	/* produced by TAG21_SAVE(k, i) */ 
+	unsigned block21_is_local = (get_block_rank(i, k) == rank);
+	starpu_tag_t tag_21_dep;
+
+	starpu_data_handle_t block21;
+	if (block21_is_local)
+	{
+		block21 = STARPU_PLU(get_block_handle)(i, k);
+		tag_21_dep = TAG21(k, i);
+	}
+	else 
+	{
+#ifdef SINGLE_TMP1221
+		block21 = STARPU_PLU(get_tmp_21_block_handle)(i);
+#else
+		block21 = STARPU_PLU(get_tmp_21_block_handle)(i, k);
+#endif
+		tag_21_dep = TAG21_SAVE(k, i);
+	}
+
+	/* produced by TAG12_SAVE(k, j) */
+	unsigned block12_is_local = (get_block_rank(k, j) == rank);
+	starpu_tag_t tag_12_dep;
+
+	starpu_data_handle_t block12;
+	if (block12_is_local)
+	{
+	//	block12 = STARPU_PLU(get_block_handle)(j, k);
+		block12 = STARPU_PLU(get_block_handle)(k, j);
+		tag_12_dep = TAG12(k, j);
+	}
+	else 
+	{
+#ifdef SINGLE_TMP1221
+		block12 = STARPU_PLU(get_tmp_12_block_handle)(j);
+#else
+		block12 = STARPU_PLU(get_tmp_12_block_handle)(j, k);
+#endif
+		tag_12_dep = TAG12_SAVE(k, j);
+	}
+
+
+
+#warning temporary fix :/
+	//task->handles[0] = block21;
+	task->handles[0] = block12;
+
+	//task->handles[1] = block12;
+	task->handles[1] = block21;
+
+	/* produced by TAG22(k-1, i, j) */
+	task->handles[2] = STARPU_PLU(get_block_handle)(i, j);
+
+	STARPU_ASSERT(task->handles[0] != STARPU_POISON_PTR);
+	STARPU_ASSERT(task->handles[1] != STARPU_POISON_PTR);
+	STARPU_ASSERT(task->handles[2] != STARPU_POISON_PTR);
+
+	if (!no_prio &&  (i == k + 1) && (j == k +1) ) {
+		task->priority = STARPU_MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), tag_12_dep, tag_21_dep);
+	}
+	else {
+		starpu_tag_declare_deps(TAG22(k, i, j), 2, tag_12_dep, tag_21_dep);
+	}
+
+	int ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+}
+
+static void create_task_22(unsigned k, unsigned i, unsigned j)
+{
+	if (get_block_rank(i, j) == rank)
+	{
+	//	fprintf(stderr, "CREATE real task 22(k = %d, i = %d, j = %d) on node %d\n", k, i, j, rank);
+		create_task_22_real(k, i, j);
+	}
+//	else {
+//		fprintf(stderr, "Node %d needs not 22(k=%d, i=%d, j = %d)\n", rank, k,i,j);
+//	}
+}
+
+static void wait_tag_and_fetch_handle(starpu_tag_t tag, starpu_data_handle_t handle)
+{
+	STARPU_ASSERT(handle != STARPU_POISON_PTR);
+
+	starpu_tag_wait(tag);
+//	fprintf(stderr, "Rank %d : tag %lx is done\n", rank, tag);
+
+	starpu_data_acquire(handle, STARPU_R);
+
+//	starpu_data_unregister(handle);
+}
+
+static void wait_termination(void)
+{
+	unsigned k, i, j;
+	for (k = 0; k < nblocks; k++)
+	{
+		/* Wait task 11k if needed */
+		if (get_block_rank(k, k) == rank)
+		{
+			starpu_data_handle_t diag_block = STARPU_PLU(get_block_handle)(k, k);
+			wait_tag_and_fetch_handle(TAG11_SAVE(k), diag_block);
+		}
+		
+
+		for (i = k + 1; i < nblocks; i++)
+		{
+			/* Wait task 21ki if needed */
+			if (get_block_rank(i, k) == rank)
+			{
+				starpu_data_handle_t block21 = STARPU_PLU(get_block_handle)(i, k);
+				//starpu_data_handle_t block21 = STARPU_PLU(get_block_handle)(k, i);
+				//fprintf(stderr, "BLOCK21 i %d k %d -> handle %p\n", i, k, block21);
+				wait_tag_and_fetch_handle(TAG21_SAVE(k, i), block21);
+			}
+		}
+
+		for (j = k + 1; j < nblocks; j++)
+		{
+			/* Wait task 12kj if needed */
+			if (get_block_rank(k, j) == rank)
+			{
+				//starpu_data_handle_t block12 = STARPU_PLU(get_block_handle)(j, k);
+				starpu_data_handle_t block12 = STARPU_PLU(get_block_handle)(k, j);
+				//fprintf(stderr, "BLOCK12 j %d k %d -> handle %p\n", j, k, block12);
+				wait_tag_and_fetch_handle(TAG12_SAVE(k, j), block12);
+			}
+		}
+	}	
+}
+
+/*
+ *	code to bootstrap the factorization 
+ */
+
+double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
+{
+	struct timeval start;
+	struct timeval end;
+
+	nblocks = _nblocks;
+	rank = _rank;
+	world_size = _world_size;
+
+	/* create all the DAG nodes */
+	unsigned i,j,k;
+
+	for (k = 0; k < nblocks; k++)
+	{
+		create_task_11(k);
+
+		for (i = k+1; i<nblocks; i++)
+		{
+			create_task_12(k, i);
+			create_task_21(k, i);
+		}
+
+		for (i = k+1; i<nblocks; i++)
+		{
+			for (j = k+1; j<nblocks; j++)
+			{
+				create_task_22(k, i, j);
+			}
+		}
+	}
+
+	int barrier_ret = starpu_mpi_barrier(MPI_COMM_WORLD);
+	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
+
+	/* schedule the codelet */
+	gettimeofday(&start, NULL);
+
+	starpu_tag_notify_from_apps(STARPU_TAG_INIT);
+
+	wait_termination();
+	
+	gettimeofday(&end, NULL);
+
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	
+//	fprintf(stderr, "RANK %d -> took %f ms\n", rank, timing/1000);
+	
+	return timing;
+}

+ 65 - 0
mpi/examples/mpi_lu/pxlu.h

@@ -0,0 +1,65 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __PXLU_H__
+#define __PXLU_H__
+
+#include <starpu.h>
+#include <common/blas.h>
+#include <starpu_mpi.h>
+
+#define BLAS3_FLOP(n1,n2,n3)    \
+        (2*((uint64_t)n1)*((uint64_t)n2)*((uint64_t)n3))
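+
+/* e.g. a square GEMM of order n costs BLAS3_FLOP(n, n, n) = 2*n^3 flops */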
+
+//#define SINGLE_TMP11	1
+//#define SINGLE_TMP1221	1
+
+struct debug_info {
+	unsigned i;
+	unsigned j;
+	unsigned k;
+};
+
+double STARPU_PLU(plu_main)(unsigned nblocks, int rank, int world_size);
+
+TYPE *STARPU_PLU(reconstruct_matrix)(unsigned size, unsigned nblocks);
+void STARPU_PLU(compute_lu_matrix)(unsigned size, unsigned nblocks, TYPE *Asaved);
+
+unsigned STARPU_PLU(display_flag)(void);
+
+void STARPU_PLU(compute_ax)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, int rank);
+void STARPU_PLU(compute_lux)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, int rank);
+starpu_data_handle_t STARPU_PLU(get_block_handle)(unsigned i, unsigned j);
+TYPE *STARPU_PLU(get_block)(unsigned i, unsigned j);
+#ifdef SINGLE_TMP11
+starpu_data_handle_t STARPU_PLU(get_tmp_11_block_handle)(void);
+#else
+starpu_data_handle_t STARPU_PLU(get_tmp_11_block_handle)(unsigned k);
+#endif
+#ifdef SINGLE_TMP1221
+starpu_data_handle_t STARPU_PLU(get_tmp_12_block_handle)(unsigned j);
+starpu_data_handle_t STARPU_PLU(get_tmp_21_block_handle)(unsigned i);
+#else
+starpu_data_handle_t STARPU_PLU(get_tmp_12_block_handle)(unsigned j, unsigned k);
+starpu_data_handle_t STARPU_PLU(get_tmp_21_block_handle)(unsigned i, unsigned k);
+#endif
+
+void STARPU_PLU(display_data_content)(TYPE *data, unsigned blocksize);
+
+int get_block_rank(unsigned i, unsigned j);
+
+#endif // __PXLU_H__

+ 444 - 0
mpi/examples/mpi_lu/pxlu_kernels.c

@@ -0,0 +1,444 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "pxlu.h"
+#include "pxlu_kernels.h"
+#include <math.h>
+
+//#define VERBOSE_KERNELS	1
+
+/*
+ *   U22 
+ */
+
+static inline void STARPU_PLU(common_u22)(void *descr[],
+				int s, __attribute__((unused)) void *_args)
+{
+	TYPE *right 	= (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
+	TYPE *left 	= (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
+	TYPE *center 	= (TYPE *)STARPU_MATRIX_GET_PTR(descr[2]);
+
+	unsigned dx = STARPU_MATRIX_GET_NX(descr[2]);
+	unsigned dy = STARPU_MATRIX_GET_NY(descr[2]);
+	unsigned dz = STARPU_MATRIX_GET_NY(descr[0]);
+
+	unsigned ld12 = STARPU_MATRIX_GET_LD(descr[0]);
+	unsigned ld21 = STARPU_MATRIX_GET_LD(descr[1]);
+	unsigned ld22 = STARPU_MATRIX_GET_LD(descr[2]);
+
+#ifdef VERBOSE_KERNELS
+	struct debug_info *info = _args;
+
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	fprintf(stderr, "KERNEL 22 %d - k = %d i = %d j = %d\n", rank, info->k, info->i, info->j);
+#endif
+
+#ifdef STARPU_USE_CUDA
+	cublasStatus status;
+	cudaError_t cures;
+#endif
+
+	switch (s) {
+		case 0:
+			CPU_GEMM("N", "N", dy, dx, dz, 
+				(TYPE)-1.0, right, ld21, left, ld12,
+				(TYPE)1.0, center, ld22);
+			break;
+
+#ifdef STARPU_USE_CUDA
+		case 1:
+			CUBLAS_GEMM('n', 'n', dx, dy, dz,
+				(TYPE)-1.0, right, ld21, left, ld12,
+				(TYPE)1.0, center, ld22);
+
+			status = cublasGetError();
+			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
+				STARPU_CUBLAS_REPORT_ERROR(status);
+
+			if (STARPU_UNLIKELY((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess))
+				STARPU_CUDA_REPORT_ERROR(cures);
+
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+#ifdef VERBOSE_KERNELS
+	fprintf(stderr, "KERNEL 22 %d - k = %d i = %d j = %d done\n", rank, info->k, info->i, info->j);
+#endif
+}
+
+static void STARPU_PLU(cpu_u22)(void *descr[], void *_args)
+{
+	STARPU_PLU(common_u22)(descr, 0, _args);
+}
+
+#ifdef STARPU_USE_CUDA
+static void STARPU_PLU(cublas_u22)(void *descr[], void *_args)
+{
+	STARPU_PLU(common_u22)(descr, 1, _args);
+}
+#endif// STARPU_USE_CUDA
+
+static struct starpu_perfmodel STARPU_PLU(model_22) = {
+	.type = STARPU_HISTORY_BASED,
+#ifdef STARPU_ATLAS
+	.symbol = STARPU_PLU_STR(lu_model_22_atlas)
+#elif defined(STARPU_GOTO)
+	.symbol = STARPU_PLU_STR(lu_model_22_goto)
+#else
+	.symbol = STARPU_PLU_STR(lu_model_22)
+#endif
+};
+
+struct starpu_codelet STARPU_PLU(cl22) = {
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_funcs = {STARPU_PLU(cpu_u22), NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {STARPU_PLU(cublas_u22), NULL},
+#endif
+	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_R, STARPU_RW},
+	.model = &STARPU_PLU(model_22)
+};
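+
+/* The 22 codelet is the trailing-matrix update of right-looking LU: with the
+ * buffer order used above it computes center -= right * left, i.e.
+ * A(i,j) -= A(i,k) * A(k,j) in block terms. */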
+
+
+/*
+ * U12
+ */
+
+static inline void STARPU_PLU(common_u12)(void *descr[],
+				int s, __attribute__((unused)) void *_args)
+{
+	TYPE *sub11;
+	TYPE *sub12;
+
+	sub11 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);	
+	sub12 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
+
+	unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]);
+	unsigned ld12 = STARPU_MATRIX_GET_LD(descr[1]);
+
+	unsigned nx12 = STARPU_MATRIX_GET_NX(descr[1]);
+	unsigned ny12 = STARPU_MATRIX_GET_NY(descr[1]);
+
+#ifdef VERBOSE_KERNELS
+	struct debug_info *info = _args;
+
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+#warning fixed debugging according to other tweak
+	//fprintf(stderr, "KERNEL 12 %d - k = %d i %d\n", rank, info->k, info->i);
+	fprintf(stderr, "KERNEL 21 %d - k = %d i %d\n", rank, info->k, info->j);
+
+	//fprintf(stderr, "INPUT 12 U11\n");
+	fprintf(stderr, "INPUT 21 U11\n");
+	STARPU_PLU(display_data_content)(sub11, nx12);
+	//fprintf(stderr, "INPUT 12 U12\n");
+	fprintf(stderr, "INPUT 21 U21\n");
+	STARPU_PLU(display_data_content)(sub12, nx12);
+#endif
+
+#ifdef STARPU_USE_CUDA
+	cublasStatus status;
+	cudaError_t cures;
+#endif
+
+	/* solve L11 U12 = A12 (find U12) */
+	switch (s) {
+		case 0:
+			CPU_TRSM("L", "L", "N", "N", nx12, ny12,
+					(TYPE)1.0, sub11, ld11, sub12, ld12);
+			break;
+#ifdef STARPU_USE_CUDA
+		case 1:
+			CUBLAS_TRSM('L', 'L', 'N', 'N', ny12, nx12,
+					(TYPE)1.0, sub11, ld11, sub12, ld12);
+
+			status = cublasGetError();
+			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
+				STARPU_CUBLAS_REPORT_ERROR(status);
+
+			if (STARPU_UNLIKELY((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess))
+				STARPU_CUDA_REPORT_ERROR(cures);
+
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+
+#ifdef VERBOSE_KERNELS
+	//fprintf(stderr, "OUTPUT 12 U12\n");
+	fprintf(stderr, "OUTPUT 21 U21\n");
+	STARPU_PLU(display_data_content)(sub12, nx12);
+#endif
+}
+
+static void STARPU_PLU(cpu_u12)(void *descr[], void *_args)
+{
+	STARPU_PLU(common_u12)(descr, 0, _args);
+}
+
+#ifdef STARPU_USE_CUDA
+static void STARPU_PLU(cublas_u12)(void *descr[], void *_args)
+{
+	STARPU_PLU(common_u12)(descr, 1, _args);
+}
+#endif // STARPU_USE_CUDA
+
+static struct starpu_perfmodel STARPU_PLU(model_12) = {
+	.type = STARPU_HISTORY_BASED,
+#ifdef STARPU_ATLAS
+	.symbol = STARPU_PLU_STR(lu_model_12_atlas)
+#elif defined(STARPU_GOTO)
+	.symbol = STARPU_PLU_STR(lu_model_12_goto)
+#else
+	.symbol = STARPU_PLU_STR(lu_model_12)
+#endif
+};
+
+struct starpu_codelet STARPU_PLU(cl12) = {
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_funcs = {STARPU_PLU(cpu_u12), NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {STARPU_PLU(cublas_u12), NULL},
+#endif
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_RW},
+	.model = &STARPU_PLU(model_12)
+};
+
+
+/* 
+ * U21
+ */
+
+static inline void STARPU_PLU(common_u21)(void *descr[],
+				int s, __attribute__((unused)) void *_args)
+{
+	TYPE *sub11;
+	TYPE *sub21;
+
+	sub11 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
+	sub21 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
+
+	unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]);
+	unsigned ld21 = STARPU_MATRIX_GET_LD(descr[1]);
+
+	unsigned nx21 = STARPU_MATRIX_GET_NX(descr[1]);
+	unsigned ny21 = STARPU_MATRIX_GET_NY(descr[1]);
+	
+#ifdef VERBOSE_KERNELS
+	struct debug_info *info = _args;
+
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+#warning fixed debugging according to other tweak
+	//fprintf(stderr, "KERNEL 21 %d (k = %d, i = %d)\n", rank, info->k, info->i);
+	fprintf(stderr, "KERNEL 12 %d (k = %d, j = %d)\n", rank, info->k, info->j);
+
+	//fprintf(stderr, "INPUT 21 U11\n");
+	fprintf(stderr, "INPUT 12 U11\n");
+	STARPU_PLU(display_data_content)(sub11, nx21);
+	//fprintf(stderr, "INPUT 21 U21\n");
+	fprintf(stderr, "INPUT 12 U12\n");
+	STARPU_PLU(display_data_content)(sub21, nx21);
+#endif
+
+#ifdef STARPU_USE_CUDA
+	cublasStatus status;
+#endif
+
+
+	switch (s) {
+		case 0:
+			CPU_TRSM("R", "U", "N", "U", nx21, ny21,
+					(TYPE)1.0, sub11, ld11, sub21, ld21);
+			break;
+#ifdef STARPU_USE_CUDA
+		case 1:
+			CUBLAS_TRSM('R', 'U', 'N', 'U', ny21, nx21,
+					(TYPE)1.0, sub11, ld11, sub21, ld21);
+
+			status = cublasGetError();
+			if (status != CUBLAS_STATUS_SUCCESS)
+				STARPU_CUBLAS_REPORT_ERROR(status);
+
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+
+#ifdef VERBOSE_KERNELS
+	//fprintf(stderr, "OUTPUT 21 U11\n");
+	fprintf(stderr, "OUTPUT 12 U11\n");
+	STARPU_PLU(display_data_content)(sub11, nx21);
+	//fprintf(stderr, "OUTPUT 21 U21\n");
+	fprintf(stderr, "OUTPUT 12 U12\n");
+	STARPU_PLU(display_data_content)(sub21, nx21);
+#endif
+}
+
+static void STARPU_PLU(cpu_u21)(void *descr[], void *_args)
+{
+	STARPU_PLU(common_u21)(descr, 0, _args);
+}
+
+#ifdef STARPU_USE_CUDA
+static void STARPU_PLU(cublas_u21)(void *descr[], void *_args)
+{
+	STARPU_PLU(common_u21)(descr, 1, _args);
+}
+#endif 
+
+static struct starpu_perfmodel STARPU_PLU(model_21) = {
+	.type = STARPU_HISTORY_BASED,
+#ifdef STARPU_ATLAS
+	.symbol = STARPU_PLU_STR(lu_model_21_atlas)
+#elif defined(STARPU_GOTO)
+	.symbol = STARPU_PLU_STR(lu_model_21_goto)
+#else
+	.symbol = STARPU_PLU_STR(lu_model_21)
+#endif
+};
+
+struct starpu_codelet STARPU_PLU(cl21) = {
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_funcs = {STARPU_PLU(cpu_u21), NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {STARPU_PLU(cublas_u21), NULL},
+#endif
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_RW},
+	.model = &STARPU_PLU(model_21)
+};
+
+
+/*
+ *	U11
+ */
+
+static inline void STARPU_PLU(common_u11)(void *descr[],
+				int s, __attribute__((unused)) void *_args)
+{
+	TYPE *sub11;
+
+	sub11 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]); 
+
+	unsigned long nx = STARPU_MATRIX_GET_NX(descr[0]);
+	unsigned long ld = STARPU_MATRIX_GET_LD(descr[0]);
+
+	unsigned long z;
+
+#ifdef VERBOSE_KERNELS
+	struct debug_info *info = _args;
+
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	fprintf(stderr, "KERNEL 11 %d - k = %d\n", rank, info->k);
+#endif
+
+	switch (s) {
+		case 0:
+			for (z = 0; z < nx; z++)
+			{
+				TYPE pivot;
+				pivot = sub11[z+z*ld];
+				STARPU_ASSERT(pivot != 0.0);
+		
+				CPU_SCAL(nx - z - 1, (1.0/pivot), &sub11[z+(z+1)*ld], ld);
+		
+				CPU_GER(nx - z - 1, nx - z - 1, -1.0,
+						&sub11[(z+1)+z*ld], 1,
+						&sub11[z+(z+1)*ld], ld,
+						&sub11[(z+1) + (z+1)*ld],ld);
+			}
+			break;
+#ifdef STARPU_USE_CUDA
+		case 1:
+			for (z = 0; z < nx; z++)
+			{
+				TYPE pivot;
+				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+				cudaStreamSynchronize(starpu_cuda_get_local_stream());
+
+				STARPU_ASSERT(pivot != 0.0);
+				
+				CUBLAS_SCAL(nx - z - 1, 1.0/pivot, &sub11[z+(z+1)*ld], ld);
+				
+				CUBLAS_GER(nx - z - 1, nx - z - 1, -1.0,
+						&sub11[(z+1)+z*ld], 1,
+						&sub11[z+(z+1)*ld], ld,
+						&sub11[(z+1) + (z+1)*ld],ld);
+			}
+			
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+#ifdef VERBOSE_KERNELS
+	fprintf(stderr, "KERNEL 11 %d - k = %d done\n", rank, info->k);
+#endif
+}
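+
+/* The 11 kernel above is an unblocked, unpivoted factorization of the
+ * diagonal block: at each step z it scales the remainder of the pivot row
+ * (SCAL) and applies a rank-1 update (GER) to the trailing submatrix, hence
+ * the assertion that every pivot is nonzero. */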
+
+static void STARPU_PLU(cpu_u11)(void *descr[], void *_args)
+{
+	STARPU_PLU(common_u11)(descr, 0, _args);
+}
+
+#ifdef STARPU_USE_CUDA
+static void STARPU_PLU(cublas_u11)(void *descr[], void *_args)
+{
+	STARPU_PLU(common_u11)(descr, 1, _args);
+}
+#endif// STARPU_USE_CUDA
+
+static struct starpu_perfmodel STARPU_PLU(model_11) = {
+	.type = STARPU_HISTORY_BASED,
+#ifdef STARPU_ATLAS
+	.symbol = STARPU_PLU_STR(lu_model_11_atlas)
+#elif defined(STARPU_GOTO)
+	.symbol = STARPU_PLU_STR(lu_model_11_goto)
+#else
+	.symbol = STARPU_PLU_STR(lu_model_11)
+#endif
+};
+
+struct starpu_codelet STARPU_PLU(cl11) = {
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_funcs = {STARPU_PLU(cpu_u11), NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {STARPU_PLU(cublas_u11), NULL},
+#endif
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
+	.model = &STARPU_PLU(model_11)
+};
+
+

+ 32 - 0
mpi/examples/mpi_lu/pxlu_kernels.h

@@ -0,0 +1,32 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __PXLU_KERNELS_H__
+#define __PXLU_KERNELS_H__
+
+#include <starpu.h>
+
+#define str(s) #s
+#define xstr(s)        str(s)
+#define STARPU_PLU_STR(name)  xstr(STARPU_PLU(name))
+
+struct starpu_codelet STARPU_PLU(cl11);
+struct starpu_codelet STARPU_PLU(cl12);
+struct starpu_codelet STARPU_PLU(cl21);
+struct starpu_codelet STARPU_PLU(cl22);
+
+#endif // __PXLU_KERNELS_H__

+ 19 - 0
mpi/examples/mpi_lu/slu_kernels.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-float.h"
+#include "xlu_kernels.c"

+ 106 - 0
mpi/examples/perf.sh

@@ -0,0 +1,106 @@
+#!/bin/bash
+
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+# 
+# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010  Centre National de la Recherche Scientifique
+# 
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+# 
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# 
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+# 4G x np = 4 * (k*1K) ^ 2
+# A G * np = 4 * k^2 * 1M
+# A * 250 * np = k^2
+# A = 6
+# k = sqrt(1500*np)
+# np = 1 => k = 32
+# np = 2 => k = 48
+# np = 3 => k = 64 
+# np = 4 => k = 64
+
+# Problem size
+NBLOCKS=16
+BLOCKSIZE=1024
+SIZE=$(($NBLOCKS*$BLOCKSIZE))
+
+echo "JOB ID ${PBS_JOBID}"
+
+nnodes=$(cat machinefile.${PBS_JOBID}|wc -l)
+echo "got $nnodes mpi nodes"
+
+# Calibrate
+ncalibrate=0
+for i in `seq 1 $ncalibrate`
+do
+echo "STARPU_CALIBRATE $i/$ncalibrate"
+STARPU_CALIBRATE=1 STARPU_SCHED="dmda" STARPU_PREFETCH=1 mpirun -machinefile machinefile.${PBS_JOBID} -np $nnodes ./mpi_lu/plu_example_float -p 2 -q 2 -nblocks 32 -size $((32*$BLOCKSIZE)) -numa
+done
+
+func()
+{
+ngpus=$1
+np=$2
+p=$3
+q=$4
+nblocks=$5
+
+echo "*******************************************"> log
+echo "*************** NGPUS $ngpus - np $np - nblocks $nblocks **************">> log
+echo "*******************************************">> log
+cat log
+cat log >> log.all
+
+STARPU_NCPUS=0 STARPU_NCUDA=$ngpus STARPU_SCHED="dmda" STARPU_PREFETCH=1 mpirun -machinefile machinefile.${PBS_JOBID} -np $np ./mpi_lu/plu_example_float -p $p -q $q -nblocks $nblocks -size $(($nblocks * $BLOCKSIZE)) -numa > log.out 2> log.err
+cat log.out > log
+cat log.err >> log
+cat log
+cat log >> log.all
+}
+
+rm -f log.all
+
+# How many times do we repeat each experiment?
+nloops=3
+
+per_node_max_memory=7000
+
+for np in 1 2 4
+do
+	for nblocks in 16 32 48 64 80
+	do
+		for ngpus_per_node in 1 2 3 4
+		do
+			for loop in `seq 1 $nloops`
+			do
+				# Compute p and q from np
+				case $np in
+				  1) p=1; q=1;;
+				  2) p=2; q=1;;
+				  4) p=2; q=2;;
+				  *) echo -n "does not support $np nodes yet";;
+				esac
+
+				# Does the problem fit into memory?
+				matrix_size=$(($nblocks * $BLOCKSIZE))
+				per_node_memory=$(($((4*$matrix_size*$matrix_size/(1024*1024))) / $np))
+
+				echo "NP $np P $p Q $q SIZE $per_node_memory NBLOCKS $nblocks"
+
+				if test $per_node_memory -ge $per_node_max_memory; then
+						echo "Problem is too large!"
+				else
+					func $ngpus_per_node $np $p $q $nblocks
+					echo "go !"
+				fi
+			done
+		done
+	done
+done

+ 156 - 0
mpi/examples/reduction/mpi_reduction.c

@@ -0,0 +1,156 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+
+extern void init_cpu_func(void *descr[], void *cl_arg);
+extern void redux_cpu_func(void *descr[], void *cl_arg);
+extern void dot_cpu_func(void *descr[], void *cl_arg);
+
+static struct starpu_codelet init_codelet =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {init_cpu_func, NULL},
+	.nbuffers = 1,
+	.name = "init_codelet"
+};
+
+static struct starpu_codelet redux_codelet =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {redux_cpu_func, NULL},
+	.nbuffers = 2,
+	.name = "redux_codelet"
+};
+
+static struct starpu_codelet dot_codelet =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {dot_cpu_func, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_REDUX},
+	.name = "dot_codelet"
+};
+
+/* Returns the MPI node number which owns the data at the given index */
+int my_distrib(int x, int nb_nodes)
+{
+	return x % nb_nodes;
+}
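+/* e.g. with 3 nodes, indexes 0,3,6,... are owned by node 0, 1,4,7,... by node 1, etc. */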
+
+int main(int argc, char **argv)
+{
+        int my_rank, size, x, y;
+        long int *vector;
+	long int dot, sum=0;
+        starpu_data_handle_t *handles;
+	starpu_data_handle_t dot_handle;
+
+	int nb_elements, step;
+
+	int ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	starpu_mpi_initialize_extended(&my_rank, &size);
+
+	nb_elements = size*8000;
+	step = 4;
+
+	vector = (long int *) malloc(nb_elements*sizeof(vector[0]));
+        for(x = 0; x < nb_elements; x+=step)
+	{
+		int mpi_rank = my_distrib(x/step, size);
+		if (mpi_rank == my_rank)
+		{
+			for(y=0 ; y<step ; y++)
+			{
+				vector[x+y] = x+y+1;
+			}
+		}
+        }
+	if (my_rank == 0) {
+		dot = 14;
+		sum = (nb_elements * (nb_elements + 1)) / 2;
+		sum+= dot;
+		starpu_variable_data_register(&dot_handle, 0, (uintptr_t)&dot, sizeof(dot));
+	}
+	else
+	{
+		starpu_variable_data_register(&dot_handle, -1, (uintptr_t)NULL, sizeof(dot));
+	}
+
+
+	handles = (starpu_data_handle_t *) malloc(nb_elements*sizeof(handles[0]));
+        for(x = 0; x < nb_elements; x+=step)
+	{
+		int mpi_rank = my_distrib(x/step, size);
+		if (mpi_rank == my_rank)
+		{
+			/* Owning data */
+			starpu_vector_data_register(&handles[x], 0, (uintptr_t)&(vector[x]), step, sizeof(vector[0]));
+		}
+		else
+		{
+			starpu_vector_data_register(&handles[x], -1, (uintptr_t)NULL, step, sizeof(vector[0]));
+		}
+		if (handles[x])
+		{
+			starpu_data_set_rank(handles[x], mpi_rank);
+			starpu_data_set_tag(handles[x], x);
+		}
+	}
+
+	starpu_data_set_rank(dot_handle, 0);
+	starpu_data_set_tag(dot_handle, nb_elements+1);
+	starpu_data_set_reduction_methods(dot_handle, &redux_codelet, &init_codelet);
+
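+	/* Each contributing node gets a local copy of dot_handle, initialized
+	 * with init_codelet and merged pairwise with redux_codelet; the
+	 * starpu_mpi_redux_data() call below then collects the per-node
+	 * contributions on the owner of dot_handle, i.e. rank 0. */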
+	for (x = 0; x < nb_elements; x+=step)
+	{
+		starpu_mpi_insert_task(MPI_COMM_WORLD,
+				       &dot_codelet,
+				       STARPU_R, handles[x],
+				       STARPU_REDUX, dot_handle,
+				       0);
+	}
+	starpu_mpi_redux_data(MPI_COMM_WORLD, dot_handle);
+
+        fprintf(stderr, "Waiting ...\n");
+        starpu_task_wait_for_all();
+
+        for(x = 0; x < nb_elements; x+=step)
+	{
+		if (handles[x]) starpu_data_unregister(handles[x]);
+	}
+	if (dot_handle)
+	{
+		starpu_data_unregister(dot_handle);
+	}
+	free(vector);
+	free(handles);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	if (my_rank == 0)
+	{
+                fprintf(stderr, "[%d] sum=%ld\n", my_rank, sum);
+                fprintf(stderr, "[%d] dot=%ld\n", my_rank, dot);
+		fprintf(stderr, "%s when computing reduction\n", (sum == dot) ? "Success" : "Error");
+        }
+
+	return 0;
+}
+

+ 66 - 0
mpi/examples/reduction/mpi_reduction_kernels.c

@@ -0,0 +1,66 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <mpi.h>
+
+#define _DISPLAY(fmt, args ...) do { \
+		int _display_rank; MPI_Comm_rank(MPI_COMM_WORLD, &_display_rank);	\
+		fprintf(stderr, "[%d][%s] " fmt , _display_rank, __func__ ,##args); 	\
+		fflush(stderr); } while(0)
+
+/*
+ *	Codelet to create a neutral element
+ */
+void init_cpu_func(void *descr[], void *cl_arg)
+{
+	long int *dot = (long int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	*dot = 0;
+	_DISPLAY("Init dot\n");
+}
+
+/*
+ *	Codelet to perform the reduction of two elements
+ */
+void redux_cpu_func(void *descr[], void *cl_arg)
+{
+	long int *dota = (long int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	long int *dotb = (long int *)STARPU_VARIABLE_GET_PTR(descr[1]);
+
+	*dota = *dota + *dotb;
+	_DISPLAY("Calling redux %ld=%ld+%ld\n", *dota, *dota-*dotb, *dotb);
+}
+
+/*
+ *	Dot product codelet
+ */
+void dot_cpu_func(void *descr[], void *cl_arg)
+{
+	long int *local_x = (long int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
+
+	long int *dot = (long int *)STARPU_VARIABLE_GET_PTR(descr[1]);
+
+//	_DISPLAY("Before dot=%ld (adding %d elements...)\n", *dot, n);
+	unsigned i;
+	for (i = 0; i < n; i++)
+	{
+//		_DISPLAY("Adding %ld\n", local_x[i]);
+		*dot += local_x[i];
+	}
+//	_DISPLAY("After dot=%ld\n", *dot);
+}
+

+ 228 - 0
mpi/examples/scatter_gather/mpi_scatter_gather.c

@@ -0,0 +1,228 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+
+/* Returns the MPI node number which owns the data at indexes (x, y) */
+int my_distrib(int x, int y, int nb_nodes)
+{
+        return (x+y) % nb_nodes;
+}
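+/* e.g. with 2 nodes, the blocks are distributed in a checkerboard pattern */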
+
+void cpu_codelet(void *descr[], void *_args)
+{
+	float *block;
+	/* the blocks are square, so only one dimension is needed */
+	unsigned nx = STARPU_MATRIX_GET_NX(descr[0]);
+	unsigned ld = STARPU_MATRIX_GET_LD(descr[0]);
+	unsigned i,j;
+	int rank;
+	float factor;
+
+	block = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+        starpu_codelet_unpack_args(_args, &rank);
+	factor = block[0];
+
+	//fprintf(stderr,"rank %d factor %f\n", rank, factor);
+	for (j = 0; j < nx; j++)
+	{
+		for (i = 0; i < nx; i++)
+		{
+			//fprintf(stderr,"rank %d factor %f --> %f %f\n", rank, factor, block[j+i*ld], block[j+i*ld]*factor);
+			block[j+i*ld] *= factor;
+		}
+	}
+}
+
+static struct starpu_codelet cl =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {cpu_codelet, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
+};
+
+int main(int argc, char **argv)
+{
+        int rank, nodes;
+	float ***bmat = NULL;
+        starpu_data_handle_t *data_handles;
+
+	unsigned i,j,x,y;
+
+	unsigned nblocks=4;
+	unsigned block_size=2;
+	unsigned size = nblocks*block_size;
+	unsigned ld = size / nblocks;
+
+	int ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	starpu_mpi_initialize_extended(&rank, &nodes);
+
+	if (rank == 0)
+	{
+		/* Allocate the matrix */
+		int block_number=10;
+		bmat = malloc(nblocks * sizeof(float **));
+		for(x=0 ; x<nblocks ; x++)
+		{
+			bmat[x] = malloc(nblocks * sizeof(float *));
+			for(y=0 ; y<nblocks ; y++)
+			{
+				float value=0.0;
+				starpu_malloc((void **)&bmat[x][y], block_size*block_size*sizeof(float));
+				for (i = 0; i < block_size; i++)
+				{
+					for (j = 0; j < block_size; j++)
+					{
+						bmat[x][y][j +i*block_size] = block_number + value;
+						value++;
+					}
+				}
+				block_number += 10;
+			}
+		}
+	}
+
+#if 0
+	// Print matrix
+	if (rank == 0)
+	{
+		fprintf(stderr, "Input matrix\n");
+		for(x=0 ; x<nblocks ; x++)
+		{
+			for(y=0 ; y<nblocks ; y++)
+			{
+				for (j = 0; j < block_size; j++)
+				{
+					for (i = 0; i < block_size; i++)
+					{
+						fprintf(stderr, "%2.2f\t", bmat[x][y][j+i*block_size]);
+					}
+					fprintf(stderr,"\n");
+				}
+				fprintf(stderr,"\n");
+			}
+		}
+	}
+#endif
+
+	/* Allocate data handles and register data to StarPU */
+        data_handles = malloc(nblocks*nblocks*sizeof(starpu_data_handle_t));
+        for(x = 0; x < nblocks ;  x++)
+	{
+                for (y = 0; y < nblocks; y++)
+		{
+			int mpi_rank = my_distrib(x, y, nodes);
+			if (rank == 0)
+			{
+				starpu_matrix_data_register(&data_handles[x+y*nblocks], 0, (uintptr_t)bmat[x][y],
+							    ld, size/nblocks, size/nblocks, sizeof(float));
+			}
+			else if ((mpi_rank == rank) || ((rank == mpi_rank+1 || rank == mpi_rank-1)))
+			{
+				/* I own that index, or I will need it for my computations */
+				//fprintf(stderr, "[%d] Owning or neighbor of data[%d][%d]\n", rank, x, y);
+				starpu_matrix_data_register(&data_handles[x+y*nblocks], -1, (uintptr_t)NULL,
+							    ld, size/nblocks, size/nblocks, sizeof(float));
+			}
+			else
+			{
+				/* I know it's useless to allocate anything for this */
+				data_handles[x+y*nblocks] = NULL;
+			}
+                        if (data_handles[x+y*nblocks])
+			{
+                                starpu_data_set_rank(data_handles[x+y*nblocks], mpi_rank);
+                                starpu_data_set_tag(data_handles[x+y*nblocks], (y*nblocks)+x);
+			}
+                }
+        }
+
+	/* Scatter the matrix among the nodes */
+	starpu_mpi_scatter_detached(data_handles, nblocks*nblocks, 0, MPI_COMM_WORLD);
+
+	/* Calculation */
+	for(x = 0; x < nblocks*nblocks ;  x++)
+	{
+		if (data_handles[x])
+		{
+			int owner = starpu_data_get_rank(data_handles[x]);
+			if (owner == rank)
+			{
+				//fprintf(stderr,"[%d] Computing on data[%d]\n", rank, x);
+				starpu_insert_task(&cl,
+						   STARPU_VALUE, &rank, sizeof(rank),
+						   STARPU_RW, data_handles[x],
+						   0);
+			}
+		}
+	}
+
+	/* Gather the matrix on main node */
+	starpu_mpi_gather_detached(data_handles, nblocks*nblocks, 0, MPI_COMM_WORLD);
+
+	/* Unregister matrix from StarPU */
+	for(x=0 ; x<nblocks*nblocks ; x++)
+	{
+		if (data_handles[x])
+		{
+			starpu_data_unregister(data_handles[x]);
+		}
+	}
+
+#if 0
+	// Print matrix
+	if (rank == 0)
+	{
+		fprintf(stderr, "Output matrix\n");
+		for(x=0 ; x<nblocks ; x++)
+		{
+			for(y=0 ; y<nblocks ; y++)
+			{
+				for (j = 0; j < block_size; j++)
+				{
+					for (i = 0; i < block_size; i++)
+					{
+						fprintf(stderr, "%2.2f\t", bmat[x][y][j+i*block_size]);
+					}
+					fprintf(stderr,"\n");
+				}
+				fprintf(stderr,"\n");
+			}
+		}
+	}
+#endif
+
+	// Free memory
+        free(data_handles);
+	if (rank == 0)
+	{
+		for(x=0 ; x<nblocks ; x++)
+		{
+			for(y=0 ; y<nblocks ; y++)
+			{
+				starpu_free((void *)bmat[x][y]);
+			}
+			free(bmat[x]);
+		}
+		free(bmat);
+	}
+
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+	return 0;
+}

+ 159 - 0
mpi/examples/stencil/stencil5.c

@@ -0,0 +1,159 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+
+void stencil5_cpu(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	unsigned *xy = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	unsigned *xm1y = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
+	unsigned *xp1y = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[2]);
+	unsigned *xym1 = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[3]);
+	unsigned *xyp1 = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[4]);
+
+        //        fprintf(stdout, "VALUES: %d %d %d %d %d\n", *xy, *xm1y, *xp1y, *xym1, *xyp1);
+        *xy = (*xy + *xm1y + *xp1y + *xym1 + *xyp1) / 5;
+}
+
+struct starpu_codelet stencil5_cl =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {stencil5_cpu, NULL},
+        .nbuffers = 5,
+	.modes = {STARPU_RW, STARPU_R, STARPU_R, STARPU_R, STARPU_R}
+};
+
+#define NITER_DEF 500
+#define X         20
+#define Y         20
+
+int display = 0;
+int niter = NITER_DEF;
+
+/* Returns the MPI node number which owns the data at indexes (x, y) */
+int my_distrib(int x, int y, int nb_nodes)
+{
+	/* Block distrib */
+	return ((int)(x / sqrt(nb_nodes) + (y / sqrt(nb_nodes)) * sqrt(nb_nodes))) % nb_nodes;
+}
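+/* e.g. with nb_nodes = 4: my_distrib(5, 8, 4) = (int)(2.5 + 4.0*2.0) % 4 = 10 % 4 = 2 */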
+
+
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-iter") == 0)
+		{
+			char *argptr;
+			niter = strtol(argv[++i], &argptr, 10);
+		}
+		if (strcmp(argv[i], "-display") == 0)
+		{
+			display = 1;
+		}
+	}
+}
+
+int main(int argc, char **argv)
+{
+        int my_rank, size, x, y, loop;
+        int value=0, mean=0;
+        unsigned matrix[X][Y];
+        starpu_data_handle_t data_handles[X][Y];
+
+	int ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	starpu_mpi_initialize_extended(&my_rank, &size);
+        parse_args(argc, argv);
+
+        for(x = 0; x < X; x++)
+	{
+                for (y = 0; y < Y; y++)
+		{
+                        matrix[x][y] = (my_rank+1)*10 + value;
+                        value++;
+                        mean += matrix[x][y];
+                }
+        }
+        mean /= value;
+
+        for(x = 0; x < X; x++)
+	{
+                for (y = 0; y < Y; y++)
+		{
+                        int mpi_rank = my_distrib(x, y, size);
+                        if (mpi_rank == my_rank)
+			{
+                                //fprintf(stderr, "[%d] Owning data[%d][%d]\n", my_rank, x, y);
+                                starpu_variable_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
+                        }
+			else if (my_rank == my_distrib(x+1, y, size) || my_rank == my_distrib(x-1, y, size)
+			      || my_rank == my_distrib(x, y+1, size) || my_rank == my_distrib(x, y-1, size))
+			{
+                                /* I don't own that index, but will need it for my computations */
+                                //fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", my_rank, x, y);
+                                starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(unsigned));
+                        }
+                        else
+			{
+                                /* I know it's useless to allocate anything for this */
+                                data_handles[x][y] = NULL;
+                        }
+                        if (data_handles[x][y])
+			{
+                                starpu_data_set_rank(data_handles[x][y], mpi_rank);
+                                starpu_data_set_tag(data_handles[x][y], (y*X)+x);
+			}
+                }
+        }
+
+        for(loop=0 ; loop<niter; loop++)
+	{
+                for (x = 1; x < X-1; x++)
+		{
+                        for (y = 1; y < Y-1; y++)
+			{
+                                starpu_mpi_insert_task(MPI_COMM_WORLD, &stencil5_cl, STARPU_RW, data_handles[x][y],
+                                                       STARPU_R, data_handles[x-1][y], STARPU_R, data_handles[x+1][y],
+                                                       STARPU_R, data_handles[x][y-1], STARPU_R, data_handles[x][y+1],
+                                                       0);
+                        }
+                }
+        }
+        fprintf(stderr, "Waiting ...\n");
+        starpu_task_wait_for_all();
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+        if (display)
+	{
+                fprintf(stdout, "[%d] mean=%d\n", my_rank, mean);
+                for(x = 0; x < X; x++)
+		{
+                        fprintf(stdout, "[%d] ", my_rank);
+                        for (y = 0; y < Y; y++)
+			{
+                                fprintf(stdout, "%3u ", matrix[x][y]);
+                        }
+                        fprintf(stdout, "\n");
+                }
+        }
+
+	return 0;
+}

+ 70 - 0
mpi/include/starpu_mpi.h

@@ -0,0 +1,70 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_H__
+#define __STARPU_MPI_H__
+
+#include <starpu.h>
+
+#if defined(STARPU_USE_MPI)
+
+#include <mpi.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void *starpu_mpi_req;
+
+int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *req, int dest, int mpi_tag, MPI_Comm comm);
+int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *req, int source, int mpi_tag, MPI_Comm comm);
+int starpu_mpi_send(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm);
+int starpu_mpi_recv(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, MPI_Status *status);
+int starpu_mpi_isend_detached(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg);
+int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg);
+int starpu_mpi_wait(starpu_mpi_req *req, MPI_Status *status);
+int starpu_mpi_test(starpu_mpi_req *req, int *flag, MPI_Status *status);
+int starpu_mpi_barrier(MPI_Comm comm);
+int starpu_mpi_initialize(void);
+int starpu_mpi_initialize_extended(int *rank, int *world_size);
+int starpu_mpi_shutdown(void);
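+
+/* Typical usage sketch (illustrative: assumes data_handle is already
+ * registered, two ranks, and an arbitrary tag value of 42):
+ *
+ *   int rank, size;
+ *   starpu_init(NULL);
+ *   starpu_mpi_initialize_extended(&rank, &size);
+ *   if (rank == 0)
+ *     starpu_mpi_send(data_handle, 1, 42, MPI_COMM_WORLD);
+ *   else if (rank == 1)
+ *     starpu_mpi_recv(data_handle, 0, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+ *   starpu_mpi_shutdown();
+ *   starpu_shutdown();
+ */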
+
+int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...);
+void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle, int node);
+void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg);
+void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle);
+
+int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm);
+int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm);
+
+/* Some helper functions */
+
+/* When the transfer is completed, the tag is unlocked */
+int starpu_mpi_isend_detached_unlock_tag(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm, starpu_tag_t tag);
+int starpu_mpi_irecv_detached_unlock_tag(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, starpu_tag_t tag);
+
+/* Asynchronously send (resp. receive) an array of buffers, and unlock the
+ * tag once all of them have been transmitted. */
+int starpu_mpi_isend_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *dest, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag);
+int starpu_mpi_irecv_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *source, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // STARPU_USE_MPI
+#endif // __STARPU_MPI_H__

+ 29 - 0
mpi/libstarpumpi.pc.in

@@ -0,0 +1,29 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2011  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: starpumpi
+Description: offers MPI support for heterogeneous multicore architectures
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ -DSTARPU_USE_DEPRECATED_API
+Libs: -L${libdir} -lstarpumpi-@STARPU_EFFECTIVE_VERSION@
+Libs.private: @LDFLAGS@ @LIBS@
+Requires: libstarpu
+Requires.private:

+ 51 - 0
mpi/src/Makefile.am

@@ -0,0 +1,51 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2012  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+CC=$(MPICC)
+CCLD=$(MPICC)
+
+BUILT_SOURCES =
+
+CLEANFILES = *.gcno *.gcda *.linkinfo
+
+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS)
+LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/src/ -I$(top_builddir)/src -I$(top_builddir)/include -I$(top_srcdir)/mpi/include -I$(top_srcdir)/mpi/src
+AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS)
+
+lib_LTLIBRARIES = libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+
+libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_LIBADD = $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la
+libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_LDFLAGS = $(ldflags) -no-undefined					\
+  -version-info $(LIBSTARPUMPI_INTERFACE_CURRENT):$(LIBSTARPUMPI_INTERFACE_REVISION):$(LIBSTARPUMPI_INTERFACE_AGE) \
+  $(MPICC_LDFLAGS) $(FXT_LDFLAGS)
+noinst_HEADERS =					\
+	starpu_mpi_private.h				\
+	starpu_mpi_fxt.h				\
+	starpu_mpi_stats.h				\
+	starpu_mpi_datatype.h
+
+libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_SOURCES =	\
+	starpu_mpi.c					\
+	starpu_mpi_helper.c				\
+	starpu_mpi_datatype.c				\
+	starpu_mpi_insert_task.c			\
+	starpu_mpi_collective.c				\
+	starpu_mpi_stats.c
+
+
+showcheck:
+	-cat /dev/null

+ 867 - 0
mpi/src/starpu_mpi.c

@@ -0,0 +1,867 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdlib.h>
+#include <starpu_mpi.h>
+#include <starpu_mpi_datatype.h>
+//#define STARPU_MPI_VERBOSE	1
+#include <starpu_mpi_private.h>
+#include <starpu_profiling.h>
+#include <starpu_mpi_stats.h>
+
+/* TODO find a better way to select the polling method (perhaps during the
+ * configuration) */
+//#define USE_STARPU_ACTIVITY	1
+
+static void submit_mpi_req(void *arg);
+static void handle_request_termination(struct _starpu_mpi_req *req);
+
+/* The list of requests that have been newly submitted by the application */
+static struct _starpu_mpi_req_list *new_requests;
+
+/* The list of detached requests that have already been submitted to MPI */
+static struct _starpu_mpi_req_list *detached_requests;
+static pthread_mutex_t detached_requests_mutex;
+
+/* Condition to wake up progression thread */
+static pthread_cond_t cond_progression;
+/* Condition to wake up waiting for all current MPI requests to finish */
+static pthread_cond_t cond_finished;
+static pthread_mutex_t mutex;
+static pthread_t progress_thread;
+static int running = 0;
+
+/* Count requests posted by the application and not yet submitted to MPI, i.e. pushed into the new_requests list */
+static pthread_mutex_t mutex_posted_requests;
+static int posted_requests = 0, newer_requests, barrier_running = 0;
+
+#define INC_POSTED_REQUESTS(value) do { _STARPU_PTHREAD_MUTEX_LOCK(&mutex_posted_requests); posted_requests += value; _STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_posted_requests); } while(0)
+
+/*
+ *	Isend
+ */
+
+static void starpu_mpi_isend_func(struct _starpu_mpi_req *req)
+{
+	int count;
+
+        _STARPU_MPI_LOG_IN();
+
+	req->needs_unpacking = starpu_mpi_handle_to_datatype(req->data_handle, &req->datatype, &count);
+	if (req->needs_unpacking)
+		starpu_handle_pack_data(req->data_handle, &req->ptr);
+	else
+		req->ptr = starpu_handle_get_local_ptr(req->data_handle);
+	STARPU_ASSERT(req->ptr);
+
+        _STARPU_MPI_DEBUG("post MPI isend tag %d dst %d ptr %p datatype %p count %d req %p\n", req->mpi_tag, req->srcdst, req->ptr, req->datatype, count, &req->request);
+
+	_starpu_mpi_comm_amounts_inc(req->comm, req->srcdst, req->datatype, count);
+
+        req->ret = MPI_Isend(req->ptr, count, req->datatype, req->srcdst, req->mpi_tag, req->comm, &req->request);
+        STARPU_ASSERT(req->ret == MPI_SUCCESS);
+
+	TRACE_MPI_ISEND(req->srcdst, req->mpi_tag, 0);
+
+	/* somebody is perhaps waiting for the MPI request to be posted */
+	_STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
+	req->submitted = 1;
+	_STARPU_PTHREAD_COND_BROADCAST(&req->req_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
+        _STARPU_MPI_LOG_OUT();
+}
+
+static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t data_handle,
+							int dest, int mpi_tag, MPI_Comm comm,
+							unsigned detached, void (*callback)(void *), void *arg)
+{
+	struct _starpu_mpi_req *req = calloc(1, sizeof(struct _starpu_mpi_req));
+	STARPU_ASSERT(req);
+
+        _STARPU_MPI_LOG_IN();
+
+        INC_POSTED_REQUESTS(1);
+
+	/* Initialize the request structure */
+	req->submitted = 0;
+	req->completed = 0;
+	_STARPU_PTHREAD_MUTEX_INIT(&req->req_mutex, NULL);
+	_STARPU_PTHREAD_COND_INIT(&req->req_cond, NULL);
+
+	req->request_type = SEND_REQ;
+
+	req->data_handle = data_handle;
+	req->srcdst = dest;
+	req->mpi_tag = mpi_tag;
+	req->comm = comm;
+	req->func = starpu_mpi_isend_func;
+
+	req->detached = detached;
+	req->callback = callback;
+	req->callback_arg = arg;
+
+	/* Asynchronously request StarPU to fetch the data in main memory: when
+	 * it is available in main memory, submit_mpi_req(req) is called and
+	 * the request is actually submitted  */
+	starpu_data_acquire_cb(data_handle, STARPU_R, submit_mpi_req, (void *)req);
+
+        _STARPU_MPI_LOG_OUT();
+	return req;
+}
+
+int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, int mpi_tag, MPI_Comm comm)
+{
+        _STARPU_MPI_LOG_IN();
+	STARPU_ASSERT(public_req);
+
+	struct _starpu_mpi_req *req;
+	req = _starpu_mpi_isend_common(data_handle, dest, mpi_tag, comm, 0, NULL, NULL);
+
+	STARPU_ASSERT(req);
+	*public_req = req;
+
+        _STARPU_MPI_LOG_OUT();
+	return 0;
+}
+
+/*
+ *	Isend (detached)
+ */
+
+int starpu_mpi_isend_detached(starpu_data_handle_t data_handle,
+				int dest, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
+{
+        _STARPU_MPI_LOG_IN();
+	_starpu_mpi_isend_common(data_handle, dest, mpi_tag, comm, 1, callback, arg);
+
+        _STARPU_MPI_LOG_OUT();
+	return 0;
+}
+
+/*
+ *	Irecv
+ */
+
+static void starpu_mpi_irecv_func(struct _starpu_mpi_req *req)
+{
+	int count;
+
+        _STARPU_MPI_LOG_IN();
+
+	req->needs_unpacking = starpu_mpi_handle_to_datatype(req->data_handle, &req->datatype, &count);
+	if (req->needs_unpacking == 1)
+		req->ptr = malloc(count);
+	else
+		req->ptr = starpu_handle_get_local_ptr(req->data_handle);
+	STARPU_ASSERT(req->ptr);
+
+	_STARPU_MPI_DEBUG("post MPI irecv tag %d src %d data %p ptr %p req %p datatype %p\n", req->mpi_tag, req->srcdst, req->data_handle, req->ptr, &req->request, req->datatype);
+
+        req->ret = MPI_Irecv(req->ptr, count, req->datatype, req->srcdst, req->mpi_tag, req->comm, &req->request);
+        STARPU_ASSERT(req->ret == MPI_SUCCESS);
+
+	/* somebody is perhaps waiting for the MPI request to be posted */
+	_STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
+	req->submitted = 1;
+	_STARPU_PTHREAD_COND_BROADCAST(&req->req_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
+        _STARPU_MPI_LOG_OUT();
+}
+
+static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, unsigned detached, void (*callback)(void *), void *arg)
+{
+        _STARPU_MPI_LOG_IN();
+	struct _starpu_mpi_req *req = calloc(1, sizeof(struct _starpu_mpi_req));
+	STARPU_ASSERT(req);
+
+        INC_POSTED_REQUESTS(1);
+
+	/* Initialize the request structure */
+	req->submitted = 0;
+	_STARPU_PTHREAD_MUTEX_INIT(&req->req_mutex, NULL);
+	_STARPU_PTHREAD_COND_INIT(&req->req_cond, NULL);
+
+	req->request_type = RECV_REQ;
+
+	req->data_handle = data_handle;
+	req->srcdst = source;
+	req->mpi_tag = mpi_tag;
+	req->comm = comm;
+
+	req->detached = detached;
+	req->callback = callback;
+	req->callback_arg = arg;
+
+	req->func = starpu_mpi_irecv_func;
+
+	/* Asynchronously request StarPU to fetch the data in main memory: when
+	 * it is available in main memory, submit_mpi_req(req) is called and
+	 * the request is actually submitted  */
+	starpu_data_acquire_cb(data_handle, STARPU_W, submit_mpi_req, (void *)req);
+
+        _STARPU_MPI_LOG_OUT();
+	return req;
+}
+
+int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int source, int mpi_tag, MPI_Comm comm)
+{
+        _STARPU_MPI_LOG_IN();
+	STARPU_ASSERT(public_req);
+
+	struct _starpu_mpi_req *req;
+	req = _starpu_mpi_irecv_common(data_handle, source, mpi_tag, comm, 0, NULL, NULL);
+
+	STARPU_ASSERT(req);
+	*public_req = req;
+
+        _STARPU_MPI_LOG_OUT();
+	return 0;
+}
+
+/*
+ *	Irecv (detached)
+ */
+
+int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
+{
+        _STARPU_MPI_LOG_IN();
+	_starpu_mpi_irecv_common(data_handle, source, mpi_tag, comm, 1, callback, arg);
+
+        _STARPU_MPI_LOG_OUT();
+	return 0;
+}
+
+
+/*
+ *	Recv
+ */
+
+int starpu_mpi_recv(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, MPI_Status *status)
+{
+	starpu_mpi_req req;
+
+        _STARPU_MPI_LOG_IN();
+	starpu_mpi_irecv(data_handle, &req, source, mpi_tag, comm);
+	starpu_mpi_wait(&req, status);
+
+        _STARPU_MPI_LOG_OUT();
+	return 0;
+}
+
+/*
+ *	Send
+ */
+
+int starpu_mpi_send(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm)
+{
+	starpu_mpi_req req;
+	MPI_Status status;
+
+        _STARPU_MPI_LOG_IN();
+	memset(&status, 0, sizeof(MPI_Status));
+
+	starpu_mpi_isend(data_handle, &req, dest, mpi_tag, comm);
+	starpu_mpi_wait(&req, &status);
+
+        _STARPU_MPI_LOG_OUT();
+	return 0;
+}
+
+/*
+ *	Wait
+ */
+
+static void starpu_mpi_wait_func(struct _starpu_mpi_req *waiting_req)
+{
+        _STARPU_MPI_LOG_IN();
+	/* Which MPI request are we waiting for? */
+	struct _starpu_mpi_req *req = waiting_req->other_request;
+
+	req->ret = MPI_Wait(&req->request, waiting_req->status);
+        STARPU_ASSERT(req->ret == MPI_SUCCESS);
+
+	handle_request_termination(req);
+        _STARPU_MPI_LOG_OUT();
+}
+
+int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
+{
+        _STARPU_MPI_LOG_IN();
+	int ret;
+	struct _starpu_mpi_req *waiting_req = calloc(1, sizeof(struct _starpu_mpi_req));
+	STARPU_ASSERT(waiting_req);
+	struct _starpu_mpi_req *req = *public_req;
+
+        INC_POSTED_REQUESTS(1);
+
+	/* We cannot try to complete a MPI request that was not actually posted
+	 * to MPI yet. */
+	_STARPU_PTHREAD_MUTEX_LOCK(&(req->req_mutex));
+	while (!(req->submitted))
+		_STARPU_PTHREAD_COND_WAIT(&(req->req_cond), &(req->req_mutex));
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&(req->req_mutex));
+
+	/* Initialize the request structure */
+	_STARPU_PTHREAD_MUTEX_INIT(&(waiting_req->req_mutex), NULL);
+	_STARPU_PTHREAD_COND_INIT(&(waiting_req->req_cond), NULL);
+	waiting_req->status = status;
+	waiting_req->other_request = req;
+	waiting_req->func = starpu_mpi_wait_func;
+	waiting_req->request_type = WAIT_REQ;
+
+	submit_mpi_req(waiting_req);
+
+	/* We wait for the MPI request to finish */
+	_STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
+	while (!req->completed)
+		_STARPU_PTHREAD_COND_WAIT(&req->req_cond, &req->req_mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
+
+	ret = req->ret;
+
+	/* The internal request structure was automatically allocated */
+	*public_req = NULL;
+	free(req);
+
+        //free(waiting_req);
+        _STARPU_MPI_LOG_OUT();
+	return ret;
+}
+
+/*
+ * 	Test
+ */
+
+static void starpu_mpi_test_func(struct _starpu_mpi_req *testing_req)
+{
+        _STARPU_MPI_LOG_IN();
+	/* Which MPI request are we testing? */
+	struct _starpu_mpi_req *req = testing_req->other_request;
+
+        _STARPU_MPI_DEBUG("Test request %p - mpitag %d - TYPE %s %d\n", &req->request, req->mpi_tag, (req->request_type == RECV_REQ)?"recv : source":"send : dest", req->srcdst);
+	req->ret = MPI_Test(&req->request, testing_req->flag, testing_req->status);
+        STARPU_ASSERT(req->ret == MPI_SUCCESS);
+
+	if (*testing_req->flag)
+	{
+		testing_req->ret = req->ret;
+		handle_request_termination(req);
+	}
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&testing_req->req_mutex);
+	testing_req->completed = 1;
+	_STARPU_PTHREAD_COND_SIGNAL(&testing_req->req_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&testing_req->req_mutex);
+        _STARPU_MPI_LOG_OUT();
+}
+
+int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
+{
+        _STARPU_MPI_LOG_IN();
+	int ret = 0;
+
+	STARPU_ASSERT(public_req);
+
+	struct _starpu_mpi_req *req = *public_req;
+
+	STARPU_ASSERT(!req->detached);
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
+	unsigned submitted = req->submitted;
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
+
+	if (submitted)
+	{
+		struct _starpu_mpi_req *testing_req = calloc(1, sizeof(struct _starpu_mpi_req));
+                STARPU_ASSERT(testing_req);
+                //		memset(testing_req, 0, sizeof(struct _starpu_mpi_req));
+
+		/* Initialize the request structure */
+		_STARPU_PTHREAD_MUTEX_INIT(&(testing_req->req_mutex), NULL);
+		_STARPU_PTHREAD_COND_INIT(&(testing_req->req_cond), NULL);
+		testing_req->flag = flag;
+		testing_req->status = status;
+		testing_req->other_request = req;
+		testing_req->func = starpu_mpi_test_func;
+		testing_req->completed = 0;
+                testing_req->request_type = TEST_REQ;
+
+                INC_POSTED_REQUESTS(1);
+                submit_mpi_req(testing_req);
+
+		/* We wait for the test request to finish */
+		_STARPU_PTHREAD_MUTEX_LOCK(&(testing_req->req_mutex));
+		while (!(testing_req->completed))
+                        _STARPU_PTHREAD_COND_WAIT(&(testing_req->req_cond), &(testing_req->req_mutex));
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&(testing_req->req_mutex));
+
+		ret = testing_req->ret;
+
+		if (*(testing_req->flag))
+		{
+			/* The request was completed so we free the internal
+			 * request structure which was automatically allocated
+			 * */
+			*public_req = NULL;
+			free(req);
+		}
+	}
+	else
+	{
+		*flag = 0;
+	}
+
+        _STARPU_MPI_LOG_OUT();
+	return ret;
+}
+
+/*
+ *	Barrier
+ */
+
+static void starpu_mpi_barrier_func(struct _starpu_mpi_req *barrier_req)
+{
+        _STARPU_MPI_LOG_IN();
+
+	barrier_req->ret = MPI_Barrier(barrier_req->comm);
+        STARPU_ASSERT(barrier_req->ret == MPI_SUCCESS);
+
+	handle_request_termination(barrier_req);
+        _STARPU_MPI_LOG_OUT();
+}
+
+int starpu_mpi_barrier(MPI_Comm comm)
+{
+        _STARPU_MPI_LOG_IN();
+	int ret;
+	struct _starpu_mpi_req *barrier_req = calloc(1, sizeof(struct _starpu_mpi_req));
+	STARPU_ASSERT(barrier_req);
+
+	/* First wait for *both* all tasks and MPI requests to finish, in case
+	 * some tasks generate MPI requests, MPI requests generate tasks, etc.
+	 */
+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	STARPU_ASSERT_MSG(!barrier_running, "Concurrent starpu_mpi_barrier is not implemented, even on different communicators");
+	barrier_running = 1;
+	do {
+		while (posted_requests)
+			/* Wait for all current MPI requests to finish */
+			_STARPU_PTHREAD_COND_WAIT(&cond_finished, &mutex);
+		/* No current request, clear flag */
+		newer_requests = 0;
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+		/* Now wait for all tasks */
+		starpu_task_wait_for_all();
+		_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+		/* Check newer_requests again, in case some MPI requests
+		 * triggered by tasks completed and triggered tasks between
+		 * wait_for_all finished and we take the lock */
+	} while (posted_requests || newer_requests);
+	barrier_running = 0;
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+
+	/* Initialize the request structure */
+	_STARPU_PTHREAD_MUTEX_INIT(&(barrier_req->req_mutex), NULL);
+	_STARPU_PTHREAD_COND_INIT(&(barrier_req->req_cond), NULL);
+	barrier_req->func = starpu_mpi_barrier_func;
+	barrier_req->request_type = BARRIER_REQ;
+	barrier_req->comm = comm;
+
+        INC_POSTED_REQUESTS(1);
+	submit_mpi_req(barrier_req);
+
+	/* We wait for the MPI request to finish */
+	_STARPU_PTHREAD_MUTEX_LOCK(&barrier_req->req_mutex);
+	while (!barrier_req->completed)
+		_STARPU_PTHREAD_COND_WAIT(&barrier_req->req_cond, &barrier_req->req_mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&barrier_req->req_mutex);
+
+	ret = barrier_req->ret;
+
+        //free(waiting_req);
+        _STARPU_MPI_LOG_OUT();
+	return ret;
+}
+
+/*
+ *	Requests
+ */
+
+#ifdef STARPU_MPI_VERBOSE
+static char *starpu_mpi_request_type(unsigned request_type)
+{
+        switch (request_type)
+                {
+                case SEND_REQ: return "send";
+                case RECV_REQ: return "recv";
+                case WAIT_REQ: return "wait";
+                case TEST_REQ: return "test";
+                case BARRIER_REQ: return "barrier";
+                default: return "unknown request type";
+                }
+}
+#endif
+
+static void handle_request_termination(struct _starpu_mpi_req *req)
+{
+        _STARPU_MPI_LOG_IN();
+
+	_STARPU_MPI_DEBUG("complete MPI (%s %d) data %p req %p - tag %d\n", starpu_mpi_request_type(req->request_type), req->srcdst, req->data_handle, &req->request, req->mpi_tag);
+        if (req->request_type != BARRIER_REQ) {
+		if (req->needs_unpacking)
+			starpu_handle_unpack_data(req->data_handle, req->ptr);
+		else
+			MPI_Type_free(&req->datatype);
+                starpu_data_release(req->data_handle);
+        }
+
+	if (req->request_type == RECV_REQ)
+	{
+		TRACE_MPI_IRECV_END(req->srcdst, req->mpi_tag);
+	}
+
+	/* Execute the specified callback, if any */
+	if (req->callback)
+		req->callback(req->callback_arg);
+
+	/* tell anyone potentially waiting on the request that it is
+	 * terminated now */
+	_STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
+	req->completed = 1;
+	_STARPU_PTHREAD_COND_BROADCAST(&req->req_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
+        _STARPU_MPI_LOG_OUT();
+}
+
+static void submit_mpi_req(void *arg)
+{
+        _STARPU_MPI_LOG_IN();
+	struct _starpu_mpi_req *req = arg;
+
+        INC_POSTED_REQUESTS(-1);
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	_starpu_mpi_req_list_push_front(new_requests, req);
+	newer_requests = 1;
+        _STARPU_MPI_DEBUG("Pushing new request type %d\n", req->request_type);
+	_STARPU_PTHREAD_COND_BROADCAST(&cond_progression);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+        _STARPU_MPI_LOG_OUT();
+}
+
+/*
+ *	Scheduler hook
+ */
+
+#ifdef USE_STARPU_ACTIVITY
+static unsigned progression_hook_func(void *arg __attribute__((unused)))
+{
+	unsigned may_block = 1;
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	if (!_starpu_mpi_req_list_empty(detached_requests))
+	{
+		_STARPU_PTHREAD_COND_SIGNAL(&cond_progression);
+		may_block = 0;
+	}
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+
+	return may_block;
+}
+#endif
+
+/*
+ *	Progression loop
+ */
+
+static void test_detached_requests(void)
+{
+        _STARPU_MPI_LOG_IN();
+	int flag;
+	MPI_Status status;
+	struct _starpu_mpi_req *req, *next_req;
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
+
+	for (req = _starpu_mpi_req_list_begin(detached_requests);
+		req != _starpu_mpi_req_list_end(detached_requests);
+		req = next_req)
+	{
+		next_req = _starpu_mpi_req_list_next(req);
+
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
+
+                //_STARPU_MPI_DEBUG("Test detached request %p - mpitag %d - TYPE %s %d\n", &req->request, req->mpi_tag, (req->request_type == RECV_REQ)?"recv : source":"send : dest", req->srcdst);
+		req->ret = MPI_Test(&req->request, &flag, &status);
+		STARPU_ASSERT(req->ret == MPI_SUCCESS);
+
+		if (flag)
+		{
+			handle_request_termination(req);
+		}
+
+		_STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
+
+		if (flag)
+			_starpu_mpi_req_list_erase(detached_requests, req);
+
+#ifdef STARPU_DEVEL
+#warning TODO fix memleak
+#endif
+		/* Detached requests are automatically allocated by the lib */
+		//if (req->detached)
+		//	free(req);
+	}
+
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
+        _STARPU_MPI_LOG_OUT();
+}
+
+static void handle_new_request(struct _starpu_mpi_req *req)
+{
+        _STARPU_MPI_LOG_IN();
+	STARPU_ASSERT(req);
+
+	/* submit the request to MPI */
+        _STARPU_MPI_DEBUG("Handling new request type %d\n", req->request_type);
+	req->func(req);
+
+	if (req->detached)
+	{
+		_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+		_starpu_mpi_req_list_push_front(detached_requests, req);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+
+		starpu_wake_all_blocked_workers();
+
+		/* put the submitted request into the list of pending requests
+		 * so that it can be handled by the progression mechanisms */
+		_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+		_STARPU_PTHREAD_COND_SIGNAL(&cond_progression);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+	}
+        _STARPU_MPI_LOG_OUT();
+}
+
+static void *progress_thread_func(void *arg)
+{
+        int initialize_mpi = *((int *) arg);
+
+        _STARPU_DEBUG("Initialize mpi: %d\n", initialize_mpi);
+
+        if (initialize_mpi) {
+#ifdef STARPU_DEVEL
+#warning get real argc and argv from the application
+#endif
+                int argc = 0;
+                char **argv = NULL;
+                int thread_support;
+                _STARPU_DEBUG("Calling MPI_Init_thread\n");
+                if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS) {
+                        fprintf(stderr,"MPI_Init_thread failed\n");
+                        exit(1);
+                }
+                if (thread_support == MPI_THREAD_FUNNELED)
+                        fprintf(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n");
+                if (thread_support < MPI_THREAD_FUNNELED)
+                        fprintf(stderr,"Warning: MPI does not have thread support!\n");
+        }
+
+	/* notify the main thread that the progression thread is ready */
+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	running = 1;
+	_STARPU_PTHREAD_COND_SIGNAL(&cond_progression);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	while (running || posted_requests || !(_starpu_mpi_req_list_empty(new_requests)) || !(_starpu_mpi_req_list_empty(detached_requests))) {
+		/* shall we block? */
+		unsigned block = _starpu_mpi_req_list_empty(new_requests);
+
+#ifndef USE_STARPU_ACTIVITY
+		block = block && _starpu_mpi_req_list_empty(detached_requests);
+#endif
+
+		if (block)
+		{
+                        _STARPU_MPI_DEBUG("NO MORE REQUESTS TO HANDLE\n");
+			if (barrier_running)
+				/* Tell mpi_barrier */
+				_STARPU_PTHREAD_COND_SIGNAL(&cond_finished);
+			_STARPU_PTHREAD_COND_WAIT(&cond_progression, &mutex);
+		}
+
+		/* test whether some "detached requests" have terminated */
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+		test_detached_requests();
+		_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+
+		/* get one request */
+		struct _starpu_mpi_req *req;
+		while (!_starpu_mpi_req_list_empty(new_requests))
+		{
+			req = _starpu_mpi_req_list_pop_back(new_requests);
+
+			/* handling a request is likely to block for a while
+			 * (on a sync_data_with_mem call), we want to let the
+			 * application submit requests in the meantime, so we
+			 * release the lock.  */
+			_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+			handle_new_request(req);
+			_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+		}
+	}
+
+	STARPU_ASSERT(_starpu_mpi_req_list_empty(detached_requests));
+	STARPU_ASSERT(_starpu_mpi_req_list_empty(new_requests));
+        STARPU_ASSERT(posted_requests == 0);
+
+        if (initialize_mpi) {
+                _STARPU_MPI_DEBUG("Calling MPI_Finalize()\n");
+                MPI_Finalize();
+        }
+
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+
+	return NULL;
+}
+
+/*
+ *	(De)Initialization methods
+ */
+
+#ifdef USE_STARPU_ACTIVITY
+static int hookid = - 1;
+#endif
+
+static void _starpu_mpi_add_sync_point_in_fxt(void)
+{
+#ifdef STARPU_USE_FXT
+	int rank;
+	int worldsize;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &worldsize);
+
+	int barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
+	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
+
+	/* We generate a "unique" key so that we can make sure that different
+	 * FxT traces come from the same MPI run. */
+	int random_number;
+
+	/* XXX perhaps we don't want to generate a new seed if the application
+	 * specified some reproducible behaviour? */
+	if (rank == 0)
+	{
+		srand(time(NULL));
+		random_number = rand();
+	}
+
+	MPI_Bcast(&random_number, 1, MPI_INT, 0, MPI_COMM_WORLD);
+
+	TRACE_MPI_BARRIER(rank, worldsize, random_number);
+
+        _STARPU_MPI_DEBUG("unique key %x\n", random_number);
+#endif
+}
+
+static
+int _starpu_mpi_initialize(int initialize_mpi, int *rank, int *world_size)
+{
+#ifndef STARPU_MPI_CACHE
+	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --disable-mpi-cache\n");
+#endif
+
+	_STARPU_PTHREAD_MUTEX_INIT(&mutex, NULL);
+	_STARPU_PTHREAD_COND_INIT(&cond_progression, NULL);
+	_STARPU_PTHREAD_COND_INIT(&cond_finished, NULL);
+	new_requests = _starpu_mpi_req_list_new();
+
+	_STARPU_PTHREAD_MUTEX_INIT(&detached_requests_mutex, NULL);
+	detached_requests = _starpu_mpi_req_list_new();
+
+        _STARPU_PTHREAD_MUTEX_INIT(&mutex_posted_requests, NULL);
+
+	_STARPU_PTHREAD_CREATE(&progress_thread, NULL,
+			       progress_thread_func, (void *)&initialize_mpi);
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	while (!running)
+		_STARPU_PTHREAD_COND_WAIT(&cond_progression, &mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+
+        if (rank && world_size) {
+                _STARPU_DEBUG("Calling MPI_Comm_rank\n");
+                MPI_Comm_rank(MPI_COMM_WORLD, rank);
+                MPI_Comm_size(MPI_COMM_WORLD, world_size);
+        }
+
+#ifdef STARPU_USE_FXT
+	int prank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &prank);
+	starpu_set_profiling_id(prank);
+#endif //STARPU_USE_FXT
+
+#ifdef USE_STARPU_ACTIVITY
+	hookid = starpu_progression_hook_register(progression_hook_func, NULL);
+	STARPU_ASSERT(hookid >= 0);
+#endif
+
+	_starpu_mpi_add_sync_point_in_fxt();
+	_starpu_mpi_comm_amounts_init(MPI_COMM_WORLD);
+	return 0;
+}
+
+int starpu_mpi_initialize(void)
+{
+        return _starpu_mpi_initialize(0, NULL, NULL);
+}
+
+int starpu_mpi_initialize_extended(int *rank, int *world_size)
+{
+        return _starpu_mpi_initialize(1, rank, world_size);
+}
+
+int starpu_mpi_shutdown(void)
+{
+	void *value;
+	int rank;
+
+	/* We need to get the rank before MPI_Finalize is called, to pass it to _starpu_mpi_comm_amounts_display() */
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+	/* kill the progression thread */
+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	running = 0;
+	_STARPU_PTHREAD_COND_BROADCAST(&cond_progression);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+
+	pthread_join(progress_thread, &value);
+
+#ifdef USE_STARPU_ACTIVITY
+	starpu_progression_hook_deregister(hookid);
+#endif
+
+	/* free the request queues */
+	_starpu_mpi_req_list_delete(detached_requests);
+	_starpu_mpi_req_list_delete(new_requests);
+
+	_starpu_mpi_comm_amounts_display(rank);
+	_starpu_mpi_comm_amounts_free();
+
+	return 0;
+}
+

+ 78 - 0
mpi/src/starpu_mpi_collective.c

@@ -0,0 +1,78 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <mpi.h>
+#include <starpu.h>
+#include <starpu_mpi.h>
+
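+/* For each non-NULL handle, the root exchanges the data with its owner node
+ * (as set with starpu_data_set_rank()) through detached, i.e. non-blocking,
+ * transfers, using the tag set with starpu_data_set_tag(). */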
+int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm)
+{
+	int rank;
+	int x;
+
+	MPI_Comm_rank(comm, &rank);
+
+	for(x = 0; x < count ;  x++)
+	{
+		if (data_handles[x])
+		{
+			int owner = starpu_data_get_rank(data_handles[x]);
+			int mpi_tag = starpu_data_get_tag(data_handles[x]);
+			STARPU_ASSERT(mpi_tag >= 0);
+			if ((rank == root) && (owner != root))
+			{
+				//fprintf(stderr, "[%d] Sending data[%d] to %d\n", rank, x, owner);
+				starpu_mpi_isend_detached(data_handles[x], owner, mpi_tag, comm, NULL, NULL);
+			}
+			if ((rank != root) && (owner == rank))
+			{
+				//fprintf(stderr, "[%d] Receiving data[%d] from %d\n", rank, x, root);
+				starpu_mpi_irecv_detached(data_handles[x], root, mpi_tag, comm, NULL, NULL);
+			}
+		}
+	}
+	return 0;
+}
+
+int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm)
+{
+	int rank;
+	int x;
+
+	MPI_Comm_rank(comm, &rank);
+
+	for(x = 0; x < count ;  x++)
+	{
+		if (data_handles[x])
+		{
+			int owner = starpu_data_get_rank(data_handles[x]);
+			int mpi_tag = starpu_data_get_tag(data_handles[x]);
+			STARPU_ASSERT(mpi_tag >= 0);
+			if ((rank == root) && (owner != root))
+			{
+				//fprintf(stderr, "[%d] Receiving data[%d] from %d\n", rank, x, owner);
+				starpu_mpi_irecv_detached(data_handles[x], owner, mpi_tag, comm, NULL, NULL);
+			}
+			if ((rank != root) && (owner == rank))
+			{
+				//fprintf(stderr, "[%d] Sending data[%d] to %d\n", rank, x, root);
+				starpu_mpi_isend_detached(data_handles[x], root, mpi_tag, comm, NULL, NULL);
+			}
+		}
+	}
+	return 0;
+}
+

+ 149 - 0
mpi/src/starpu_mpi_datatype.c

@@ -0,0 +1,149 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi_datatype.h>
+
+typedef int (*handle_to_datatype_func)(starpu_data_handle_t, MPI_Datatype *);
+
+/*
+ * 	Matrix
+ */
+
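+/* e.g. a 2x3 float matrix (nx = 2, ny = 3) stored with ld = 4 becomes an
+ * MPI vector of ny = 3 blocks of nx*elemsize = 8 contiguous bytes, with
+ * consecutive blocks starting ld*elemsize = 16 bytes apart */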
+static int handle_to_datatype_matrix(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+{
+	int ret;
+
+	unsigned nx = starpu_matrix_get_nx(data_handle);
+	unsigned ny = starpu_matrix_get_ny(data_handle);
+	unsigned ld = starpu_matrix_get_local_ld(data_handle);
+	size_t elemsize = starpu_matrix_get_elemsize(data_handle);
+
+	ret = MPI_Type_vector(ny, nx*elemsize, ld*elemsize, MPI_BYTE, datatype);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
+
+	ret = MPI_Type_commit(datatype);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
+
+	return 0;
+}
+
+/*
+ * 	Block
+ */
+
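+/* The type is built in two steps: an inner vector describes one z-slice
+ * (ny rows of nx elements, consecutive rows ldy elements apart), and an
+ * outer hvector stacks nz such slices ldz*elemsize bytes apart */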
+static int handle_to_datatype_block(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+{
+	int ret;
+
+	unsigned nx = starpu_block_get_nx(data_handle);
+	unsigned ny = starpu_block_get_ny(data_handle);
+	unsigned nz = starpu_block_get_nz(data_handle);
+	unsigned ldy = starpu_block_get_local_ldy(data_handle);
+	unsigned ldz = starpu_block_get_local_ldz(data_handle);
+	size_t elemsize = starpu_block_get_elemsize(data_handle);
+
+	MPI_Datatype datatype_2dlayer;
+	ret = MPI_Type_vector(ny, nx*elemsize, ldy*elemsize, MPI_BYTE, &datatype_2dlayer);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
+
+	ret = MPI_Type_commit(&datatype_2dlayer);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
+
+	ret = MPI_Type_hvector(nz, 1, ldz*elemsize, datatype_2dlayer, datatype);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
+
+	ret = MPI_Type_commit(datatype);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
+
+	/* The intermediate type is no longer needed once the composite type is committed */
+	ret = MPI_Type_free(&datatype_2dlayer);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
+
+	return 0;
+}
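The block case nests two types: the inner vector describes one 2-D layer, and the hvector repeats that layer nz times at a byte stride of ldz*elemsize. For a (purely illustrative) 4x4x4 block of floats inside a 16x16x16 allocation:

	MPI_Datatype layer, block;
	MPI_Type_vector(4, 4 * sizeof(float), 16 * sizeof(float), MPI_BYTE, &layer);
	MPI_Type_commit(&layer);
	MPI_Type_hvector(4, 1, 16 * 16 * sizeof(float), layer, &block);
	MPI_Type_commit(&block);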
+
+/*
+ * 	Vector
+ */
+
+static int handle_to_datatype_vector(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+{
+	int ret;
+
+	unsigned nx = starpu_vector_get_nx(data_handle);
+	size_t elemsize = starpu_vector_get_elemsize(data_handle);
+
+	ret = MPI_Type_contiguous(nx*elemsize, MPI_BYTE, datatype);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
+
+	ret = MPI_Type_commit(datatype);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
+
+	return 0;
+}
+
+/*
+ * 	Variable
+ */
+
+static int handle_to_datatype_variable(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+{
+	int ret;
+
+	size_t elemsize = starpu_variable_get_elemsize(data_handle);
+
+	ret = MPI_Type_contiguous(elemsize, MPI_BYTE, datatype);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
+
+	ret = MPI_Type_commit(datatype);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
+
+	return 0;
+}
+
+/*
+ *	Generic
+ */
+
+static handle_to_datatype_func handle_to_datatype_funcs[STARPU_MAX_INTERFACE_ID] =
+{
+	[STARPU_MATRIX_INTERFACE_ID]	= handle_to_datatype_matrix,
+	[STARPU_BLOCK_INTERFACE_ID]	= handle_to_datatype_block,
+	[STARPU_VECTOR_INTERFACE_ID]	= handle_to_datatype_vector,
+	[STARPU_CSR_INTERFACE_ID]	= NULL,
+	[STARPU_BCSR_INTERFACE_ID]	= NULL,
+	[STARPU_VARIABLE_INTERFACE_ID]	= handle_to_datatype_variable,
+	[STARPU_VOID_INTERFACE_ID]      = NULL,
+	[STARPU_MULTIFORMAT_INTERFACE_ID] = NULL,
+};
+
+int starpu_mpi_handle_to_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype, int *count)
+{
+	enum starpu_data_interface_id id = starpu_handle_get_interface_id(data_handle);
+
+	if (id <= STARPU_MULTIFORMAT_INTERFACE_ID)
+	{
+		handle_to_datatype_func func = handle_to_datatype_funcs[id];
+		STARPU_ASSERT(func);
+		func(data_handle, datatype);
+		*count = 1;
+		return 0;
+	}
+	else
+	{
+		/* The datatype is not predefined by StarPU */
+		*count = starpu_handle_get_size(data_handle);
+		*datatype = MPI_BYTE;
+		return 1;
+	}
+}
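For reference, a sketch of how this dispatcher is consumed (hypothetical excerpt; `ptr`, `dst`, `mpi_tag`, `comm` and `request` are stand-ins, with `ptr` denoting the handle's local buffer): a nonzero return flags the MPI_BYTE fallback, in which case there is no committed type to free.

	MPI_Datatype datatype;
	int count;
	int is_raw = starpu_mpi_handle_to_datatype(data_handle, &datatype, &count);
	MPI_Isend(ptr, count, datatype, dst, mpi_tag, comm, &request);
	if (!is_raw)
		MPI_Type_free(&datatype);	/* StarPU-built types were committed above */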

+ 33 - 0
mpi/src/starpu_mpi_datatype.h

@@ -0,0 +1,33 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_DATATYPE_H__
+#define __STARPU_MPI_DATATYPE_H__
+
+#include <starpu_mpi.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int starpu_mpi_handle_to_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype, int *count);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __STARPU_MPI_DATATYPE_H__

+ 45 - 0
mpi/src/starpu_mpi_fxt.h

@@ -0,0 +1,45 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_FXT_H__
+#define __STARPU_MPI_FXT_H__
+
+#include <starpu.h>
+#include <common/config.h>
+#include <common/fxt.h>
+
+#define FUT_MPI_BARRIER		0x5201
+#define FUT_MPI_ISEND		0x5202
+#define FUT_MPI_IRECV_END	0x5203
+
+#ifdef STARPU_USE_FXT
+#define TRACE_MPI_BARRIER(rank, worldsize, key)	\
+	FUT_DO_PROBE4(FUT_MPI_BARRIER, (rank), (worldsize), (key), _starpu_gettid())
+#define TRACE_MPI_ISEND(dest, mpi_tag, size)	\
+	FUT_DO_PROBE4(FUT_MPI_ISEND, (dest), (mpi_tag), (size), _starpu_gettid())
+#define TRACE_MPI_IRECV_END(src, mpi_tag)	\
+	FUT_DO_PROBE3(FUT_MPI_IRECV_END, (src), (mpi_tag), _starpu_gettid())
+#else
+#define TRACE_MPI_BARRIER(a, b, c)	do {} while(0)
+#define TRACE_MPI_ISEND(a, b, c)	do {} while(0)
+#define TRACE_MPI_IRECV_END(a, b)	do {} while(0)
+#endif
+
+
+
+#endif // __STARPU_MPI_FXT_H__

+ 104 - 0
mpi/src/starpu_mpi_helper.c

@@ -0,0 +1,104 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+
+static void starpu_mpi_unlock_tag_callback(void *arg)
+{
+	starpu_tag_t *tagptr = arg;
+
+	starpu_tag_notify_from_apps(*tagptr);
+
+	free(tagptr);
+}
+
+int starpu_mpi_isend_detached_unlock_tag(starpu_data_handle_t data_handle,
+				int dest, int mpi_tag, MPI_Comm comm, starpu_tag_t tag)
+{
+	starpu_tag_t *tagptr = malloc(sizeof(starpu_tag_t));
+	*tagptr = tag;
+	
+	return starpu_mpi_isend_detached(data_handle, dest, mpi_tag, comm,
+						starpu_mpi_unlock_tag_callback, tagptr);
+}
+
+
+int starpu_mpi_irecv_detached_unlock_tag(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, starpu_tag_t tag)
+{
+	starpu_tag_t *tagptr = malloc(sizeof(starpu_tag_t));
+	*tagptr = tag;
+	
+	return starpu_mpi_irecv_detached(data_handle, source, mpi_tag, comm,
+						starpu_mpi_unlock_tag_callback, tagptr);
+}
+
+struct arg_array {
+	int array_size;
+	starpu_tag_t tag;
+};
+
+static void starpu_mpi_array_unlock_callback(void *_arg)
+{
+	struct arg_array *arg = _arg;
+
+	int remaining = STARPU_ATOMIC_ADD(&arg->array_size, -1);
+
+	if (remaining == 0)
+	{
+		starpu_tag_notify_from_apps(arg->tag);
+		free(arg);
+	}
+}
+
+int starpu_mpi_isend_array_detached_unlock_tag(unsigned array_size,
+		starpu_data_handle_t *data_handle, int *dest, int *mpi_tag,
+		MPI_Comm *comm, starpu_tag_t tag)
+{
+	if (array_size == 0)
+	{
+		/* Nothing to post: unlock the tag right away */
+		starpu_tag_notify_from_apps(tag);
+		return 0;
+	}
+
+	struct arg_array *arg = malloc(sizeof(struct arg_array));
+
+	arg->array_size = array_size;
+	arg->tag = tag;
+
+	unsigned elem;
+	for (elem = 0; elem < array_size; elem++)
+	{
+		starpu_mpi_isend_detached(data_handle[elem], dest[elem],
+				mpi_tag[elem], comm[elem],
+				starpu_mpi_array_unlock_callback, arg);
+	}
+
+	return 0;
+}
+
+
+int starpu_mpi_irecv_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *source, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag)
+{
+	if (array_size == 0)
+	{
+		/* Nothing to post: unlock the tag right away */
+		starpu_tag_notify_from_apps(tag);
+		return 0;
+	}
+
+	struct arg_array *arg = malloc(sizeof(struct arg_array));
+
+	arg->array_size = array_size;
+	arg->tag = tag;
+
+	unsigned elem;
+	for (elem = 0; elem < array_size; elem++)
+	{
+		starpu_mpi_irecv_detached(data_handle[elem], source[elem],
+				mpi_tag[elem], comm[elem],
+				starpu_mpi_array_unlock_callback, arg);
+	}
+
+	return 0;
+}
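A sketch of the intended pattern (hypothetical tag and peer values): the detached receive completes asynchronously and then unlocks a StarPU tag, which the application can wait on or hang task dependencies off.

	starpu_tag_t tag = 0x42;	/* hypothetical application tag */
	starpu_mpi_irecv_detached_unlock_tag(handle, 0 /* source */, 7 /* mpi_tag */,
					     MPI_COMM_WORLD, tag);
	/* Either block until the data has landed ... */
	starpu_tag_wait(tag);
	/* ... or make other tags depend on it: starpu_tag_declare_deps(dep, 1, tag); */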

+ 632 - 0
mpi/src/starpu_mpi_insert_task.c

@@ -0,0 +1,632 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011-2012  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdarg.h>
+#include <mpi.h>
+
+#include <starpu.h>
+#include <starpu_data.h>
+#include <common/utils.h>
+#include <common/uthash.h>
+#include <util/starpu_insert_task_utils.h>
+#include <datawizard/coherency.h>
+
+//#define STARPU_MPI_VERBOSE 1
+#include <starpu_mpi_private.h>
+
+#ifdef STARPU_MPI_CACHE
+/* Whether we are allowed to keep copies of remote data. */
+struct _starpu_data_entry
+{
+	UT_hash_handle hh;
+	void *data;
+};
+
+static struct _starpu_data_entry **sent_data = NULL;
+static struct _starpu_data_entry **received_data = NULL;
+#endif /* STARPU_MPI_CACHE */
+
+static void _starpu_mpi_tables_init()
+{
+#ifdef STARPU_MPI_CACHE
+	if (sent_data == NULL) {
+		int nb_nodes;
+		int i;
+
+		MPI_Comm_size(MPI_COMM_WORLD, &nb_nodes);
+		_STARPU_MPI_DEBUG("Initialising htable for cache\n");
+		sent_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
+		for(i=0 ; i<nb_nodes ; i++) sent_data[i] = NULL;
+		received_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
+		for(i=0 ; i<nb_nodes ; i++) received_data[i] = NULL;
+	}
+#endif /* STARPU_MPI_CACHE */
+}
+
+static
+int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int *do_execute, int *inconsistent_execute, int *dest, size_t *size_on_nodes)
+{
+	if (data && mode & STARPU_R) {
+		struct starpu_data_interface_ops *ops;
+		int rank = starpu_data_get_rank(data);
+
+		ops = data->ops;
+		size_on_nodes[rank] += ops->get_size(data);
+	}
+
+	if (mode & STARPU_W) {
+		if (!data) {
+			/* We don't have anything allocated for this.
+			 * The application knows we won't do anything
+			 * about this task */
+			/* Yes, the app could actually not call
+			 * insert_task at all itself, this is just a
+			 * safeguard. */
+			_STARPU_MPI_DEBUG("oh oh\n");
+			_STARPU_MPI_LOG_OUT();
+			return -EINVAL;
+		}
+		int mpi_rank = starpu_data_get_rank(data);
+		if (mpi_rank == me) {
+			if (*do_execute == 0) {
+				*inconsistent_execute = 1;
+			}
+			else {
+				*do_execute = 1;
+			}
+		}
+		else if (mpi_rank != -1) {
+			if (*do_execute == 1) {
+				*inconsistent_execute = 1;
+			}
+			else {
+				*do_execute = 0;
+				*dest = mpi_rank;
+				/* That is the rank to which the data will need to be sent */
+			}
+		}
+		else {
+			_STARPU_ERROR("rank invalid\n");
+		}
+	}
+	return 0;
+}
+
+static
+void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int dest, int do_execute, MPI_Comm comm)
+{
+	if (data && mode & STARPU_R) {
+		int mpi_rank = starpu_data_get_rank(data);
+		int mpi_tag = starpu_data_get_tag(data);
+		if(mpi_rank == -1) {
+			fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank\n");
+			STARPU_ABORT();
+		}
+		if(mpi_tag == -1) {
+			fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_data_set_tag\n");
+			STARPU_ABORT();
+		}
+		/* The task needs to read this data */
+		if (do_execute && mpi_rank != me && mpi_rank != -1) {
+			/* I will have to execute but I don't have the data, receive */
+#ifdef STARPU_MPI_CACHE
+			struct _starpu_data_entry *already_received;
+			HASH_FIND_PTR(received_data[mpi_rank], &data, already_received);
+			if (already_received == NULL) {
+				struct _starpu_data_entry *entry = (struct _starpu_data_entry *)malloc(sizeof(*entry));
+				entry->data = data;
+				HASH_ADD_PTR(received_data[mpi_rank], data, entry);
+			}
+			else {
+				_STARPU_MPI_DEBUG("Do not receive data %p from node %d as it is already available\n", data, mpi_rank);
+			}
+			if (!already_received)
+#endif
+			{
+				_STARPU_MPI_DEBUG("Receive data %p from %d\n", data, mpi_rank);
+				starpu_mpi_irecv_detached(data, mpi_rank, mpi_tag, comm, NULL, NULL);
+			}
+		}
+		if (!do_execute && mpi_rank == me) {
+			/* Somebody else will execute it, and I have the data, send it. */
+#ifdef STARPU_MPI_CACHE
+			struct _starpu_data_entry *already_sent;
+			HASH_FIND_PTR(sent_data[dest], &data, already_sent);
+			if (already_sent == NULL) {
+				struct _starpu_data_entry *entry = (struct _starpu_data_entry *)malloc(sizeof(*entry));
+				entry->data = data;
+				HASH_ADD_PTR(sent_data[dest], data, entry);
+				_STARPU_MPI_DEBUG("Noting that data %p has already been sent to %d\n", data, dest);
+			}
+			else {
+				_STARPU_MPI_DEBUG("Do not send data %p to node %d as it has already been sent\n", data, dest);
+			}
+			if (!already_sent)
+#endif
+			{
+				_STARPU_MPI_DEBUG("Send data %p to %d\n", data, dest);
+				starpu_mpi_isend_detached(data, dest, mpi_tag, comm, NULL, NULL);
+			}
+		}
+	}
+}
+
+static
+void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int xrank, int dest, int do_execute, MPI_Comm comm)
+{
+	if (mode & STARPU_W) {
+		int mpi_rank = starpu_data_get_rank(data);
+		int mpi_tag = starpu_data_get_tag(data);
+		if(mpi_rank == -1) {
+			fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank\n");
+			STARPU_ABORT();
+		}
+		if(mpi_tag == -1) {
+			fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_data_set_tag\n");
+			STARPU_ABORT();
+		}
+		if (mpi_rank == me) {
+			if (xrank != -1 && me != xrank) {
+				_STARPU_MPI_DEBUG("Receive data %p back from the task %d which executed the codelet ...\n", data, dest);
+				starpu_mpi_irecv_detached(data, dest, mpi_tag, comm, NULL, NULL);
+			}
+		}
+		else if (do_execute) {
+			_STARPU_MPI_DEBUG("Send data %p back to its owner %d...\n", data, mpi_rank);
+			starpu_mpi_isend_detached(data, mpi_rank, mpi_tag, comm, NULL, NULL);
+		}
+	}
+}
+
+void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int do_execute, MPI_Comm comm)
+{
+#ifdef STARPU_MPI_CACHE
+	if (mode & STARPU_W) {
+		if (do_execute) {
+			/* Note that all copies I've sent to neighbours are now invalid */
+			int n, size;
+			MPI_Comm_size(comm, &size);
+			for(n=0 ; n<size ; n++) {
+				struct _starpu_data_entry *already_sent;
+				HASH_FIND_PTR(sent_data[n], &data, already_sent);
+				if (already_sent) {
+					_STARPU_MPI_DEBUG("Clearing send cache for data %p\n", data);
+					HASH_DEL(sent_data[n], already_sent);
+					free(already_sent);
+				}
+			}
+		}
+		else {
+			int mpi_rank = starpu_data_get_rank(data);
+			struct _starpu_data_entry *already_received;
+			HASH_FIND_PTR(received_data[mpi_rank], &data, already_received);
+			if (already_received) {
+				/* Somebody else will write to the data, so discard our cached copy if any */
+				/* TODO: starpu_mpi could just remember itself. */
+				_STARPU_MPI_DEBUG("Clearing receive cache for data %p\n", data);
+				HASH_DEL(received_data[mpi_rank], already_received);
+				free(already_received);
+				starpu_data_invalidate_submit(data);
+			}
+		}
+	}
+#else
+	/* We allocated a temporary buffer for the received data, now drop it */
+	if ((mode & STARPU_R) && do_execute) {
+		int mpi_rank = starpu_data_get_rank(data);
+		if (mpi_rank != me && mpi_rank != -1) {
+			starpu_data_invalidate_submit(data);
+		}
+	}
+#endif
+}
+
+int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
+{
+	int arg_type;
+	va_list varg_list;
+	int me, do_execute, xrank, nb_nodes;
+	size_t *size_on_nodes;
+	size_t arg_buffer_size = 0;
+	char *arg_buffer;
+	int dest=0, inconsistent_execute;
+	int current_data = 0;
+
+	_STARPU_MPI_LOG_IN();
+
+	MPI_Comm_rank(comm, &me);
+	MPI_Comm_size(comm, &nb_nodes);
+
+	size_on_nodes = (size_t *)calloc(1, nb_nodes * sizeof(size_t));
+
+	_starpu_mpi_tables_init();
+
+	/* Get the number of buffers and the size of the arguments */
+	va_start(varg_list, codelet);
+	arg_buffer_size = _starpu_insert_task_get_arg_size(varg_list);
+
+	va_start(varg_list, codelet);
+	_starpu_codelet_pack_args(arg_buffer_size, &arg_buffer, varg_list);
+
+	/* Find out whether we are to execute the task, because we own the data to be written to. */
+	inconsistent_execute = 0;
+	do_execute = -1;
+	xrank = -1;
+	va_start(varg_list, codelet);
+	while ((arg_type = va_arg(varg_list, int)) != 0) {
+		if (arg_type==STARPU_EXECUTE_ON_NODE) {
+			xrank = va_arg(varg_list, int);
+			_STARPU_MPI_DEBUG("Executing on node %d\n", xrank);
+			do_execute = 1;
+		}
+		else if (arg_type==STARPU_EXECUTE_ON_DATA) {
+			starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
+			xrank = starpu_data_get_rank(data);
+			_STARPU_MPI_DEBUG("Executing on data node %d\n", xrank);
+			STARPU_ASSERT(xrank >= 0 && xrank < nb_nodes);
+			do_execute = 1;
+		}
+		else if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type==STARPU_SCRATCH || arg_type==STARPU_REDUX) {
+			starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
+			enum starpu_access_mode mode = (enum starpu_access_mode) arg_type;
+			int ret = _starpu_mpi_find_executee_node(data, mode, me, &do_execute, &inconsistent_execute, &dest, size_on_nodes);
+			if (ret == -EINVAL)
+			{
+				free(size_on_nodes);
+				return ret;
+			}
+			current_data ++;
+		}
+		else if (arg_type == STARPU_DATA_ARRAY)
+		{
+			starpu_data_handle_t *datas = va_arg(varg_list, starpu_data_handle_t *);
+			int nb_handles = va_arg(varg_list, int);
+			int i;
+			for(i=0 ; i<nb_handles ; i++)
+			{
+				enum starpu_access_mode mode = codelet->modes[current_data];
+				int ret = _starpu_mpi_find_executee_node(datas[i], mode, me, &do_execute, &inconsistent_execute, &dest, size_on_nodes);
+				if (ret == -EINVAL)
+				{
+					free(size_on_nodes);
+					return ret;
+				}
+				current_data ++;
+			}
+		}
+		else if (arg_type==STARPU_VALUE) {
+			va_arg(varg_list, void *);
+			va_arg(varg_list, size_t);
+		}
+		else if (arg_type==STARPU_CALLBACK) {
+			va_arg(varg_list, void (*)(void *));
+		}
+		else if (arg_type==STARPU_CALLBACK_WITH_ARG) {
+			va_arg(varg_list, void (*)(void *));
+			va_arg(varg_list, void *);
+		}
+		else if (arg_type==STARPU_CALLBACK_ARG) {
+			va_arg(varg_list, void *);
+		}
+		else if (arg_type==STARPU_PRIORITY) {
+			va_arg(varg_list, int);
+		}
+		/* STARPU_EXECUTE_ON_NODE and STARPU_EXECUTE_ON_DATA are already
+		 * handled at the top of this chain */
+	}
+	va_end(varg_list);
+
+	if (do_execute == -1) {
+		int i;
+		size_t max_size = 0;
+		for(i=0 ; i<nb_nodes ; i++) {
+			if (size_on_nodes[i] > max_size)
+			{
+				max_size = size_on_nodes[i];
+				xrank = i;
+			}
+		}
+		if (xrank != -1) {
+			_STARPU_MPI_DEBUG("Node %d has the most R data\n", xrank);
+			do_execute = 1;
+		}
+	}
+	/* The per-node sizes are no longer needed, whichever path was taken */
+	free(size_on_nodes);
+
+	STARPU_ASSERT_MSG(do_execute != -1, "StarPU needs to see a W or a REDUX data which will tell it where to execute the task");
+
+	if (inconsistent_execute == 1) {
+		if (xrank == -1) {
+			_STARPU_MPI_DEBUG("Several nodes own W data; the application needs to specify the executing node, using STARPU_EXECUTE_ON_NODE or STARPU_EXECUTE_ON_DATA\n");
+			return -EINVAL;
+		}
+		else {
+			do_execute = (me == xrank);
+			dest = xrank;
+		}
+	}
+	else if (xrank != -1) {
+		do_execute = (me == xrank);
+		dest = xrank;
+	}
+
+	/* Send and receive data as requested */
+	va_start(varg_list, codelet);
+	current_data = 0;
+	while ((arg_type = va_arg(varg_list, int)) != 0) {
+		if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type==STARPU_SCRATCH || arg_type==STARPU_REDUX) {
+			starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
+			enum starpu_access_mode mode = (enum starpu_access_mode) arg_type;
+
+			_starpu_mpi_exchange_data_before_execution(data, mode, me, dest, do_execute, comm);
+			current_data ++;
+
+		}
+		else if (arg_type == STARPU_DATA_ARRAY)
+		{
+			starpu_data_handle_t *datas = va_arg(varg_list, starpu_data_handle_t *);
+			int nb_handles = va_arg(varg_list, int);
+			int i;
+
+			for(i=0 ; i<nb_handles ; i++)
+			{
+				_starpu_mpi_exchange_data_before_execution(datas[i], codelet->modes[current_data], me, dest, do_execute, comm);
+				current_data++;
+			}
+		}
+		else if (arg_type==STARPU_VALUE) {
+			va_arg(varg_list, void *);
+			va_arg(varg_list, size_t);
+		}
+		else if (arg_type==STARPU_CALLBACK) {
+			va_arg(varg_list, void (*)(void *));
+		}
+		else if (arg_type==STARPU_CALLBACK_WITH_ARG) {
+			va_arg(varg_list, void (*)(void *));
+			va_arg(varg_list, void *);
+		}
+		else if (arg_type==STARPU_CALLBACK_ARG) {
+			va_arg(varg_list, void *);
+		}
+		else if (arg_type==STARPU_PRIORITY) {
+			va_arg(varg_list, int);
+		}
+		else if (arg_type==STARPU_EXECUTE_ON_NODE) {
+			va_arg(varg_list, int);
+		}
+		else if (arg_type==STARPU_EXECUTE_ON_DATA) {
+			va_arg(varg_list, starpu_data_handle_t);
+		}
+	}
+	va_end(varg_list);
+
+	if (do_execute) {
+		_STARPU_MPI_DEBUG("Execution of the codelet %p (%s)\n", codelet, codelet->name);
+		va_start(varg_list, codelet);
+		struct starpu_task *task = starpu_task_create();
+		int ret = _starpu_insert_task_create_and_submit(arg_buffer, arg_buffer_size, codelet, &task, varg_list);
+		_STARPU_MPI_DEBUG("ret: %d\n", ret);
+		STARPU_ASSERT(ret==0);
+	}
+
+	if (inconsistent_execute) {
+		va_start(varg_list, codelet);
+		current_data = 0;
+		while ((arg_type = va_arg(varg_list, int)) != 0) {
+			if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type==STARPU_SCRATCH || arg_type==STARPU_REDUX) {
+				starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
+				enum starpu_access_mode mode = (enum starpu_access_mode) arg_type;
+
+				_starpu_mpi_exchange_data_after_execution(data, mode, me, xrank, dest, do_execute, comm);
+				current_data++;
+			}
+			else if (arg_type == STARPU_DATA_ARRAY)
+			{
+				starpu_data_handle_t *datas = va_arg(varg_list, starpu_data_handle_t *);
+				int nb_handles = va_arg(varg_list, int);
+				int i;
+
+				for(i=0 ; i<nb_handles ; i++)
+				{
+					_starpu_mpi_exchange_data_after_execution(datas[i], codelet->modes[current_data], me, xrank, dest, do_execute, comm);
+					current_data++;
+				}
+			}
+			else if (arg_type==STARPU_VALUE) {
+				va_arg(varg_list, void *);
+				va_arg(varg_list, size_t);
+			}
+			else if (arg_type==STARPU_CALLBACK) {
+				va_arg(varg_list, void (*)(void *));
+			}
+			else if (arg_type==STARPU_CALLBACK_WITH_ARG) {
+				va_arg(varg_list, void (*)(void *));
+				va_arg(varg_list, void *);
+			}
+			else if (arg_type==STARPU_CALLBACK_ARG) {
+				va_arg(varg_list, void *);
+			}
+			else if (arg_type==STARPU_PRIORITY) {
+				va_arg(varg_list, int);
+			}
+			else if (arg_type==STARPU_EXECUTE_ON_NODE) {
+				va_arg(varg_list, int);
+			}
+			else if (arg_type==STARPU_EXECUTE_ON_DATA) {
+				va_arg(varg_list, starpu_data_handle_t);
+			}
+		}
+		va_end(varg_list);
+	}
+
+	va_start(varg_list, codelet);
+	current_data = 0;
+	while ((arg_type = va_arg(varg_list, int)) != 0) {
+		if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type == STARPU_SCRATCH || arg_type == STARPU_REDUX) {
+			starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
+			enum starpu_access_mode mode = (enum starpu_access_mode) arg_type;
+
+			_starpu_mpi_clear_data_after_execution(data, mode, me, do_execute, comm);
+			current_data++;
+		}
+		else if (arg_type == STARPU_DATA_ARRAY)
+		{
+			starpu_data_handle_t *datas = va_arg(varg_list, starpu_data_handle_t *);
+			int nb_handles = va_arg(varg_list, int);
+			int i;
+
+			for(i=0 ; i<nb_handles ; i++)
+			{
+				_starpu_mpi_clear_data_after_execution(datas[i], codelet->modes[current_data], me, do_execute, comm);
+				current_data++;
+			}
+		}
+		else if (arg_type==STARPU_VALUE) {
+			va_arg(varg_list, void *);
+			va_arg(varg_list, size_t);
+		}
+		else if (arg_type==STARPU_CALLBACK) {
+			va_arg(varg_list, void (*)(void *));
+		}
+		else if (arg_type==STARPU_CALLBACK_WITH_ARG) {
+			va_arg(varg_list, void (*)(void *));
+			va_arg(varg_list, void *);
+		}
+		else if (arg_type==STARPU_CALLBACK_ARG) {
+			va_arg(varg_list, void *);
+		}
+		else if (arg_type==STARPU_PRIORITY) {
+			va_arg(varg_list, int);
+		}
+		else if (arg_type==STARPU_EXECUTE_ON_NODE) {
+			va_arg(varg_list, int);
+		}
+		else if (arg_type==STARPU_EXECUTE_ON_DATA) {
+			va_arg(varg_list, starpu_data_handle_t);
+		}
+	}
+
+	va_end(varg_list);
+	_STARPU_MPI_LOG_OUT();
+	return 0;
+}
+
+void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg)
+{
+	int me, rank, tag;
+
+	rank = starpu_data_get_rank(data_handle);
+	tag = starpu_data_get_tag(data_handle);
+	if(rank == -1) {
+		fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank\n");
+		STARPU_ABORT();
+	}
+	if(tag == -1) {
+		fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_data_set_tag\n");
+		STARPU_ABORT();
+	}
+	MPI_Comm_rank(comm, &me);
+
+	if (node == rank) return;
+
+	if (me == node)
+	{
+		starpu_mpi_irecv_detached(data_handle, rank, tag, comm, callback, arg);
+	}
+	else if (me == rank)
+	{
+		starpu_mpi_isend_detached(data_handle, node, tag, comm, NULL, NULL);
+	}
+}
+
+void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle, int node)
+{
+	int me, rank, tag;
+
+	rank = starpu_data_get_rank(data_handle);
+	tag = starpu_data_get_tag(data_handle);
+	if(rank == -1) {
+		fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank\n");
+		STARPU_ABORT();
+	}
+	if(tag == -1) {
+		fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_data_set_tag\n");
+		STARPU_ABORT();
+	}
+	MPI_Comm_rank(comm, &me);
+
+	if (node == rank) return;
+
+	if (me == node)
+	{
+		MPI_Status status;
+		starpu_mpi_recv(data_handle, rank, tag, comm, &status);
+	}
+	else if (me == rank)
+	{
+		starpu_mpi_send(data_handle, node, tag, comm);
+	}
+}
+
+void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
+{
+	int me, rank, tag, nb_nodes;
+
+	rank = starpu_data_get_rank(data_handle);
+	tag = starpu_data_get_tag(data_handle);
+	if(rank == -1) {
+		fprintf(stderr,"StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank\n");
+		STARPU_ABORT();
+	}
+	if(tag == -1) {
+		fprintf(stderr,"StarPU needs to be told the MPI tag of this data, using starpu_data_set_tag\n");
+		STARPU_ABORT();
+	}
+
+	MPI_Comm_rank(comm, &me);
+	MPI_Comm_size(comm, &nb_nodes);
+
+	_STARPU_MPI_DEBUG("Doing reduction for data %p on node %d with %d nodes ...\n", data_handle, rank, nb_nodes);
+
+	// need to count how many nodes have the data in redux mode
+	if (me == rank) {
+		int i;
+
+		for(i=0 ; i<nb_nodes ; i++) {
+			if (i != rank) {
+				starpu_data_handle_t new_handle;
+
+				starpu_data_register_same(&new_handle, data_handle);
+
+				_STARPU_MPI_DEBUG("Receiving redux handle from %d in %p ...\n", i, new_handle);
+
+				starpu_mpi_irecv_detached(new_handle, i, tag, comm, NULL, NULL);
+				starpu_insert_task(data_handle->redux_cl,
+						STARPU_RW, data_handle,
+						STARPU_R, new_handle,
+						0);
+			}
+		}
+	}
+	else {
+		_STARPU_MPI_DEBUG("Sending redux handle to %d ...\n", rank);
+		starpu_mpi_isend_detached(data_handle, rank, tag, comm, NULL, NULL);
+	}
+}
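A condensed usage sketch of the entry point implemented above (hypothetical handles `hA` and `hB`, and a codelet taking one STARPU_RW and one STARPU_R buffer, as in the tests below): every node calls it with identical arguments, and each node decides locally whether to execute the task, send its data, or receive the result.

	/* All nodes make the same call; transfers are inferred from the
	 * rank/tag previously attached to each handle. */
	starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
			       STARPU_RW, hA,
			       STARPU_R, hB,
			       0);
	starpu_task_wait_for_all();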

+ 99 - 0
mpi/src/starpu_mpi_private.h

@@ -0,0 +1,99 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_PRIVATE_H__
+#define __STARPU_MPI_PRIVATE_H__
+
+#include <starpu.h>
+#include <common/config.h>
+#include "starpu_mpi.h"
+#include "starpu_mpi_fxt.h"
+#include <common/list.h>
+#include <common/utils.h>
+#include <pthread.h>
+
+//#define STARPU_MPI_VERBOSE	1
+
+#ifdef STARPU_MPI_VERBOSE
+#  define _STARPU_MPI_DEBUG(fmt, args ...) do { if (!getenv("STARPU_SILENT")) { \
+    						int _debug_rank; MPI_Comm_rank(MPI_COMM_WORLD, &_debug_rank);       \
+                                                fprintf(stderr, "%*s[%d][starpu_mpi][%s] " fmt , (_debug_rank+1)*4, "", _debug_rank, __func__ ,##args); \
                                                fflush(stderr); }} while(0)
+#else
+#  define _STARPU_MPI_DEBUG(fmt, args ...)
+#endif
+
+#ifdef STARPU_MPI_VERBOSE0
+#  define _STARPU_MPI_LOG_IN()             do { if (!getenv("STARPU_SILENT")) { \
+                                               int _debug_rank; MPI_Comm_rank(MPI_COMM_WORLD, &_debug_rank);                        \
+                                               fprintf(stderr, "%*s[%d][starpu_mpi][%s] -->\n", (_debug_rank+1)*4, "", _debug_rank, __func__ ); \
+                                               fflush(stderr); }} while(0)
+#  define _STARPU_MPI_LOG_OUT()            do { if (!getenv("STARPU_SILENT")) { \
+                                               int _debug_rank; MPI_Comm_rank(MPI_COMM_WORLD, &_debug_rank);                        \
+                                               fprintf(stderr, "%*s[%d][starpu_mpi][%s] <--\n", (_debug_rank+1)*4, "", _debug_rank, __func__ ); \
+                                               fflush(stderr); }} while(0)
+#else
+#  define _STARPU_MPI_LOG_IN()
+#  define _STARPU_MPI_LOG_OUT()
+#endif
+
+#define SEND_REQ	0
+#define RECV_REQ	1
+#define WAIT_REQ        2
+#define TEST_REQ        3
+#define BARRIER_REQ     4
+
+LIST_TYPE(_starpu_mpi_req,
+	/* description of the data at StarPU level */
+	starpu_data_handle_t data_handle;
+
+	/* description of the data to be sent/received */
+	MPI_Datatype datatype;
+	void *ptr;
+	int needs_unpacking;
+
+	/* who are we talking to ? */
+	int srcdst;
+	int mpi_tag;
+	MPI_Comm comm;
+
+	void (*func)(struct _starpu_mpi_req *);
+
+	MPI_Status *status;
+	MPI_Request request;
+	int *flag;
+
+	int ret;
+	pthread_mutex_t req_mutex;
+	pthread_cond_t req_cond;
+
+	unsigned request_type; /* one of SEND_REQ, RECV_REQ, WAIT_REQ, TEST_REQ, BARRIER_REQ */
+
+	unsigned submitted;
+	unsigned completed;
+
+	/* In the case of a Wait/Test request, we are going to post a request
+	 * to test the completion of another request */
+	struct _starpu_mpi_req *other_request;
+
+	/* in the case of detached requests */
+	unsigned detached;
+	void *callback_arg;
+	void (*callback)(void *);
+);
+
+#endif // __STARPU_MPI_PRIVATE_H__

+ 88 - 0
mpi/src/starpu_mpi_stats.c

@@ -0,0 +1,88 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi_stats.h>
+#include <common/config.h>
+#include <stdio.h>
+//#define STARPU_MPI_VERBOSE	1
+#include <starpu_mpi_private.h>
+
+/* measure the amount of data transfers between each pair of MPI nodes */
+#ifdef STARPU_COMM_STATS
+static size_t *comm_amount;
+static int world_size;
+#endif /* STARPU_COMM_STATS */
+
+void _starpu_mpi_comm_amounts_init(MPI_Comm comm)
+{
+#ifdef STARPU_COMM_STATS
+	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --enable-comm-stats, which slows down a bit\n");
+
+	MPI_Comm_size(comm, &world_size);
+	_STARPU_MPI_DEBUG("allocating for %d nodes\n", world_size);
+
+	comm_amount = (size_t *) calloc(world_size, sizeof(size_t));
+#endif /* STARPU_COMM_STATS */
+}
+
+void _starpu_mpi_comm_amounts_free(void)
+{
+#ifdef STARPU_COMM_STATS
+	free(comm_amount);
+#endif /* STARPU_COMM_STATS */
+}
+
+void _starpu_mpi_comm_amounts_inc(MPI_Comm comm  __attribute__ ((unused)),
+				  unsigned dst  __attribute__ ((unused)),
+				  MPI_Datatype datatype  __attribute__ ((unused)),
+				  int count __attribute__ ((unused)))
+{
+#ifdef STARPU_COMM_STATS
+	int src, size;
+
+	MPI_Comm_rank(comm, &src);
+	MPI_Type_size(datatype, &size);
+
+	_STARPU_MPI_DEBUG("[%d] adding %d to %d\n", src, count*size, dst);
+
+	comm_amount[dst] += count*size;
+#endif /* STARPU_COMM_STATS */
+}
+
+void _starpu_mpi_comm_amounts_display(int node)
+{
+#ifdef STARPU_COMM_STATS
+	int dst;
+	size_t sum = 0;
+
+	for (dst = 0; dst < world_size; dst++)
+	{
+		sum += comm_amount[dst];
+	}
+
+	fprintf(stderr, "\n[%d] Communication transfers stats:\nTOTAL transfers %f B\t%f MB\n", node, (float)sum, (float)sum/1024/1024);
+
+	for (dst = 0; dst < world_size; dst++)
+	{
+		if (comm_amount[dst])
+		{
+			fprintf(stderr, "\t%d -> %d\t%f B\t%f MB\n",
+				node, dst, (float)comm_amount[dst], ((float)comm_amount[dst])/(1024*1024));
+		}
+	}
+#endif /* STARPU_COMM_STATS */
+}
+

+ 24 - 0
mpi/src/starpu_mpi_stats.h

@@ -0,0 +1,24 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_STATS_H__
+#define __STARPU_MPI_STATS_H__
+
+#include <stdlib.h>
+#include <mpi.h>
+
+void _starpu_mpi_comm_amounts_init(MPI_Comm comm);
+void _starpu_mpi_comm_amounts_free(void);
+void _starpu_mpi_comm_amounts_inc(MPI_Comm comm, unsigned dst, MPI_Datatype datatype, int count);
+void _starpu_mpi_comm_amounts_display(int node);
+
+#endif // __STARPU_MPI_STATS_H__

+ 29 - 0
mpi/starpumpi-1.0.pc.in

@@ -0,0 +1,29 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2011  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: starpumpi
+Description: offers MPI support for heterogeneous multicore architectures
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@
+Libs: -L${libdir} -lstarpumpi-@STARPU_EFFECTIVE_VERSION@
+Libs.private: @LDFLAGS@ @LIBS@
+Requires: starpu-1.0
+Requires.private:

+ 1 - 0
mpi/tests/.gitignore

@@ -0,0 +1 @@
+/.deps

+ 153 - 0
mpi/tests/Makefile.am

@@ -0,0 +1,153 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2012  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+CC=$(MPICC)
+CCLD=$(MPICC)
+
+if STARPU_MPI_CHECK
+if STARPU_HAVE_AM111
+LOG_COMPILER	 	=	$(MPIEXEC) -np 2
+else
+TESTS_ENVIRONMENT 	=	$(MPIEXEC) -np 2
+endif
+TESTS			=	$(check_PROGRAMS)
+endif
+
+check_PROGRAMS =
+
+BUILT_SOURCES =
+
+CLEANFILES = *.gcno *.gcda *.linkinfo
+
+examplebindir = $(libdir)/starpu/examples/mpi
+
+examplebin_PROGRAMS =
+
+if STARPU_USE_CUDA
+# TODO define NVCCFLAGS
+NVCC ?= nvcc
+
+NVCCFLAGS += -I$(top_srcdir)/include/ -I$(top_builddir)/include
+
+.cu.cubin:
+	$(MKDIR_P) `dirname $@`
+	$(NVCC) -cubin $< -o $@ --compiler-options -fno-strict-aliasing  $(NVCCFLAGS)
+
+.cu.o:
+	$(NVCC) $< -c -o $@ --compiler-options -fno-strict-aliasing  $(NVCCFLAGS) -I$(top_srcdir)/include/  -I$(top_builddir)/include/
+endif
+
+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS)
+LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include -I$(top_srcdir)/mpi/include -I$(top_srcdir)/src -I$(top_builddir)/src
+AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS)
+
+########################
+# Unit testcases       #
+########################
+
+check_PROGRAMS +=				\
+	pingpong				\
+	mpi_test				\
+	mpi_isend				\
+	mpi_irecv				\
+	mpi_isend_detached			\
+	mpi_irecv_detached			\
+	mpi_detached_tag			\
+	ring					\
+	ring_async				\
+	ring_async_implicit			\
+	block_interface				\
+	block_interface_pinned			\
+	insert_task				\
+	insert_task_cache			\
+	insert_task_block			\
+	insert_task_owner			\
+	insert_task_owner2			\
+	insert_task_owner_data			\
+	multiple_send
+
+noinst_PROGRAMS =				\
+	pingpong				\
+	mpi_test				\
+	mpi_isend				\
+	mpi_irecv				\
+	mpi_isend_detached			\
+	mpi_irecv_detached			\
+	mpi_detached_tag			\
+	ring					\
+	ring_async				\
+	ring_async_implicit			\
+	block_interface				\
+	block_interface_pinned			\
+	insert_task				\
+	insert_task_cache			\
+	insert_task_block			\
+	insert_task_owner			\
+	insert_task_owner2			\
+	insert_task_owner_data			\
+	multiple_send
+
+mpi_isend_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+mpi_irecv_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+mpi_isend_detached_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+mpi_irecv_detached_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+mpi_detached_tag_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+pingpong_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+mpi_test_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+ring_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+ring_async_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+ring_async_implicit_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+block_interface_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+block_interface_pinned_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+insert_task_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+insert_task_cache_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+insert_task_block_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+insert_task_owner_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+insert_task_owner2_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+insert_task_owner_data_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+multiple_send_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+
+ring_SOURCES = ring.c
+ring_async_SOURCES = ring_async.c
+ring_async_implicit_SOURCES = ring_async_implicit.c
+if STARPU_USE_CUDA
+ring_SOURCES += ring_kernel.cu
+ring_async_SOURCES += ring_kernel.cu
+ring_async_implicit_SOURCES += ring_kernel.cu
+endif
+
+showcheck:
+	-cat $(TEST_LOGS) /dev/null

+ 148 - 0
mpi/tests/block_interface.c

@@ -0,0 +1,148 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <stdlib.h>
+#include "helper.h"
+
+#define NITER	2048
+
+#define BIGSIZE	128
+#define SIZE	64
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(NULL, NULL);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size < 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need at least processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	/* We only use 2 nodes for that test */
+	if (rank >= 2)
+	{
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+
+	/* Node 0 will allocate a big block and only register an inner part of
+	 * it as the block data, Node 1 will allocate a block of small size and
+	 * register it directly. Node 0 and 1 will then exchange the content of
+	 * their blocks. */
+
+	float *block;
+	starpu_data_handle_t block_handle;
+
+	if (rank == 0)
+	{
+		block = calloc(BIGSIZE*BIGSIZE*BIGSIZE, sizeof(float));
+		assert(block);
+
+		/* fill the inner block */
+		unsigned i, j, k;
+		for (k = 0; k < SIZE; k++)
+		for (j = 0; j < SIZE; j++)
+		for (i = 0; i < SIZE; i++)
+		{
+			block[i + j*BIGSIZE + k*BIGSIZE*BIGSIZE] = 1.0f;
+		}
+
+		starpu_block_data_register(&block_handle, 0,
+			(uintptr_t)block, BIGSIZE, BIGSIZE*BIGSIZE,
+			SIZE, SIZE, SIZE, sizeof(float));
+	}
+	else /* rank == 1 */
+	{
+		block = calloc(SIZE*SIZE*SIZE, sizeof(float));
+		assert(block);
+
+		starpu_block_data_register(&block_handle, 0,
+			(uintptr_t)block, SIZE, SIZE*SIZE,
+			SIZE, SIZE, SIZE, sizeof(float));
+	}
+
+	if (rank == 0)
+	{
+		ret = starpu_mpi_send(block_handle, 1, 0x42, MPI_COMM_WORLD);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_send");
+
+		MPI_Status status;
+		ret = starpu_mpi_recv(block_handle, 1, 0x1337, MPI_COMM_WORLD, &status);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_recv");
+
+		/* check the content of the block */
+		ret = starpu_data_acquire(block_handle, STARPU_R);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+
+		unsigned i, j, k;
+		for (k = 0; k < SIZE; k++)
+		for (j = 0; j < SIZE; j++)
+		for (i = 0; i < SIZE; i++)
+		{
+			assert(block[i + j*BIGSIZE + k*BIGSIZE*BIGSIZE] == 33.0f);
+		}
+		starpu_data_release(block_handle);
+
+	}
+	else /* rank == 1 */
+	{
+		MPI_Status status;
+		ret = starpu_mpi_recv(block_handle, 0, 0x42, MPI_COMM_WORLD, &status);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_recv");
+
+		/* check the content of the block and modify it */
+		ret = starpu_data_acquire(block_handle, STARPU_RW);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+
+		unsigned i, j, k;
+		for (k = 0; k < SIZE; k++)
+		for (j = 0; j < SIZE; j++)
+		for (i = 0; i < SIZE; i++)
+		{
+			assert(block[i + j*SIZE + k*SIZE*SIZE] == 1.0f);
+			block[i + j*SIZE + k*SIZE*SIZE] = 33.0f;
+		}
+		starpu_data_release(block_handle);
+
+		ret = starpu_mpi_send(block_handle, 0, 0x1337, MPI_COMM_WORLD);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_send");
+	}
+
+	FPRINTF(stdout, "Rank %d is done\n", rank);
+	fflush(stdout);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 151 - 0
mpi/tests/block_interface_pinned.c

@@ -0,0 +1,151 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <stdlib.h>
+#include "helper.h"
+
+#define NITER	2048
+
+#define BIGSIZE	64
+#define SIZE	64
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(NULL, NULL);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size < 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need at least processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	/* We only use 2 nodes for that test */
+	if (rank >= 2)
+	{
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+
+	/* Node 0 will allocate a big block and only register an inner part of
+	 * it as the block data, Node 1 will allocate a block of small size and
+	 * register it directly. Node 0 and 1 will then exchange the content of
+	 * their blocks. */
+
+	float *block;
+	starpu_data_handle_t block_handle;
+
+	if (rank == 0)
+	{
+		starpu_malloc((void **)&block,
+				BIGSIZE*BIGSIZE*BIGSIZE*sizeof(float));
+		memset(block, 0, BIGSIZE*BIGSIZE*BIGSIZE*sizeof(float));
+
+		/* fill the inner block */
+		unsigned i, j, k;
+		for (k = 0; k < SIZE; k++)
+		for (j = 0; j < SIZE; j++)
+		for (i = 0; i < SIZE; i++)
+		{
+			block[i + j*BIGSIZE + k*BIGSIZE*BIGSIZE] = 1.0f;
+		}
+
+		starpu_block_data_register(&block_handle, 0,
+			(uintptr_t)block, BIGSIZE, BIGSIZE*BIGSIZE,
+			SIZE, SIZE, SIZE, sizeof(float));
+	}
+	else /* rank == 1 */
+	{
+		starpu_malloc((void **)&block,
+			SIZE*SIZE*SIZE*sizeof(float));
+		memset(block, 0, SIZE*SIZE*SIZE*sizeof(float));
+
+		starpu_block_data_register(&block_handle, 0,
+			(uintptr_t)block, SIZE, SIZE*SIZE,
+			SIZE, SIZE, SIZE, sizeof(float));
+	}
+
+	if (rank == 0)
+	{
+		MPI_Status status;
+
+		ret = starpu_mpi_send(block_handle, 1, 0x42, MPI_COMM_WORLD);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_send");
+
+		ret = starpu_mpi_recv(block_handle, 1, 0x1337, MPI_COMM_WORLD, &status);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_recv");
+
+		/* check the content of the block */
+		starpu_data_acquire(block_handle, STARPU_R);
+		unsigned i, j, k;
+		for (k = 0; k < SIZE; k++)
+		for (j = 0; j < SIZE; j++)
+		for (i = 0; i < SIZE; i++)
+		{
+			assert(block[i + j*BIGSIZE + k*BIGSIZE*BIGSIZE] == 33.0f);
+		}
+		starpu_data_release(block_handle);
+
+	}
+	else /* rank == 1 */
+	{
+		MPI_Status status;
+
+		ret = starpu_mpi_recv(block_handle, 0, 0x42, MPI_COMM_WORLD, &status);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_recv");
+
+		/* check the content of the block and modify it */
+		ret = starpu_data_acquire(block_handle, STARPU_RW);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+
+		unsigned i, j, k;
+		for (k = 0; k < SIZE; k++)
+		for (j = 0; j < SIZE; j++)
+		for (i = 0; i < SIZE; i++)
+		{
+			assert(block[i + j*SIZE + k*SIZE*SIZE] == 1.0f);
+			block[i + j*SIZE + k*SIZE*SIZE] = 33.0f;
+		}
+		starpu_data_release(block_handle);
+
+		ret = starpu_mpi_send(block_handle, 0, 0x1337, MPI_COMM_WORLD);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_send");
+
+	}
+
+	FPRINTF(stdout, "Rank %d is done\n", rank);
+	fflush(stdout);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 22 - 0
mpi/tests/helper.h

@@ -0,0 +1,22 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <errno.h>
+
+#define STARPU_TEST_SKIPPED 77
+
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+

+ 143 - 0
mpi/tests/insert_task.c

@@ -0,0 +1,143 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	unsigned *x = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	unsigned *y = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
+
+        FPRINTF(stdout, "VALUES: %u %u\n", *x, *y);
+        *x = (*x + *y) / 2;
+}
+
+struct starpu_codelet mycodelet =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {func_cpu, NULL},
+        .nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_R}
+};
+
+#define X     4
+#define Y     5
+
+/* Returns the MPI node that owns the data at index (x, y) */
+int my_distrib(int x, int y, int nb_nodes)
+{
+        return x % nb_nodes;
+}
+
+
+int main(int argc, char **argv)
+{
+        int rank, size, x, y;
+        int value=0, ret;
+        unsigned matrix[X][Y];
+        starpu_data_handle_t data_handles[X][Y];
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize_extended(&rank, &size);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
+
+        for(x = 0; x < X; x++)
+	{
+                for (y = 0; y < Y; y++)
+		{
+                        matrix[x][y] = (rank+1)*10 + value;
+                        value++;
+                }
+        }
+#if 0
+        for(x = 0; x < X; x++) {
+                FPRINTF(stdout, "[%d] ", rank);
+                for (y = 0; y < Y; y++) {
+                        FPRINTF(stdout, "%3d ", matrix[x][y]);
+                }
+                FPRINTF(stdout, "\n");
+        }
+#endif
+
+        for(x = 0; x < X; x++)
+	{
+                for (y = 0; y < Y; y++)
+		{
+                        int mpi_rank = my_distrib(x, y, size);
+                        if (mpi_rank == rank)
+			{
+                                //FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
+                                starpu_variable_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
+                        }
+                        else if (rank == mpi_rank+1 || rank == mpi_rank-1)
+			{
+                                /* I don't own that index, but will need it for my computations */
+                                //FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
+                                starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(unsigned));
+                        }
+                        else
+			{
+                                /* I know it's useless to allocate anything for this */
+                                data_handles[x][y] = NULL;
+                        }
+                        if (data_handles[x][y])
+			{
+                                starpu_data_set_rank(data_handles[x][y], mpi_rank);
+                                starpu_data_set_tag(data_handles[x][y], (y*X)+x);
+			}
+                }
+        }
+
+        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1][1], STARPU_R, data_handles[0][1], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0][1], STARPU_R, data_handles[0][0], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+
+        FPRINTF(stderr, "Waiting ...\n");
+        starpu_task_wait_for_all();
+
+        for(x = 0; x < X; x++)
+	{
+                for (y = 0; y < Y; y++)
+		{
+                        if (data_handles[x][y])
+                                starpu_data_unregister(data_handles[x][y]);
+                }
+        }
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+#if 0
+        for(x = 0; x < X; x++)
+	{
+                FPRINTF(stdout, "[%d] ", rank);
+                for (y = 0; y < Y; y++)
+		{
+                        FPRINTF(stdout, "%3d ", matrix[x][y]);
+                }
+                FPRINTF(stdout, "\n");
+        }
+#endif
+
+	return 0;
+}

+ 165 - 0
mpi/tests/insert_task_block.c

@@ -0,0 +1,165 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	unsigned *matrix = (unsigned *)STARPU_MATRIX_GET_PTR(descr[0]);
+	int nx = (int)STARPU_MATRIX_GET_NX(descr[0]);
+	int ny = (int)STARPU_MATRIX_GET_NY(descr[0]);
+	int ld = (int)STARPU_MATRIX_GET_LD(descr[0]);
+
+        int i, j;
+        unsigned sum=0;
+
+	for (i = 0; i < nx; i++)
+	{
+		for (j = 0; j < ny; j++)
+		{
+                        sum += matrix[i+j*ld];
+                }
+        }
+	for (i = 0; i < nx; i++)
+	{
+		for (j = 0; j < ny; j++)
+		{
+                        matrix[i+j*ld] = sum; /* or sum/(nx*ny) to average */
+                }
+        }
+}
+
+struct starpu_codelet mycodelet =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {func_cpu, NULL},
+        .nbuffers = 1,
+	.modes = {STARPU_RW}
+};
+
+#define SIZE       6
+#define BLOCKS     3
+
+/* Returns the MPI node that owns the data block at index (x, y) */
+int my_distrib(int x, int y, int nb_nodes)
+{
+        return x % nb_nodes;
+}
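+/* For instance, with BLOCKS = 3 and two MPI nodes, block rows are distributed
+   cyclically: row 0 -> node 0, row 1 -> node 1, row 2 -> node 0; the y index
+   is ignored. */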
+
+
+int main(int argc, char **argv)
+{
+        int rank, size, x, y;
+        int ret, value=0;
+        unsigned matrix[SIZE*SIZE];
+        starpu_data_handle_t data_handles[SIZE][SIZE];
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize_extended(&rank, &size);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
+
+        for(x = 0; x < SIZE; x++)
+	{
+                for (y = 0; y < SIZE; y++)
+		{
+                        matrix[x+y*SIZE] = rank*100 + value;
+                        value++;
+                }
+        }
+#if 1
+        for(x = 0; x < SIZE; x++)
+	{
+                FPRINTF(stdout, "[%d] ", rank);
+                for (y = 0; y < SIZE; y++)
+		{
+                        FPRINTF(stdout, "%3u ", matrix[x+y*SIZE]);
+                }
+                FPRINTF(stdout, "\n");
+        }
+#endif
+
+        for(x = 0; x < BLOCKS; x++)
+	{
+                for (y = 0; y < BLOCKS; y++)
+		{
+                        int mpi_rank = my_distrib(x, y, size);
+                        if (mpi_rank == rank)
+			{
+                                //FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
+                                starpu_matrix_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[((SIZE/BLOCKS)*x) + ((SIZE/BLOCKS)*y) * SIZE]),
+                                                            SIZE, SIZE/BLOCKS, SIZE/BLOCKS, sizeof(unsigned));
+                        }
+                        else if (rank == mpi_rank+1 || rank == mpi_rank-1)
+			{
+                                /* I don't own that index, but will need it for my computations */
+                                //FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
+                                starpu_matrix_data_register(&data_handles[x][y], -1, (uintptr_t)&(matrix[((SIZE/BLOCKS)*x) + ((SIZE/BLOCKS)*y) * SIZE]),
+                                                            SIZE, SIZE/BLOCKS, SIZE/BLOCKS, sizeof(unsigned));
+                        }
+                        else
+			{
+                                /* I know it's useless to allocate anything for this */
+                                data_handles[x][y] = NULL;
+                        }
+                        if (data_handles[x][y])
+			{
+                                starpu_data_set_rank(data_handles[x][y], mpi_rank);
+                                starpu_data_set_tag(data_handles[x][y], (y*BLOCKS)+x);
+			}
+                }
+        }
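+        /* Each handle maps a (SIZE/BLOCKS) x (SIZE/BLOCKS) tile of the
+           column-major SIZE x SIZE array: element (i,j) of tile (x,y) lives at
+           matrix[(SIZE/BLOCKS)*x + i + ((SIZE/BLOCKS)*y + j)*SIZE], which is
+           why SIZE is passed as the leading dimension. */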
+
+        for(x = 0; x < BLOCKS; x++)
+	{
+                for (y = 0; y < BLOCKS; y++)
+		{
+                        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
+						     STARPU_RW, data_handles[x][y],
+						     0);
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+
+                }
+        }
+
+        FPRINTF(stderr, "Waiting ...\n");
+        starpu_task_wait_for_all();
+
+        for(x = 0; x < BLOCKS; x++)
+	{
+                for (y = 0; y < BLOCKS; y++)
+		{
+                        if (data_handles[x][y])
+                                starpu_data_unregister(data_handles[x][y]);
+                }
+        }
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+#if 1
+        for(x = 0; x < SIZE; x++)
+	{
+                FPRINTF(stdout, "[%d] ", rank);
+                for (y = 0; y < SIZE; y++)
+		{
+                        FPRINTF(stdout, "%3u ", matrix[x+y*SIZE]);
+                }
+                FPRINTF(stdout, "\n");
+        }
+#endif
+
+	return 0;
+}

+ 152 - 0
mpi/tests/insert_task_cache.c

@@ -0,0 +1,152 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	unsigned *x = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	unsigned *y = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
+
+        FPRINTF(stdout, "VALUES: %u %u\n", *x, *y);
+        *x = (*x + *y) / 2;
+}
+
+struct starpu_codelet mycodelet =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {func_cpu, NULL},
+        .nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_R}
+};
+
+#define X     4
+#define Y     5
+
+/* Returns the MPI node that owns the data at index (x, y) */
+int my_distrib(int x, int y, int nb_nodes)
+{
+        return x % nb_nodes;
+}
+
+
+int main(int argc, char **argv)
+{
+        int rank, size, x, y;
+        int ret,value=0;
+        unsigned matrix[X][Y];
+        starpu_data_handle_t data_handles[X][Y];
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize_extended(&rank, &size);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
+
+        for(x = 0; x < X; x++)
+	{
+                for (y = 0; y < Y; y++)
+		{
+                        matrix[x][y] = (rank+1)*10 + value;
+                        value++;
+                }
+        }
+#if 0
+        for(x = 0; x < X; x++)
+	{
+                FPRINTF(stdout, "[%d] ", rank);
+                for (y = 0; y < Y; y++)
+		{
+                        FPRINTF(stdout, "%3u ", matrix[x][y]);
+                }
+                FPRINTF(stdout, "\n");
+        }
+#endif
+
+        for(x = 0; x < X; x++)
+	{
+                for (y = 0; y < Y; y++)
+		{
+                        int mpi_rank = my_distrib(x, y, size);
+                        if (mpi_rank == rank)
+			{
+                                //FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
+                                starpu_variable_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
+                        }
+                        else if (rank == mpi_rank+1 || rank == mpi_rank-1)
+			{
+                                /* I don't own that index, but will need it for my computations */
+                                //FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
+                                starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(unsigned));
+                        }
+                        else
+			{
+                                /* I know it's useless to allocate anything for this */
+                                data_handles[x][y] = NULL;
+                        }
+                        if (data_handles[x][y])
+			{
+                                starpu_data_set_rank(data_handles[x][y], mpi_rank);
+                                starpu_data_set_tag(data_handles[x][y], (y*X)+x);
+			}
+                }
+        }
+
+	mycodelet.name = "codelet1";
+        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1][1], STARPU_R, data_handles[0][1], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+
+	mycodelet.name = "codelet2";
+        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+
+	mycodelet.name = "codelet3";
+        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0][1], STARPU_R, data_handles[0][0], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+
+	mycodelet.name = "codelet4";
+        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
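+	/* codelet2 and codelet4 are identical on purpose: if the communication
+	   cache works, data_handles[0][1] is presumably sent to the node owning
+	   data_handles[3][1] only once, the second task reusing the copy. */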
+
+        FPRINTF(stderr, "Waiting ...\n");
+        starpu_task_wait_for_all();
+
+        for(x = 0; x < X; x++)
+	{
+                for (y = 0; y < Y; y++)
+		{
+                        if (data_handles[x][y])
+                                starpu_data_unregister(data_handles[x][y]);
+                }
+        }
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+#if 0
+        for(x = 0; x < X; x++)
+	{
+                FPRINTF(stdout, "[%d] ", rank);
+                for (y = 0; y < Y; y++)
+		{
+                        FPRINTF(stdout, "%3u ", matrix[x][y]);
+                }
+                FPRINTF(stdout, "\n");
+        }
+#endif
+
+	return 0;
+}

+ 180 - 0
mpi/tests/insert_task_owner.c

@@ -0,0 +1,180 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+void func_cpu(__attribute__ ((unused)) void *descr[], void *_args)
+{
+	int node;
+	int rank;
+
+        starpu_codelet_unpack_args(_args, &node);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	FPRINTF(stderr, "Expected node: %d - Actual node: %d\n", node, rank);
+
+	assert(node == rank);
+}
+
+struct starpu_codelet mycodelet_r_w =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {func_cpu, NULL},
+        .nbuffers = 2,
+	.modes = {STARPU_R, STARPU_W}
+};
+
+struct starpu_codelet mycodelet_rw_r =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {func_cpu, NULL},
+        .nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_R}
+};
+
+struct starpu_codelet mycodelet_rw_rw =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {func_cpu, NULL},
+        .nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_RW}
+};
+
+struct starpu_codelet mycodelet_w_r =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {func_cpu, NULL},
+        .nbuffers = 2,
+	.modes = {STARPU_W, STARPU_R}
+};
+
+struct starpu_codelet mycodelet_r_r =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {func_cpu, NULL},
+        .nbuffers = 2,
+	.modes = {STARPU_R, STARPU_R}
+};
+
+int main(int argc, char **argv)
+{
+        int ret, rank, size, err, node;
+        int x0=32, x1=23;
+        starpu_data_handle_t data_handlesx0;
+        starpu_data_handle_t data_handlesx1;
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize_extended(&rank, &size);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
+
+        if (size != 2)
+	{
+		if (rank == 0) FPRINTF(stderr, "We need exactly 2 processes.\n");
+                starpu_mpi_shutdown();
+                starpu_shutdown();
+                return STARPU_TEST_SKIPPED;
+        }
+
+        if (rank == 0)
+	{
+                starpu_variable_data_register(&data_handlesx0, 0, (uintptr_t)&x0, sizeof(x0));
+                starpu_data_set_rank(data_handlesx0, rank);
+		starpu_data_set_tag(data_handlesx0, 0);
+                starpu_variable_data_register(&data_handlesx1, -1, (uintptr_t)NULL, sizeof(int));
+                starpu_data_set_rank(data_handlesx1, 1);
+		starpu_data_set_tag(data_handlesx1, 1);
+        }
+        else if (rank == 1)
+	{
+                starpu_variable_data_register(&data_handlesx1, 0, (uintptr_t)&x1, sizeof(x1));
+                starpu_data_set_rank(data_handlesx1, rank);
+		starpu_data_set_tag(data_handlesx1, 1);
+                starpu_variable_data_register(&data_handlesx0, -1, (uintptr_t)NULL, sizeof(int));
+                starpu_data_set_rank(data_handlesx0, 0);
+		starpu_data_set_tag(data_handlesx0, 0);
+        }
+
+	node = starpu_data_get_rank(data_handlesx1);
+        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_w,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_R, data_handlesx0, STARPU_W, data_handlesx1,
+				     0);
+        assert(err == 0);
+
+	node = starpu_data_get_rank(data_handlesx0);
+        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_r,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_RW, data_handlesx0, STARPU_R, data_handlesx1,
+				     0);
+        assert(err == 0);
+
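+        /* Both handles are written to but owned by different nodes, and no
+           execution node is specified, so StarPU cannot choose one: this
+           submission is expected to fail with -EINVAL. */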
+        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_rw,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1,
+				     0);
+        assert(err == -EINVAL);
+
+	node = 1;
+        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_rw,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
+				     0);
+        assert(err == 0);
+
+	node = 0;
+        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_rw,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
+				     0);
+        assert(err == 0);
+
+	node = 0;
+        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_r,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_R, data_handlesx0, STARPU_R, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
+				     0);
+        assert(err == 0);
+
+        /* The node given via STARPU_EXECUTE_ON_NODE overrides the execution
+           node, even though the access modes of the data already determine
+           which node would execute the codelet */
+	node = 0;
+        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_w,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_R, data_handlesx0, STARPU_W, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
+				     0);
+        assert(err == 0);
+
+        /* The node given via STARPU_EXECUTE_ON_NODE overrides the execution
+           node, even though the access modes of the data already determine
+           which node would execute the codelet */
+	node = 0;
+        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_w_r,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_W, data_handlesx0, STARPU_R, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
+				     0);
+        assert(err == 0);
+
+	fprintf(stderr, "Waiting ...\n");
+        starpu_task_wait_for_all();
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	return 0;
+}
+

+ 120 - 0
mpi/tests/insert_task_owner2.c

@@ -0,0 +1,120 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	int *x0 = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	int *x1 = (int *)STARPU_VARIABLE_GET_PTR(descr[1]);
+	int *x2 = (int *)STARPU_VARIABLE_GET_PTR(descr[2]);
+	int *y = (int *)STARPU_VARIABLE_GET_PTR(descr[3]);
+
+        FPRINTF(stderr, "-------> CODELET VALUES: %d %d %d %d\n", *x0, *x1, *x2, *y);
+        *y = (*x0 + *x1) * 100;
+        *x1 = 12;
+        *x2 = 24;
+        *x0 = 36;
+        FPRINTF(stderr, "-------> CODELET VALUES: %d %d %d %d\n", *x0, *x1, *x2, *y);
+}
+
+struct starpu_codelet mycodelet =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {func_cpu, NULL},
+        .nbuffers = 4,
+	.modes = {STARPU_R, STARPU_RW, STARPU_W, STARPU_W}
+};
+
+int main(int argc, char **argv)
+{
+        int rank, size, err;
+        int x[3], y=0;
+        int i, ret;
+        starpu_data_handle_t data_handles[4];
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize_extended(&rank, &size);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
+
+        if (rank == 0)
+	{
+                for(i=0 ; i<3 ; i++)
+		{
+                        x[i] = 10*(i+1);
+                        starpu_variable_data_register(&data_handles[i], 0, (uintptr_t)&x[i], sizeof(x[i]));
+                }
+                y = -1;
+                starpu_variable_data_register(&data_handles[3], -1, (uintptr_t)NULL, sizeof(int));
+        }
+        else if (rank == 1)
+	{
+                for(i=0 ; i<3 ; i++)
+		{
+                        x[i] = -1;
+                        starpu_variable_data_register(&data_handles[i], -1, (uintptr_t)NULL, sizeof(int));
+                }
+                y=200;
+                starpu_variable_data_register(&data_handles[3], 0, (uintptr_t)&y, sizeof(int));
+        }
+        else
+	{
+                for(i=0 ; i<4 ; i++)
+                        starpu_variable_data_register(&data_handles[i], -1, (uintptr_t)NULL, sizeof(int));
+	}
+        FPRINTF(stderr, "[%d][init] VALUES: %d %d %d %d\n", rank, x[0], x[1], x[2], y);
+
+	for(i=0 ; i<3 ; i++)
+	{
+		starpu_data_set_rank(data_handles[i], 0);
+		starpu_data_set_tag(data_handles[i], i);
+	}
+	starpu_data_set_rank(data_handles[3], 1);
+	starpu_data_set_tag(data_handles[3], 3);
+
+        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
+                                     STARPU_R, data_handles[0], STARPU_RW, data_handles[1],
+                                     STARPU_W, data_handles[2],
+                                     STARPU_W, data_handles[3],
+                                     STARPU_EXECUTE_ON_NODE, 1, 0);
+	STARPU_CHECK_RETURN_VALUE(err, "starpu_mpi_insert_task");
+        starpu_task_wait_for_all();
+
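+        /* Bring every value back to rank 0 and read it through the local
+           pointer; acquire/release brackets the direct access. */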
+        int *values = malloc(4 * sizeof(int));
+        for(i=0 ; i<4 ; i++)
+	{
+                starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, data_handles[i], 0, NULL, NULL);
+		if (rank == 0)
+		{
+			starpu_data_acquire(data_handles[i], STARPU_R);
+			values[i] = *((int *)starpu_handle_get_local_ptr(data_handles[i]));
+			starpu_data_release(data_handles[i]);
+		}
+        }
+        FPRINTF(stderr, "[%d][local ptr] VALUES: %d %d %d %d\n", rank, values[0], values[1], values[2], values[3]);
+        FPRINTF(stderr, "[%d][end] VALUES: %d %d %d %d\n", rank, x[0], x[1], x[2], y);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	return 0;
+}
+

+ 99 - 0
mpi/tests/insert_task_owner_data.c

@@ -0,0 +1,99 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	int *x0 = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	int *x1 = (int *)STARPU_VARIABLE_GET_PTR(descr[1]);
+
+	*x0 += 1;
+	*x1 *= *x1;
+}
+
+struct starpu_codelet mycodelet =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {func_cpu, NULL},
+        .nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_RW}
+};
+
+int main(int argc, char **argv)
+{
+        int rank, size, err;
+        int x[2];
+        int ret, i;
+        starpu_data_handle_t data_handles[2];
+	int values[2];
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize_extended(&rank, &size);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
+
+        if (rank == 0)
+	{
+		x[0] = 11;
+		starpu_variable_data_register(&data_handles[0], 0, (uintptr_t)&x[0], sizeof(x[0]));
+		starpu_variable_data_register(&data_handles[1], -1, (uintptr_t)NULL, sizeof(x[1]));
+        }
+        else if (rank == 1)
+	{
+		x[1] = 12;
+		starpu_variable_data_register(&data_handles[0], -1, (uintptr_t)NULL, sizeof(x[0]));
+		starpu_variable_data_register(&data_handles[1], 0, (uintptr_t)&x[1], sizeof(x[1]));
+        }
+	else
+	{
+		starpu_variable_data_register(&data_handles[0], -1, (uintptr_t)NULL, sizeof(x[0]));
+		starpu_variable_data_register(&data_handles[1], -1, (uintptr_t)NULL, sizeof(x[1]));
+        }
+
+	starpu_data_set_rank(data_handles[0], 0);
+	starpu_data_set_tag(data_handles[0], 0);
+	starpu_data_set_rank(data_handles[1], 1);
+	starpu_data_set_tag(data_handles[1], 1);
+
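+        /* STARPU_EXECUTE_ON_DATA selects the owner of data_handles[1], i.e.
+           rank 1, as the execution node; since both handles are accessed in
+           RW mode, rank 0's value is sent there and its update sent back. */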
+        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
+                                     STARPU_RW, data_handles[0], STARPU_RW, data_handles[1],
+                                     STARPU_EXECUTE_ON_DATA, data_handles[1],
+				     0);
+        assert(err == 0);
+        starpu_task_wait_for_all();
+
+        for(i=0 ; i<2 ; i++)
+	{
+                starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, data_handles[i], 0, NULL, NULL);
+		if (rank == 0)
+		{
+			starpu_data_acquire(data_handles[i], STARPU_R);
+			values[i] = *((int *)starpu_handle_get_local_ptr(data_handles[i]));
+			starpu_data_release(data_handles[i]);
+		}
+        }
+        FPRINTF(stderr, "[%d][local ptr] VALUES: %d %d\n", rank, values[0], values[1]);
+	ret = 0;
+	if (rank == 0 && (values[0] != 12 || values[1] != 144))
+		ret = EXIT_FAILURE;
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	return ret;
+}
+

+ 80 - 0
mpi/tests/mpi_detached_tag.c

@@ -0,0 +1,80 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#define NITER	2048
+#define SIZE	16
+
+float *tab;
+starpu_data_handle_t tab_handle;
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(NULL, NULL);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size != 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need exactly 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+
+	tab = malloc(SIZE*sizeof(float));
+
+	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
+
+	unsigned nloops = NITER;
+	unsigned loop;
+
+	int other_rank = (rank + 1)%2;
+
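+	/* Each detached transfer is bound to a StarPU tag which gets unlocked
+	   when the communication completes, so starpu_tag_wait() below blocks
+	   until the send or receive has finished. */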
+	for (loop = 0; loop < nloops; loop++)
+	{
+		starpu_tag_t tag = (starpu_tag_t)loop;
+
+		if ((loop % 2) == rank)
+		{
+			starpu_mpi_isend_detached_unlock_tag(tab_handle, other_rank, loop, MPI_COMM_WORLD, tag);
+		}
+		else
+		{
+			starpu_mpi_irecv_detached_unlock_tag(tab_handle, other_rank, loop, MPI_COMM_WORLD, tag);
+		}
+
+		starpu_tag_wait(tag);
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 79 - 0
mpi/tests/mpi_irecv.c

@@ -0,0 +1,79 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#define NITER	2048
+#define SIZE	16
+
+float *tab;
+starpu_data_handle_t tab_handle;
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(NULL, NULL);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size != 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need exactly 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+
+	tab = malloc(SIZE*sizeof(float));
+
+	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
+
+	unsigned nloops = NITER;
+	unsigned loop;
+
+	int other_rank = (rank + 1)%2;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		if ((loop % 2) == rank)
+		{
+			starpu_mpi_send(tab_handle, other_rank, loop, MPI_COMM_WORLD);
+		}
+		else
+		{
+			MPI_Status status;
+			starpu_mpi_req req;
+			starpu_mpi_irecv(tab_handle, &req, other_rank, loop, MPI_COMM_WORLD);
+			starpu_mpi_wait(&req, &status);
+		}
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 97 - 0
mpi/tests/mpi_irecv_detached.c

@@ -0,0 +1,97 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <common/utils.h>
+#include <pthread.h>
+#include "helper.h"
+
+#define NITER	2048
+#define SIZE	16
+
+float *tab;
+starpu_data_handle_t tab_handle;
+
+static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
+
+void callback(void *arg)
+{
+	unsigned *received = arg;
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	*received = 1;
+	_STARPU_PTHREAD_COND_SIGNAL(&cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+}
+
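+/* The callback runs when the detached receive completes; it merely flips a
+   flag under the mutex so that the main thread can block on the condition
+   variable until the transfer is done. */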
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(NULL, NULL);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size != 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need exactly 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+
+	tab = malloc(SIZE*sizeof(float));
+
+	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
+
+	unsigned nloops = NITER;
+	unsigned loop;
+
+	int other_rank = (rank + 1)%2;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		if (rank == 0)
+		{
+			starpu_mpi_send(tab_handle, other_rank, loop, MPI_COMM_WORLD);
+		}
+		else
+		{
+			int received = 0;
+			starpu_mpi_irecv_detached(tab_handle, other_rank, loop, MPI_COMM_WORLD, callback, &received);
+
+			_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+			while (!received)
+				_STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
+			_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+		}
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 80 - 0
mpi/tests/mpi_isend.c

@@ -0,0 +1,80 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#define NITER	2048
+#define SIZE	16
+
+float *tab;
+starpu_data_handle_t tab_handle;
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(NULL, NULL);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size != 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need exactly 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+
+	tab = malloc(SIZE*sizeof(float));
+
+	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
+
+	unsigned nloops = NITER;
+	unsigned loop;
+
+	int other_rank = (rank + 1)%2;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		if ((loop % 2) == rank)
+		{
+			MPI_Status status;
+			starpu_mpi_req req;
+			starpu_mpi_isend(tab_handle, &req, other_rank, loop, MPI_COMM_WORLD);
+			starpu_mpi_wait(&req, &status);
+		}
+		else
+		{
+			MPI_Status status;
+			starpu_mpi_recv(tab_handle, other_rank, loop, MPI_COMM_WORLD, &status);
+		}
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 98 - 0
mpi/tests/mpi_isend_detached.c

@@ -0,0 +1,98 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <common/utils.h>
+#include <pthread.h>
+#include "helper.h"
+
+#define NITER	2048
+#define SIZE	16
+
+static float *tab;
+static starpu_data_handle_t tab_handle;
+
+static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
+
+void callback(void *arg)
+{
+	unsigned *sent = arg;
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	*sent = 1;
+	_STARPU_PTHREAD_COND_SIGNAL(&cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+}
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(NULL, NULL);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size != 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need exactly 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+
+	tab = malloc(SIZE*sizeof(float));
+
+	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
+
+	unsigned nloops = NITER;
+	unsigned loop;
+
+	int other_rank = (rank + 1)%2;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		if (rank == 0)
+		{
+			int sent = 0;
+			starpu_mpi_isend_detached(tab_handle, other_rank, loop, MPI_COMM_WORLD, callback, &sent);
+
+			_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+			while (!sent)
+				_STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
+			_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+		}
+		else
+		{
+			MPI_Status status;
+			starpu_mpi_recv(tab_handle, other_rank, loop, MPI_COMM_WORLD, &status);
+		}
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 86 - 0
mpi/tests/mpi_test.c

@@ -0,0 +1,86 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#define NITER	2048
+#define SIZE	16
+
+float *tab;
+starpu_data_handle_t tab_handle;
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(NULL, NULL);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size != 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need exactly 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+
+	tab = malloc(SIZE*sizeof(float));
+
+	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
+
+	unsigned nloops = NITER;
+	unsigned loop;
+
+	int other_rank = (rank + 1)%2;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		starpu_mpi_req req;
+
+		if ((loop % 2) == rank)
+		{
+			starpu_mpi_isend(tab_handle, &req, other_rank, loop, MPI_COMM_WORLD);
+		}
+		else
+		{
+			starpu_mpi_irecv(tab_handle, &req, other_rank, loop, MPI_COMM_WORLD);
+		}
+
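+		/* Poll until the request completes: starpu_mpi_test() is the
+		   non-blocking counterpart of starpu_mpi_wait(). */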
+		int finished = 0;
+		do
+		{
+			MPI_Status status;
+			starpu_mpi_test(&req, &finished, &status);
+		}
+		while (!finished);
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 92 - 0
mpi/tests/multiple_send.c

@@ -0,0 +1,92 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#define NITER	2048
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+        unsigned send[2] = {42, 11};
+        unsigned recv[2] = {33, 33};
+        starpu_mpi_req req[2];
+        starpu_data_handle_t send_handle[2];
+        starpu_data_handle_t recv_handle[2];
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize_extended(&rank, &size);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
+
+	if (size < 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need at least 2 processes.\n");
+
+                starpu_mpi_shutdown();
+                starpu_shutdown();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	starpu_variable_data_register(&send_handle[0], 0, (uintptr_t)&send[0], sizeof(unsigned));
+	starpu_variable_data_register(&send_handle[1], 0, (uintptr_t)&send[1], sizeof(unsigned));
+	starpu_variable_data_register(&recv_handle[0], 0, (uintptr_t)&recv[0], sizeof(unsigned));
+	starpu_variable_data_register(&recv_handle[1], 0, (uintptr_t)&recv[1], sizeof(unsigned));
+
+        if (rank == 0)
+	{
+                starpu_mpi_isend(send_handle[0], &(req[0]), 1, 12, MPI_COMM_WORLD);
+                starpu_mpi_isend(send_handle[1], &(req[1]), 1, 13, MPI_COMM_WORLD);
+        }
+        else if (rank == 1)
+	{
+                starpu_mpi_irecv(recv_handle[0], &(req[0]), 0, 12, MPI_COMM_WORLD);
+                starpu_mpi_irecv(recv_handle[1], &(req[1]), 0, 13, MPI_COMM_WORLD);
+        }
+
+        if (rank == 0 || rank == 1)
+	{
+                int nb_req=2;
+                while (nb_req)
+		{
+                        int r=0;
+                        for(r=0 ; r<2 ; r++)
+			{
+                                if (req[r])
+				{
+                                        int finished = 0;
+                                        MPI_Status status;
+                                        starpu_mpi_test(&req[r], &finished, &status);
+                                        STARPU_ASSERT(finished != -1);
+                                        if (finished)
+					{
+                                                FPRINTF(stderr, "[%d] Request %d finished\n", rank, r);
+                                                req[r] = NULL;
+                                                nb_req--;
+                                        }
+                                }
+                        }
+                }
+        }
+        FPRINTF(stderr, "[%d] All requests finished\n", rank);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	return 0;
+}

+ 76 - 0
mpi/tests/pingpong.c

@@ -0,0 +1,76 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#define NITER	2048
+#define SIZE	16
+
+float *tab;
+starpu_data_handle_t tab_handle;
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(NULL, NULL);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size != 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need exactly 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+
+	tab = malloc(SIZE*sizeof(float));
+
+	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
+
+	unsigned nloops = NITER;
+	unsigned loop;
+	int other_rank = (rank + 1)%2;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		if ((loop % 2) == rank)
+		{
+			starpu_mpi_send(tab_handle, other_rank, loop, MPI_COMM_WORLD);
+		}
+		else
+		{
+			MPI_Status status;
+			starpu_mpi_recv(tab_handle, other_rank, loop, MPI_COMM_WORLD, &status);
+		}
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 129 - 0
mpi/tests/ring.c

@@ -0,0 +1,129 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#define NITER	2048
+
+unsigned token = 42;
+starpu_data_handle_t token_handle;
+
+#ifdef STARPU_USE_CUDA
+extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args);
+#endif
+
+void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	unsigned *tokenptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
+	(*tokenptr)++;
+}
+
+static struct starpu_codelet increment_cl =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {increment_cuda, NULL},
+#endif
+	.cpu_funcs = {increment_cpu, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_RW}
+};
+
+void increment_token(void)
+{
+	struct starpu_task *task = starpu_task_create();
+
+	task->cl = &increment_cl;
+	task->handles[0] = token_handle;
+	task->synchronous = 1;
+
+	int ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+}
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(NULL, NULL);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size < 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need at least 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+
+	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
+
+	unsigned nloops = NITER;
+	unsigned loop;
+
+	unsigned last_loop = nloops - 1;
+	unsigned last_rank = size - 1;
+
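+	/* The token makes nloops trips around the ring and every rank increments
+	   it once per trip, so the last rank should finally observe
+	   token == nloops*size. */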
+	for (loop = 0; loop < nloops; loop++)
+	{
+		int tag = loop*size + rank;
+
+		if (loop == 0 && rank == 0)
+		{
+			token = 0;
+			FPRINTF(stdout, "Start with token value %u\n", token);
+		}
+		else
+		{
+			MPI_Status status;
+			starpu_mpi_recv(token_handle, (rank+size-1)%size, tag, MPI_COMM_WORLD, &status);
+		}
+
+		increment_token();
+
+		if (loop == last_loop && rank == last_rank)
+		{
+			starpu_data_acquire(token_handle, STARPU_R);
+			FPRINTF(stdout, "Finished : token value %u\n", token);
+			starpu_data_release(token_handle);
+		}
+		else
+		{
+			starpu_mpi_send(token_handle, (rank+1)%size, tag+1, MPI_COMM_WORLD);
+		}
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	if (rank == last_rank)
+	{
+		STARPU_ASSERT(token == nloops*size);
+	}
+
+	return 0;
+}

+ 133 - 0
mpi/tests/ring_async.c

@@ -0,0 +1,133 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#define NITER	2048
+
+unsigned token = 42;
+starpu_data_handle_t token_handle;
+
+#ifdef STARPU_USE_CUDA
+extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args);
+#endif
+
+void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	unsigned *tokenptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
+	(*tokenptr)++;
+}
+
+static struct starpu_codelet increment_cl =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {increment_cuda, NULL},
+#endif
+	.cpu_funcs = {increment_cpu, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_RW}
+};
+
+void increment_token(void)
+{
+	struct starpu_task *task = starpu_task_create();
+
+	task->cl = &increment_cl;
+	task->handles[0] = token_handle;
+	task->synchronous = 1;
+
+	int ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+}
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(NULL, NULL);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size < 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need at least 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
+
+	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
+
+	unsigned nloops = NITER;
+	unsigned loop;
+
+	unsigned last_loop = nloops - 1;
+	unsigned last_rank = size - 1;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		int tag = loop*size + rank;
+
+		if (loop == 0 && rank == 0)
+		{
+			token = 0;
+			FPRINTF(stdout, "Start with token value %u\n", token);
+		}
+		else
+		{
+			MPI_Status status;
+			starpu_mpi_req req;
+			starpu_mpi_irecv(token_handle, &req, (rank+size-1)%size, tag, MPI_COMM_WORLD);
+			starpu_mpi_wait(&req, &status);
+		}
+
+		increment_token();
+
+		if (loop == last_loop && rank == last_rank)
+		{
+			starpu_data_acquire(token_handle, STARPU_R);
+			FPRINTF(stdout, "Finished : token value %u\n", token);
+			starpu_data_release(token_handle);
+		}
+		else
+		{
+			starpu_mpi_req req;
+			MPI_Status status;
+			starpu_mpi_isend(token_handle, &req, (rank+1)%size, tag+1, MPI_COMM_WORLD);
+			starpu_mpi_wait(&req, &status);
+		}
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	if (rank == last_rank)
+	{
+		STARPU_ASSERT(token == nloops*size);
+	}
+
+	return 0;
+}

+ 133 - 0
mpi/tests/ring_async_implicit.c

@@ -0,0 +1,133 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#define NITER	2048
+
+unsigned token = 42;
+starpu_data_handle_t token_handle;
+
+#ifdef STARPU_USE_CUDA
+extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args);
+#endif
+
+void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	unsigned *tokenptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
+	(*tokenptr)++;
+}
+
+static struct starpu_codelet increment_cl =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {increment_cuda, NULL},
+#endif
+	.cpu_funcs = {increment_cpu, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_RW}
+};
+
+void increment_token(void)
+{
+	struct starpu_task *task = starpu_task_create();
+
+	task->cl = &increment_cl;
+	task->handles[0] = token_handle;
+
+	int ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+}
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	/* MPI_Init is not called here: starpu_mpi_initialize_extended() below
+	   initializes MPI and provides the rank and size. */
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize_extended(&rank, &size);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
+
+	if (size < 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need at least 2 processes.\n");
+
+		starpu_mpi_shutdown();
+		starpu_shutdown();
+		return STARPU_TEST_SKIPPED;
+	}
+
+
+	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
+
+	unsigned nloops = NITER;
+	unsigned loop;
+
+	unsigned last_loop = nloops - 1;
+	unsigned last_rank = size - 1;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		int tag = loop*size + rank;
+
+		if (loop == 0 && rank == 0)
+		{
+			token = 0;
+			FPRINTF(stdout, "Start with token value %u\n", token);
+		}
+		else
+		{
+			starpu_mpi_irecv_detached(token_handle, (rank+size-1)%size, tag, MPI_COMM_WORLD, NULL, NULL);
+		}
+
+		increment_token();
+
+		if (loop == last_loop && rank == last_rank)
+		{
+			starpu_data_acquire(token_handle, STARPU_R);
+			FPRINTF(stdout, "Finished : token value %u\n", token);
+			starpu_data_release(token_handle);
+		}
+		else
+		{
+			starpu_mpi_isend_detached(token_handle, (rank+1)%size, tag+1, MPI_COMM_WORLD, NULL, NULL);
+		}
+	}
+
+	starpu_task_wait_for_all();
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+
+	if (rank == last_rank)
+	{
+                FPRINTF(stderr, "[%d] token = %u == %u * %d ?\n", rank, token, nloops, size);
+                STARPU_ASSERT(token == nloops*size);
+	}
+
+	return 0;
+}

+ 32 - 0
mpi/tests/ring_kernel.cu

@@ -0,0 +1,32 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
+static __global__ void cuda_incrementer(unsigned *token)
+{
+	(*token)++;
+}
+
+extern "C" void increment_cuda(void *descr[], void *_args)
+{
+	(void) _args;
+	unsigned *tokenptr = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
+
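+	/* Launch on the worker's dedicated stream and wait for completion: the
+	   codelet must not return before the increment is visible in device
+	   memory. */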
+	cuda_incrementer<<<1,1, 0, starpu_cuda_get_local_stream()>>>(tokenptr);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}