Explorar o código

mic: merge trunk

Thibaud Lambert hai 12 anos
pai
achega
dfcf80c9fd

+ 58 - 2
mpi/examples/Makefile.am

@@ -44,13 +44,16 @@ BUILT_SOURCES =
 
 CLEANFILES = *.gcno *.gcda *.linkinfo
 
-EXTRA_DIST = 					\
+EXTRA_DIST = 				\
 	mpi_lu/mpi_lu-float.h		\
 	mpi_lu/mpi_lu-double.h		\
 	mpi_lu/plu_example.c		\
+	mpi_lu/plu_implicit_example.c	\
+	mpi_lu/plu_outofcore_example.c	\
 	mpi_lu/plu_solve.c		\
 	mpi_lu/pxlu.h			\
 	mpi_lu/pxlu.c			\
+	mpi_lu/pxlu_implicit.c		\
 	mpi_lu/pxlu_kernels.h		\
 	mpi_lu/pxlu_kernels.c		\
 	matrix_decomposition/mpi_cholesky_codelets.h 	\
@@ -101,7 +104,11 @@ if !NO_BLAS_LIB
 
 examplebin_PROGRAMS += 			\
 	mpi_lu/plu_example_float	\
-	mpi_lu/plu_example_double
+	mpi_lu/plu_example_double	\
+	mpi_lu/plu_implicit_example_float	\
+	mpi_lu/plu_implicit_example_double	\
+	mpi_lu/plu_outofcore_example_float	\
+	mpi_lu/plu_outofcore_example_double
 
 mpi_lu_plu_example_float_LDADD =	\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
@@ -126,8 +133,57 @@ mpi_lu_plu_example_double_SOURCES =	\
 	mpi_lu/pdlu_kernels.c	    	\
 	mpi_lu/pdlu.c		    	\
 	$(top_srcdir)/examples/common/blas.c
+
+mpi_lu_plu_implicit_example_float_LDADD =	\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
+	$(STARPU_LIBNUMA_LDFLAGS)				\
+	$(STARPU_BLAS_LDFLAGS) -lm
+
+mpi_lu_plu_implicit_example_float_SOURCES =	\
+	mpi_lu/plu_implicit_example_float.c	\
+	mpi_lu/plu_solve_float.c		\
+	mpi_lu/pslu_kernels.c			\
+	mpi_lu/pslu_implicit.c			\
+	$(top_srcdir)/examples/common/blas.c
+
+mpi_lu_plu_implicit_example_double_LDADD =	\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
+	$(STARPU_LIBNUMA_LDFLAGS)				\
+	$(STARPU_BLAS_LDFLAGS) -lm
+
+# Double-precision implicit example: must be built from its own main file,
+# not from the out-of-core one.
+mpi_lu_plu_implicit_example_double_SOURCES =	\
+	mpi_lu/plu_implicit_example_double.c	\
+	mpi_lu/plu_solve_double.c		\
+	mpi_lu/pdlu_kernels.c			\
+	mpi_lu/pdlu_implicit.c			\
+	$(top_srcdir)/examples/common/blas.c
+
+mpi_lu_plu_outofcore_example_float_LDADD =	\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
+	$(STARPU_LIBNUMA_LDFLAGS)				\
+	$(STARPU_BLAS_LDFLAGS) -lm
+
+mpi_lu_plu_outofcore_example_float_SOURCES =	\
+	mpi_lu/plu_outofcore_example_float.c	\
+	mpi_lu/plu_solve_float.c		\
+	mpi_lu/pslu_kernels.c			\
+	mpi_lu/pslu_implicit.c			\
+	$(top_srcdir)/examples/common/blas.c
+
+mpi_lu_plu_outofcore_example_double_LDADD =	\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
+	$(STARPU_LIBNUMA_LDFLAGS)				\
+	$(STARPU_BLAS_LDFLAGS) -lm
+
+mpi_lu_plu_outofcore_example_double_SOURCES =	\
+	mpi_lu/plu_outofcore_example_double.c	\
+	mpi_lu/plu_solve_double.c		\
+	mpi_lu/pdlu_kernels.c			\
+	mpi_lu/pdlu_implicit.c			\
+	$(top_srcdir)/examples/common/blas.c
 endif
 
+
 ########################
 # MPI Cholesky example #
 ########################

+ 19 - 0
mpi/examples/mpi_lu/pdlu_implicit.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2013  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-double.h"
+#include "pxlu_implicit.c"

+ 6 - 0
mpi/examples/mpi_lu/plu_example.c

@@ -109,6 +109,12 @@ static void parse_args(int rank, int argc, char **argv)
 			char *argptr;
 			q = strtol(argv[++i], &argptr, 10);
 		}
+
+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0) {
+			fprintf(stderr,"usage: %s [-size n] [-nblocks b] [-check] [-display] [-numa] [-p p] [-q q]\n", argv[0]);
+			fprintf(stderr,"\np * q must be equal to the number of MPI nodes\n");
+			exit(0);
+		}
 	}
 }
 

+ 357 - 0
mpi/examples/mpi_lu/plu_implicit_example.c

@@ -0,0 +1,357 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include <math.h>
+#include <starpu.h>
+
+#include "pxlu.h"
+//#include "pxlu_kernels.h"
+
+#ifdef STARPU_HAVE_LIBNUMA
+#include <numaif.h>
+#endif
+
+static unsigned long size = 4096;
+static unsigned nblocks = 16;
+static unsigned check = 0;
+static int p = 1;
+static int q = 1;
+static unsigned display = 0;
+
+#ifdef STARPU_HAVE_LIBNUMA
+static unsigned numa = 0;
+#endif
+
+static size_t allocated_memory = 0;
+static size_t allocated_memory_extra = 0;
+
+static starpu_data_handle_t *dataA_handles;
+static TYPE **dataA;
+
+int get_block_rank(unsigned i, unsigned j);
+
+/*
+ * Parse the command line into the file-scope settings (size, nblocks,
+ * check, display, numa, p, q).
+ *
+ * Recognised options:
+ *   -size n, -nblocks b, -p p, -q q   numeric options taking one value
+ *   -check, -display, -numa           flags
+ *   -h, -help, --help                 print the usage on stderr and exit(0)
+ *
+ * Unknown arguments are ignored.  A value option given as the very last
+ * argument is reported and terminates the program with status 1 instead of
+ * reading past the end of argv.
+ */
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++) {
+		const char *opt = argv[i];
+		int takes_value = strcmp(opt, "-size") == 0
+			|| strcmp(opt, "-nblocks") == 0
+			|| strcmp(opt, "-p") == 0
+			|| strcmp(opt, "-q") == 0;
+
+		/* argv[argc] is NULL: never hand it to strtol() or strcmp() */
+		if (takes_value && i + 1 >= argc) {
+			fprintf(stderr, "%s: option %s requires a value\n", argv[0], opt);
+			exit(1);
+		}
+
+		if (strcmp(opt, "-size") == 0) {
+			size = strtol(argv[++i], NULL, 10);
+		}
+		else if (strcmp(opt, "-nblocks") == 0) {
+			nblocks = strtol(argv[++i], NULL, 10);
+		}
+		else if (strcmp(opt, "-check") == 0) {
+			check = 1;
+		}
+		else if (strcmp(opt, "-display") == 0) {
+			display = 1;
+		}
+		else if (strcmp(opt, "-numa") == 0) {
+#ifdef STARPU_HAVE_LIBNUMA
+			numa = 1;
+#else
+			/* No rank is available in this function, so every process warns */
+			fprintf(stderr, "Warning: libnuma is not available\n");
+#endif
+		}
+		else if (strcmp(opt, "-p") == 0) {
+			p = strtol(argv[++i], NULL, 10);
+		}
+		else if (strcmp(opt, "-q") == 0) {
+			q = strtol(argv[++i], NULL, 10);
+		}
+		else if (strcmp(opt, "-h") == 0 || strcmp(opt, "-help") == 0 || strcmp(opt, "--help") == 0) {
+			fprintf(stderr,"usage: %s [-size n] [-nblocks b] [-check] [-display] [-numa] [-p p] [-q q]\n", argv[0]);
+			fprintf(stderr,"\np * q must be equal to the number of MPI nodes\n");
+			exit(0);
+		}
+	}
+}
+
+/* Return the file-scope display flag (set to 1 by the -display option). */
+unsigned STARPU_PLU(display_flag)(void)
+{
+	return display;
+}
+
+/*
+ * Fill one square block with values drawn from starpu_drand48().
+ * The block has (psize/pnblocks) x (psize/pnblocks) elements, stored
+ * contiguously line after line starting at blockptr.
+ */
+static void fill_block_with_random(TYPE *blockptr, unsigned psize, unsigned pnblocks)
+{
+	const unsigned dim = psize / pnblocks;
+	unsigned row, col;
+
+	for (row = 0; row < dim; row++)
+	{
+		TYPE *line = &blockptr[row*dim];
+
+		for (col = 0; col < dim; col++)
+			line[col] = (TYPE)starpu_drand48();
+	}
+}
+
+/*
+ * Build the distributed matrix for this process.
+ *
+ * Allocates the nblocks x nblocks grids of handles (dataA_handles) and of
+ * block pointers (dataA), then walks every block (i, j):
+ *  - blocks whose get_block_rank() equals `rank` are allocated with
+ *    starpu_malloc(), filled with random values (the diagonal of diagonal
+ *    blocks is increased by 10*nblocks) and registered in STARPU_MAIN_RAM;
+ *  - all other blocks are registered with node -1 and a null pointer, and
+ *    their dataA entry is set to STARPU_POISON_PTR.
+ * Every handle is then given its owner rank and the tag j + i*nblocks.
+ *
+ * NOTE(review): the results of calloc() and starpu_malloc() are not checked.
+ */
+static void init_matrix(int rank)
+{
+#ifdef STARPU_HAVE_LIBNUMA
+	if (numa)
+	{
+		fprintf(stderr, "Using INTERLEAVE policy\n");
+		/* Bits 0 and 1 set: presumably interleave over NUMA nodes 0 and 1 -- confirm */
+		unsigned long nodemask = ((1<<0)|(1<<1));
+		int ret = set_mempolicy(MPOL_INTERLEAVE, &nodemask, 3);
+		if (ret)
+			perror("set_mempolicy failed");
+	}
+#endif
+
+	/* Allocate a grid of data handles, not all of them have to be allocated later on */
+	dataA_handles = calloc(nblocks*nblocks, sizeof(starpu_data_handle_t));
+	dataA = calloc(nblocks*nblocks, sizeof(TYPE *));
+	/* Bookkeeping only: bytes used by the two grids above */
+	allocated_memory_extra += nblocks*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *));
+
+	/* Size in bytes of one square block of (size/nblocks) elements per side */
+	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
+
+	/* Allocate all the blocks that belong to this mpi node */
+	unsigned long i,j;
+	for (j = 0; j < nblocks; j++)
+	{
+		for (i = 0; i < nblocks; i++)
+		{
+			int block_rank = get_block_rank(i, j);
+			TYPE **blockptr = &dataA[j+i*nblocks];
+			starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i];
+
+			if (block_rank == rank)
+			{
+				/* This blocks should be treated by the current MPI process */
+				/* Allocate and fill it */
+				starpu_malloc((void **)blockptr, blocksize);
+				allocated_memory += blocksize;
+
+				//fprintf(stderr, "Rank %d : fill block (i = %d, j = %d)\n", rank, i, j);
+				fill_block_with_random(*blockptr, size, nblocks);
+				//fprintf(stderr, "Rank %d : fill block (i = %d, j = %d)\n", rank, i, j);
+				if (i == j)
+				{
+					/* Boost the diagonal entries of diagonal blocks
+					 * (stride of one line plus one element) */
+					unsigned tmp;
+					for (tmp = 0; tmp < size/nblocks; tmp++)
+					{
+						(*blockptr)[tmp*((size/nblocks)+1)] += (TYPE)10*nblocks;
+					}
+				}
+
+				/* Register it to StarPU */
+				starpu_matrix_data_register(handleptr, STARPU_MAIN_RAM,
+					(uintptr_t)*blockptr, size/nblocks,
+					size/nblocks, size/nblocks, sizeof(TYPE));
+			}
+			else {
+				/* Block owned by another rank: register a handle with no local buffer */
+				starpu_matrix_data_register(handleptr, -1,
+					0, size/nblocks,
+					size/nblocks, size/nblocks, sizeof(TYPE));
+				*blockptr = STARPU_POISON_PTR;
+			}
+			/* Tell StarPU-MPI who owns the block and which tag identifies it */
+			starpu_data_set_rank(*handleptr, block_rank);
+			starpu_data_set_tag(*handleptr, j+i*nblocks);
+		}
+	}
+
+	//display_all_blocks(nblocks, size/nblocks);
+}
+
+/* Return the pointer stored in dataA for block (i, j), i.e. entry j + i*nblocks.
+ * For blocks owned by another rank init_matrix() stores STARPU_POISON_PTR there. */
+TYPE *STARPU_PLU(get_block)(unsigned i, unsigned j)
+{
+	return dataA[j+i*nblocks];
+}
+
+/*
+ * Owner rank of block (i, j) on the p x q process grid.
+ * Uses a 2D block cyclic distribution: p is the period along i,
+ * q is the period along j.
+ */
+int get_block_rank(unsigned i, unsigned j)
+{
+	const unsigned pos_i = i % p;
+	const unsigned pos_j = j % q;
+
+	return pos_j * p + pos_i;
+}
+
+/* Return the StarPU handle stored in dataA_handles for block (i, j),
+ * i.e. entry j + i*nblocks. */
+starpu_data_handle_t STARPU_PLU(get_block_handle)(unsigned i, unsigned j)
+{
+	return dataA_handles[j+i*nblocks];
+}
+
+/*
+ * When the display flag is set, print on stderr, for every block of the
+ * pnblocks x pnblocks grid, its owner rank, its data pointer and its handle.
+ * One output line is printed per value of j.  Does nothing otherwise.
+ */
+static void display_grid(int rank, unsigned pnblocks)
+{
+	unsigned bi, bj;
+
+	if (display)
+	{
+		fprintf(stderr, "2D grid layout (Rank %d): \n", rank);
+
+		for (bj = 0; bj < pnblocks; bj++)
+		{
+			for (bi = 0; bi < pnblocks; bi++)
+			{
+				TYPE *data = STARPU_PLU(get_block)(bi, bj);
+				starpu_data_handle_t hdl = STARPU_PLU(get_block_handle)(bi, bj);
+				int owner = get_block_rank(bi, bj);
+
+				fprintf(stderr, "%d (data %p handle %p)", owner, data, hdl);
+			}
+			fprintf(stderr, "\n");
+		}
+	}
+}
+
+/*
+ * Entry point of the implicit MPI LU example.
+ *
+ * Steps: parse the options, initialise StarPU and StarPU-MPI, check that
+ * p*q matches the number of MPI processes, build the local part of the
+ * matrix, run the factorisation through STARPU_PLU(plu_main), report the
+ * timing on rank 0, optionally check the result, then shut everything down.
+ */
+int main(int argc, char **argv)
+{
+	int rank;
+	int world_size;
+
+	starpu_srand48((long int)time(NULL));
+
+	parse_args(argc, argv);
+
+	int ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	/* NOTE(review): the last argument (1) presumably asks StarPU to initialise MPI itself -- confirm */
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+
+	/* The p x q process grid must cover exactly the MPI processes */
+	STARPU_ASSERT(p*q == world_size);
+
+	starpu_cublas_init();
+
+	/*
+	 * 	Problem Init
+	 */
+
+	init_matrix(rank);
+
+	fprintf(stderr, "Rank %d: allocated (%d + %d) MB = %d MB\n", rank,
+                        (int)(allocated_memory/(1024*1024)),
+			(int)(allocated_memory_extra/(1024*1024)),
+                        (int)((allocated_memory+allocated_memory_extra)/(1024*1024)));
+
+	display_grid(rank, nblocks);
+
+	/* Copy of the original matrix, only built when -check is given */
+	TYPE *a_r = NULL;
+//	STARPU_PLU(display_data_content)(a_r, size);
+
+	/* NOTE(review): x and y are allocated below but never freed in this function,
+	 * and y is only referenced by the disabled (#if 0) check */
+	TYPE *x, *y;
+
+	if (check)
+	{
+		x = calloc(size, sizeof(TYPE));
+		STARPU_ASSERT(x);
+
+		y = calloc(size, sizeof(TYPE));
+		STARPU_ASSERT(y);
+
+		if (rank == 0)
+		{
+			unsigned ind;
+			for (ind = 0; ind < size; ind++)
+				x[ind] = (TYPE)starpu_drand48();
+		}
+
+		a_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);
+
+		if (rank == 0)
+			STARPU_PLU(display_data_content)(a_r, size);
+
+//		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
+	}
+
+	double timing = STARPU_PLU(plu_main)(nblocks, rank, world_size);
+
+	/*
+	 * 	Report performance
+	 */
+
+	if (rank == 0)
+	{
+		/* timing is divided by 1000 to be printed in ms, so it is presumably in microseconds */
+		fprintf(stderr, "Computation took: %f ms\n", timing/1000);
+
+		unsigned n = size;
+		/* 2n^3/3 floating point operations for the factorisation of an n x n matrix */
+		double flop = (2.0f*n*n*n)/3.0f;
+		fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+	}
+
+	/*
+	 *	Test Result Correctness
+	 */
+
+	if (check)
+	{
+		/*
+		 *	Compute || A - LU ||
+		 */
+
+		STARPU_PLU(compute_lu_matrix)(size, nblocks, a_r);
+
+#if 0
+		/*
+		 *	Compute || Ax - LUx ||
+		 */
+
+		unsigned ind;
+
+		y2 = calloc(size, sizeof(TYPE));
+		STARPU_ASSERT(y);
+
+		if (rank == 0)
+		{
+			for (ind = 0; ind < size; ind++)
+			{
+				y2[ind] = (TYPE)0.0;
+			}
+		}
+
+		STARPU_PLU(compute_lux)(size, x, y2, nblocks, rank);
+
+		/* Compute y2 = y2 - y */
+		CPU_AXPY(size, -1.0, y, 1, y2, 1);
+
+		TYPE err = CPU_ASUM(size, y2, 1);
+		int max = CPU_IAMAX(size, y2, 1);
+
+		fprintf(stderr, "(A - LU)X Avg error : %e\n", err/(size*size));
+		fprintf(stderr, "(A - LU)X Max error : %e\n", y2[max]);
+#endif
+	}
+
+	/*
+	 * 	Termination
+	 */
+
+	starpu_cublas_shutdown();
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	return 0;
+}

+ 19 - 0
mpi/examples/mpi_lu/plu_implicit_example_double.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2013  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-double.h"
+#include "plu_implicit_example.c"

+ 19 - 0
mpi/examples/mpi_lu/plu_implicit_example_float.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2013  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-float.h"
+#include "plu_implicit_example.c"

+ 381 - 0
mpi/examples/mpi_lu/plu_outofcore_example.c

@@ -0,0 +1,381 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include <math.h>
+#include <starpu.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+
+#include "pxlu.h"
+//#include "pxlu_kernels.h"
+
+#ifdef STARPU_HAVE_LIBNUMA
+#include <numaif.h>
+#endif
+
+static unsigned long size = 4096;
+static unsigned nblocks = 16;
+static unsigned check = 0;
+static int p = 1;
+static int q = 1;
+static unsigned display = 0;
+static char *path = "/tmp/starpu-mpi_LU";
+
+#ifdef STARPU_HAVE_LIBNUMA
+static unsigned numa = 0;
+#endif
+
+static size_t allocated_memory = 0;
+
+static starpu_data_handle_t *dataA_handles;
+
+int get_block_rank(unsigned i, unsigned j);
+
+/*
+ * Parse the command line into the file-scope settings (size, nblocks,
+ * check, display, numa, p, q, path).
+ *
+ * Recognised options:
+ *   -size n, -nblocks b, -p p, -q q   numeric options taking one value
+ *   -path PATH                        directory holding the block files
+ *   -check, -display, -numa           flags
+ *   -h, -help, --help                 print the usage on stderr and exit(0)
+ *
+ * Unknown arguments are ignored.  A value option given as the very last
+ * argument is reported and terminates the program with status 1 instead of
+ * reading past the end of argv.
+ */
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++) {
+		const char *opt = argv[i];
+		int takes_value = strcmp(opt, "-size") == 0
+			|| strcmp(opt, "-nblocks") == 0
+			|| strcmp(opt, "-p") == 0
+			|| strcmp(opt, "-q") == 0
+			|| strcmp(opt, "-path") == 0;
+
+		/* argv[argc] is NULL: never hand it to strtol() or keep it as path */
+		if (takes_value && i + 1 >= argc) {
+			fprintf(stderr, "%s: option %s requires a value\n", argv[0], opt);
+			exit(1);
+		}
+
+		if (strcmp(opt, "-size") == 0) {
+			size = strtol(argv[++i], NULL, 10);
+		}
+		else if (strcmp(opt, "-nblocks") == 0) {
+			nblocks = strtol(argv[++i], NULL, 10);
+		}
+		else if (strcmp(opt, "-check") == 0) {
+			check = 1;
+		}
+		else if (strcmp(opt, "-display") == 0) {
+			display = 1;
+		}
+		else if (strcmp(opt, "-numa") == 0) {
+#ifdef STARPU_HAVE_LIBNUMA
+			numa = 1;
+#else
+			/* No rank is available in this function, so every process warns */
+			fprintf(stderr, "Warning: libnuma is not available\n");
+#endif
+		}
+		else if (strcmp(opt, "-p") == 0) {
+			p = strtol(argv[++i], NULL, 10);
+		}
+		else if (strcmp(opt, "-q") == 0) {
+			q = strtol(argv[++i], NULL, 10);
+		}
+		else if (strcmp(opt, "-path") == 0) {
+			path = argv[++i];
+		}
+		else if (strcmp(opt, "-h") == 0 || strcmp(opt, "-help") == 0 || strcmp(opt, "--help") == 0) {
+			fprintf(stderr,"usage: %s [-size n] [-nblocks b] [-check] [-display] [-numa] [-p p] [-q q] [-path PATH]\n", argv[0]);
+			fprintf(stderr,"\np * q must be equal to the number of MPI nodes\n");
+			exit(0);
+		}
+	}
+}
+
+/* Return the file-scope display flag (set to 1 by the -display option). */
+unsigned STARPU_PLU(display_flag)(void)
+{
+	return display;
+}
+
+/*
+ * Fill one square block with values drawn from starpu_drand48().
+ * The block has (psize/pnblocks) x (psize/pnblocks) elements, stored
+ * contiguously line after line starting at blockptr.
+ */
+static void fill_block_with_random(TYPE *blockptr, unsigned psize, unsigned pnblocks)
+{
+	const unsigned dim = psize / pnblocks;
+	unsigned row, col;
+
+	for (row = 0; row < dim; row++)
+	{
+		TYPE *line = &blockptr[row*dim];
+
+		for (col = 0; col < dim; col++)
+			line[col] = (TYPE)starpu_drand48();
+	}
+}
+
+/*
+ * Generate the whole matrix on disk, one file per block.
+ *
+ * Each of the nblocks x nblocks blocks is filled with random values (the
+ * diagonal of diagonal blocks is increased by 10*nblocks) and written to
+ * the file "<path>/<i>,<j>".  The number of bytes written is added to
+ * allocated_memory.  Any allocation or I/O failure terminates the program
+ * with status 1.
+ */
+static void create_matrix(void)
+{
+	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
+	TYPE *blockptr = malloc(blocksize);
+	int fd;
+	char *filename;
+	/* path + '/' + i + ',' + j + NUL; 3 characters per byte bound the decimal width */
+	unsigned filename_length = strlen(path) + 1 + sizeof(nblocks)*3 + 1 + sizeof(nblocks)*3 + 1;
+
+	filename = malloc(filename_length);
+	if (!blockptr || !filename) {
+		perror("malloc");
+		exit(1);
+	}
+
+	/* One block of blocksize bytes per grid cell ends up on disk */
+	allocated_memory += (size_t)nblocks*nblocks*blocksize;
+
+	/* Create the whole matrix on the disk */
+	unsigned i,j;
+	for (j = 0; j < nblocks; j++)
+	{
+		for (i = 0; i < nblocks; i++)
+		{
+			fill_block_with_random(blockptr, size, nblocks);
+			if (i == j)
+			{
+				/* Boost the diagonal entries of diagonal blocks
+				 * (stride of one line plus one element) */
+				unsigned tmp;
+				for (tmp = 0; tmp < size/nblocks; tmp++)
+				{
+					blockptr[tmp*((size/nblocks)+1)] += (TYPE)10*nblocks;
+				}
+			}
+			snprintf(filename, filename_length, "%s/%u,%u", path, i, j);
+			fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, 0777);
+			if (fd < 0) {
+				perror("open");
+				exit(1);
+			}
+			if (write(fd, blockptr, blocksize) != (ssize_t) blocksize) {
+				fprintf(stderr,"short write");
+				exit(1);
+			}
+			if (close(fd) < 0) {
+				perror("close");
+				exit(1);
+			}
+		}
+	}
+
+	free(blockptr);
+	free(filename);
+}
+
+/*
+ * Register every block of the matrix with StarPU for this process.
+ *
+ * `path` is registered as a StarPU disk node.  Blocks whose
+ * get_block_rank() equals `rank` are opened on that disk under the name
+ * "<i>,<j>" and registered on the disk node; all other blocks are
+ * registered with node -1 and a null pointer.  Every handle is then given
+ * its owner rank and the tag j + i*nblocks.
+ *
+ * NOTE(review): the results of calloc() and starpu_disk_register() are not
+ * checked.
+ */
+static void init_matrix(int rank)
+{
+	/* Allocate a grid of data handles, not all of them have to be allocated later on */
+	dataA_handles = calloc(nblocks*nblocks, sizeof(starpu_data_handle_t));
+
+	/* Size in bytes of one square block of (size/nblocks) elements per side */
+	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
+
+	/* Disk size passed here: the full matrix size, but at least 1 MB */
+	int disk_node = starpu_disk_register(&starpu_disk_unistd_ops, path, STARPU_MAX(1024*1024, size*size*sizeof(TYPE)));
+
+	/* Room for "<i>,<j>" plus NUL; 3 characters per byte bound the decimal width */
+	char filename[sizeof(nblocks)*3 + 1 + sizeof(nblocks)*3 + 1];
+
+	/* Allocate all the blocks that belong to this mpi node */
+	unsigned i,j;
+	for (j = 0; j < nblocks; j++)
+	{
+		for (i = 0; i < nblocks; i++)
+		{
+			int block_rank = get_block_rank(i, j);
+			starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i];
+
+			if (block_rank == rank)
+			{
+				void *disk_obj;
+				snprintf(filename, sizeof(filename), "%u,%u", i, j);
+				/* Register it to StarPU */
+				disk_obj = starpu_disk_open(disk_node, filename, blocksize);
+				if (!disk_obj) {
+					fprintf(stderr,"could not open %s\n", filename);
+					exit(1);
+				}
+				starpu_matrix_data_register(handleptr, disk_node,
+					(uintptr_t) disk_obj, size/nblocks,
+					size/nblocks, size/nblocks, sizeof(TYPE));
+			}
+			else {
+				/* Block owned by another rank: register a handle with no local buffer */
+				starpu_matrix_data_register(handleptr, -1,
+					0, size/nblocks,
+					size/nblocks, size/nblocks, sizeof(TYPE));
+			}
+			/* Tell StarPU-MPI who owns the block and which tag identifies it */
+			starpu_data_set_rank(*handleptr, block_rank);
+			starpu_data_set_tag(*handleptr, j+i*nblocks);
+		}
+	}
+
+	//display_all_blocks(nblocks, size/nblocks);
+}
+
+/*
+ * Block pointer accessor.  Blocks are registered from disk objects in this
+ * example, so there is no in-memory pointer to hand out: calling this
+ * function is a programming error and trips an assertion.
+ *
+ * Returns NULL if assertions are compiled out (NDEBUG), so that the
+ * function never falls off its end without a return value.
+ */
+TYPE *STARPU_PLU(get_block)(unsigned i, unsigned j)
+{
+	/* This does not really make sense in out of core */
+	(void)i;
+	(void)j;
+	assert(0);
+	return NULL;
+}
+
+/*
+ * Owner rank of block (i, j) on the p x q process grid.
+ * Uses a 2D block cyclic distribution: p is the period along i,
+ * q is the period along j.
+ */
+int get_block_rank(unsigned i, unsigned j)
+{
+	const unsigned pos_i = i % p;
+	const unsigned pos_j = j % q;
+
+	return pos_j * p + pos_i;
+}
+
+/* Return the StarPU handle stored in dataA_handles for block (i, j),
+ * i.e. entry j + i*nblocks. */
+starpu_data_handle_t STARPU_PLU(get_block_handle)(unsigned i, unsigned j)
+{
+	return dataA_handles[j+i*nblocks];
+}
+
+/*
+ * Entry point of the out-of-core MPI LU example.
+ *
+ * Steps: parse the options, create the `path` directory if needed,
+ * initialise StarPU and StarPU-MPI, check that p*q matches the number of
+ * MPI processes, let rank 0 write the matrix to disk, register the blocks,
+ * run the factorisation through STARPU_PLU(plu_main), report the timing on
+ * rank 0, optionally check the result, then unregister every handle and
+ * shut everything down.
+ */
+int main(int argc, char **argv)
+{
+	int rank;
+	int world_size;
+	int ret;
+	unsigned i, j;
+
+	starpu_srand48((long int)time(NULL));
+
+	parse_args(argc, argv);
+
+	/* An already existing directory is fine; any other mkdir() failure is fatal */
+	ret = mkdir(path, 0777);
+	if (ret != 0 && errno != EEXIST) {
+		fprintf(stderr,"%s does not exist\n", path);
+		exit(1);
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	/* NOTE(review): the last argument (1) presumably asks StarPU to initialise MPI itself -- confirm */
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+
+	/* The p x q process grid must cover exactly the MPI processes */
+	STARPU_ASSERT(p*q == world_size);
+
+	starpu_cublas_init();
+
+	/*
+	 * 	Problem Init
+	 */
+
+	/* Only rank 0 writes the block files; the barrier makes the others wait for it.
+	 * NOTE(review): this presumably requires `path` to be visible from every rank -- confirm */
+	if (rank == 0)
+		create_matrix();
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+
+	init_matrix(rank);
+
+	if (rank == 0)
+		fprintf(stderr, "%dMB on disk\n", (int)(allocated_memory/(1024*1024)));
+
+	/* Copy of the original matrix, only built when -check is given */
+	TYPE *a_r = NULL;
+//	STARPU_PLU(display_data_content)(a_r, size);
+
+	/* NOTE(review): x and y are allocated below but never freed in this function,
+	 * and y is only referenced by the disabled (#if 0) check */
+	TYPE *x, *y;
+
+	if (check)
+	{
+		x = calloc(size, sizeof(TYPE));
+		STARPU_ASSERT(x);
+
+		y = calloc(size, sizeof(TYPE));
+		STARPU_ASSERT(y);
+
+		if (rank == 0)
+		{
+			unsigned ind;
+			for (ind = 0; ind < size; ind++)
+				x[ind] = (TYPE)starpu_drand48();
+		}
+
+		a_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);
+
+		if (rank == 0)
+			STARPU_PLU(display_data_content)(a_r, size);
+
+//		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
+	}
+
+	double timing = STARPU_PLU(plu_main)(nblocks, rank, world_size);
+
+	/*
+	 * 	Report performance
+	 */
+
+	if (rank == 0)
+	{
+		/* timing is divided by 1000 to be printed in ms, so it is presumably in microseconds */
+		fprintf(stderr, "Computation took: %f ms\n", timing/1000);
+
+		unsigned n = size;
+		/* 2n^3/3 floating point operations for the factorisation of an n x n matrix */
+		double flop = (2.0f*n*n*n)/3.0f;
+		fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+	}
+
+	/*
+	 *	Test Result Correctness
+	 */
+
+	if (check)
+	{
+		/*
+		 *	Compute || A - LU ||
+		 */
+
+		STARPU_PLU(compute_lu_matrix)(size, nblocks, a_r);
+
+#if 0
+		/*
+		 *	Compute || Ax - LUx ||
+		 */
+
+		unsigned ind;
+
+		y2 = calloc(size, sizeof(TYPE));
+		STARPU_ASSERT(y);
+
+		if (rank == 0)
+		{
+			for (ind = 0; ind < size; ind++)
+			{
+				y2[ind] = (TYPE)0.0;
+			}
+		}
+
+		STARPU_PLU(compute_lux)(size, x, y2, nblocks, rank);
+
+		/* Compute y2 = y2 - y */
+		CPU_AXPY(size, -1.0, y, 1, y2, 1);
+
+		TYPE err = CPU_ASUM(size, y2, 1);
+		int max = CPU_IAMAX(size, y2, 1);
+
+		fprintf(stderr, "(A - LU)X Avg error : %e\n", err/(size*size));
+		fprintf(stderr, "(A - LU)X Max error : %e\n", y2[max]);
+#endif
+	}
+
+	/*
+	 * 	Termination
+	 */
+	/* Release every handle registered by init_matrix(), local or not */
+	for (j = 0; j < nblocks; j++)
+	{
+		for (i = 0; i < nblocks; i++)
+		{
+			starpu_data_unregister(dataA_handles[j+nblocks*i]);
+		}
+	}
+
+	starpu_cublas_shutdown();
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	return 0;
+}

+ 19 - 0
mpi/examples/mpi_lu/plu_outofcore_example_double.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2013  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-double.h"
+#include "plu_outofcore_example.c"

+ 19 - 0
mpi/examples/mpi_lu/plu_outofcore_example_float.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2013  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-float.h"
+#include "plu_outofcore_example.c"

+ 19 - 0
mpi/examples/mpi_lu/pslu_implicit.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2013  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-float.h"
+#include "pxlu_implicit.c"

+ 162 - 0
mpi/examples/mpi_lu/pxlu_implicit.c

@@ -0,0 +1,162 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011, 2013  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "pxlu.h"
+#include "pxlu_kernels.h"
+#include <sys/time.h>
+
+//#define VERBOSE_INIT	1
+
+//#define DEBUG	1
+
+static unsigned no_prio = 0;
+
+static unsigned nblocks = 0;
+static int rank = -1;
+static int world_size = -1;
+
+/* Block coordinates (i, j) and step k of a task.
+ * NOTE(review): not referenced by any function visible in this file. */
+struct callback_arg {
+	unsigned i, j, k;
+};
+
+/*
+ *	Task 11 (diagonal factorization)
+ */
+
+/*
+ * Submit the cl11 task for step k: diagonal block (k, k) is accessed in
+ * STARPU_RW mode.  k is passed three times as STARPU_VALUE arguments
+ * (presumably the i, j and k values the codelet unpacks -- confirm against
+ * the kernels).  The task gets STARPU_MAX_PRIO unless no_prio is set.
+ */
+static void create_task_11(unsigned k)
+{
+	starpu_mpi_insert_task(MPI_COMM_WORLD,
+			&STARPU_PLU(cl11),
+			STARPU_VALUE, &k, sizeof(k),
+			STARPU_VALUE, &k, sizeof(k),
+			STARPU_VALUE, &k, sizeof(k),
+			STARPU_RW, STARPU_PLU(get_block_handle)(k, k),
+			STARPU_PRIORITY, !no_prio ?
+				STARPU_MAX_PRIO : STARPU_MIN_PRIO,
+			0);
+}
+
+/*
+ *	Task 12 (Update lower left (TRSM))
+ */
+
+/*
+ * Submit the update of block (k, j) for step k: block (k, k) is read and
+ * block (k, j) is accessed in STARPU_RW mode.  As a temporary fix (see the
+ * #warning below) the cl21 codelet is used here instead of cl12.
+ * The task gets STARPU_MAX_PRIO only when priorities are enabled and
+ * j == k+1, STARPU_MIN_PRIO otherwise.
+ */
+static void create_task_12(unsigned k, unsigned j)
+{
+#warning temporary fix 
+	starpu_mpi_insert_task(MPI_COMM_WORLD,
+			//&STARPU_PLU(cl12),
+			&STARPU_PLU(cl21),
+			STARPU_VALUE, &j, sizeof(j),
+			STARPU_VALUE, &j, sizeof(j),
+			STARPU_VALUE, &k, sizeof(k),
+			STARPU_R, STARPU_PLU(get_block_handle)(k, k),
+			STARPU_RW, STARPU_PLU(get_block_handle)(k, j),
+			STARPU_PRIORITY, !no_prio && (j == k+1) ?
+				STARPU_MAX_PRIO : STARPU_MIN_PRIO,
+			0);
+}
+
+/*
+ *	Task 21 (Update upper right (TRSM))
+ */
+
+/*
+ * Submit the update of block (i, k) for step k: block (k, k) is read and
+ * block (i, k) is accessed in STARPU_RW mode.  As a temporary fix (see the
+ * #warning below) the cl12 codelet is used here instead of cl21.
+ * The task gets STARPU_MAX_PRIO only when priorities are enabled and
+ * i == k+1, STARPU_MIN_PRIO otherwise.
+ */
+static void create_task_21(unsigned k, unsigned i)
+{
+#warning temporary fix 
+	starpu_mpi_insert_task(MPI_COMM_WORLD,
+			//&STARPU_PLU(cl21),
+			&STARPU_PLU(cl12),
+			STARPU_VALUE, &i, sizeof(i),
+			STARPU_VALUE, &i, sizeof(i),
+			STARPU_VALUE, &k, sizeof(k),
+			STARPU_R, STARPU_PLU(get_block_handle)(k, k),
+			STARPU_RW, STARPU_PLU(get_block_handle)(i, k),
+			STARPU_PRIORITY, !no_prio && (i == k+1) ?
+				STARPU_MAX_PRIO : STARPU_MIN_PRIO,
+			0);
+}
+
+/*
+ *	Task 22 (GEMM)
+ */
+
+/*
+ * Submit the cl22 task for step k on block (i, j): blocks (k, j) and (i, k)
+ * are read and block (i, j) is accessed in STARPU_RW mode.  i, j and k are
+ * passed by value to the codelet.  The task gets STARPU_MAX_PRIO only when
+ * priorities are enabled and both i and j equal k+1, STARPU_MIN_PRIO
+ * otherwise.
+ */
+static void create_task_22(unsigned k, unsigned i, unsigned j)
+{
+	starpu_mpi_insert_task(MPI_COMM_WORLD,
+			&STARPU_PLU(cl22),
+			STARPU_VALUE, &i, sizeof(i),
+			STARPU_VALUE, &j, sizeof(j),
+			STARPU_VALUE, &k, sizeof(k),
+			STARPU_R, STARPU_PLU(get_block_handle)(k, j),
+			STARPU_R, STARPU_PLU(get_block_handle)(i, k),
+			STARPU_RW, STARPU_PLU(get_block_handle)(i, j),
+			STARPU_PRIORITY, !no_prio && (i == k + 1) && (j == k +1) ?
+				STARPU_MAX_PRIO : STARPU_MIN_PRIO,
+			0);
+}
+
+/*
+ *	code to bootstrap the factorization 
+ */
+
+/*
+ * Submit the whole LU task graph and wait for its completion.
+ *
+ * Stores the run parameters in the file-scope variables, then, between two
+ * MPI barriers, submits for every step the 11 task, the 12/21 tasks of the
+ * remaining blocks and the 22 tasks of the trailing sub-matrix, and waits
+ * for all tasks.
+ *
+ * Returns the time elapsed between the two gettimeofday() calls, in
+ * microseconds.
+ */
+double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
+{
+	struct timeval t_begin;
+	struct timeval t_end;
+	unsigned step, row, col;
+
+	/* Remember the run parameters in the file-scope variables */
+	world_size = _world_size;
+	rank = _rank;
+	nblocks = _nblocks;
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+
+	gettimeofday(&t_begin, NULL);
+
+	/* create all the DAG nodes */
+	for (step = 0; step < nblocks; step++)
+	{
+		create_task_11(step);
+
+		/* Updates of the remaining blocks that involve diagonal block (step, step) */
+		for (row = step+1; row < nblocks; row++)
+		{
+			create_task_12(step, row);
+			create_task_21(step, row);
+		}
+
+		/* Updates of the trailing sub-matrix */
+		for (row = step+1; row < nblocks; row++)
+			for (col = step+1; col < nblocks; col++)
+				create_task_22(step, row, col);
+	}
+
+	starpu_task_wait_for_all();
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+
+	gettimeofday(&t_end, NULL);
+
+	/* Seconds scaled to microseconds plus the microsecond remainder */
+	return (double)((t_end.tv_sec - t_begin.tv_sec)*1000000 + (t_end.tv_usec - t_begin.tv_usec));
+}

+ 0 - 1
mpi/src/starpu_mpi_collective.c

@@ -57,7 +57,6 @@ int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, i
 		callback_arg->nb = 0;
 		callback_arg->callback = (rank == root) ? scallback : rcallback;
 		callback_arg->arg = (rank == root) ? sarg : rarg;
-		if (callback_arg->callback == NULL)
 
 		for(x = 0; x < count ; x++)
 		{

+ 3 - 0
mpi/tests/Makefile.am

@@ -198,5 +198,8 @@ mpi_reduction_SOURCES += mpi_reduction_kernels.c
 user_defined_datatype_SOURCES = user_defined_datatype.c
 user_defined_datatype_SOURCES += $(top_srcdir)/examples/interface/complex_interface.c
 
+mpi_earlyrecv2_SOURCES = mpi_earlyrecv2.c
+mpi_earlyrecv2_SOURCES += $(top_srcdir)/examples/interface/complex_interface.c
+
 showcheck:
 	-cat $(TEST_LOGS) /dev/null

+ 159 - 41
mpi/tests/mpi_earlyrecv2.c

@@ -18,81 +18,199 @@
 #include <starpu_mpi.h>
 #include "helper.h"
 #include <unistd.h>
+#include <interface/complex_interface.h>
 
-//#define NB 1000
 #define NB 10
 
-int main(int argc, char **argv)
-{
-	int ret, rank, size, i;
-	starpu_data_handle_t tab_handle[NB];
-	int value[NB];
+static starpu_pthread_mutex_t mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
+static starpu_pthread_cond_t cond = STARPU_PTHREAD_COND_INITIALIZER;
 
-	MPI_Init(NULL, NULL);
-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-	MPI_Comm_size(MPI_COMM_WORLD, &size);
-
-	if (size%2 != 0)
-	{
-		if (rank == 0)
-			FPRINTF(stderr, "We need a even number of processes.\n");
+/*
+ * Completion callback for the detached receives: arg points to the int
+ * counter owned by exchange(). The counter is incremented and the waiter
+ * signalled while holding the mutex.
+ */
+void callback(void *arg)
+{
+	/* exchange() passes the address of an int, and the value is printed
+	 * with %d, so access it as an int. */
+	int *received = arg;
 
-		MPI_Finalize();
-		return STARPU_TEST_SKIPPED;
-	}
+	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	*received = *received + 1;
+	FPRINTF_MPI("Requests %d received\n", *received);
+	STARPU_PTHREAD_COND_SIGNAL(&cond);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+}
 
-	ret = starpu_init(NULL);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(NULL, NULL, 0);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
-
-	for(i=0 ; i<NB ; i++)
-	{
-		value[i]=i*rank;
-		starpu_variable_data_register(&tab_handle[i], STARPU_MAIN_RAM, (uintptr_t)&value[i], sizeof(int));
-		starpu_data_set_tag(tab_handle[i], i);
-	}
+typedef void (*check_func)(starpu_data_handle_t handle, int i, int rank, int *error);
 
+/*
+ * Exchange the NB handles with the partner rank (rank+1 for even ranks,
+ * rank-1 for odd ranks), using the handle index as the message tag.
+ *
+ * Odd ranks send handles 0 and NB-1 first, then the handles in between,
+ * and return 0.
+ * Even ranks post the receive for handle 0, sleep, post the remaining
+ * receives, then wait for all of them. With detached != 0 the receives
+ * are detached and completion is counted by callback(); otherwise each
+ * request is waited for and func is called on the corresponding handle.
+ *
+ * Returns the error flag filled in by func (0 if func never set it).
+ *
+ * NOTE(review): func is not called on the detached path, so the received
+ * values are not checked there and ret stays 0 -- confirm this is intended.
+ */
+int exchange(int rank, starpu_data_handle_t *handles, check_func func, int detached)
+{
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
+	int i;
 
 	if (rank%2)
 	{
-		starpu_mpi_send(tab_handle[0], other_rank, 0, MPI_COMM_WORLD);
-		starpu_mpi_send(tab_handle[NB-1], other_rank, NB-1, MPI_COMM_WORLD);
+		/* the first and last handles go out before the ones in between */
+		starpu_mpi_send(handles[0], other_rank, 0, MPI_COMM_WORLD);
+		starpu_mpi_send(handles[NB-1], other_rank, NB-1, MPI_COMM_WORLD);
 		for(i=1 ; i<NB-1 ; i++)
 		{
-			starpu_mpi_send(tab_handle[i], other_rank, i, MPI_COMM_WORLD);
+			starpu_mpi_send(handles[i], other_rank, i, MPI_COMM_WORLD);
 		}
+		return 0;
 	}
 	else
 	{
+		int ret=0;
 		starpu_mpi_req req[NB];
-		memset(req, 0, NB*sizeof(starpu_mpi_req));
+		int received = 0;
+
+		/* only the receive for handle 0 is posted before sleeping */
+		if (detached)
+		{
+			starpu_mpi_irecv_detached(handles[0], other_rank, 0, MPI_COMM_WORLD, callback, &received);
+		}
+		else
+		{
+			memset(req, 0, NB*sizeof(starpu_mpi_req));
+			starpu_mpi_irecv(handles[0], &req[0], other_rank, 0, MPI_COMM_WORLD);
+			STARPU_ASSERT(req[0] != NULL);
+		}
 
-		starpu_mpi_irecv(tab_handle[0], &req[0], other_rank, 0, MPI_COMM_WORLD);
-		STARPU_ASSERT(req[0] != NULL);
 		// We sleep to make sure that the data for the tag 9 will be received before the recv is posted
 		usleep(2000000);
 		for(i=1 ; i<NB ; i++)
 		{
-			starpu_mpi_irecv(tab_handle[i], &req[i], other_rank, i, MPI_COMM_WORLD);
-			STARPU_ASSERT(req[i] != NULL);
+			if (detached)
+			{
+				starpu_mpi_irecv_detached(handles[i], other_rank, i, MPI_COMM_WORLD, callback, &received);
+			}
+			else
+			{
+				starpu_mpi_irecv(handles[i], &req[i], other_rank, i, MPI_COMM_WORLD);
+				STARPU_ASSERT(req[i] != NULL);
+			}
+		}
+
+		if (detached)
+		{
+			/* block until callback() has counted all NB receives */
+			STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+			while (received != NB)
+			{
+			     FPRINTF_MPI("Received %d messages\n", received);
+			     STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
+			}
+			STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 		}
-		for(i=0 ; i<NB ; i++)
+		else
 		{
-			starpu_mpi_wait(&req[i], NULL);
-			int *rvalue = (int *)starpu_data_get_local_ptr(tab_handle[i]);
-			STARPU_ASSERT_MSG(*rvalue==i*other_rank, "Incorrect received value: %d != %d\n", *rvalue, i*other_rank);
+			/* wait for each request in index order, then check its handle */
+			for(i=0 ; i<NB ; i++)
+			{
+			     starpu_mpi_wait(&req[i], NULL);
+			     func(handles[i], i, rank, &ret);
+			}
 		}
+		return ret;
 	}
+}
+
+/*
+ * Check the int stored in handle against i * partner, where the partner
+ * is rank+1 for an even rank and rank-1 for an odd one. On a mismatch a
+ * message is printed and *error is set to 1; otherwise *error is left
+ * untouched.
+ */
+void check_variable(starpu_data_handle_t handle, int i, int rank, int *error)
+{
+	int peer;
+	int *got;
+
+	if (rank%2 == 0)
+		peer = rank+1;
+	else
+		peer = rank-1;
+
+	got = (int *)starpu_data_get_local_ptr(handle);
+	if (*got == i*peer)
+		return;
+
+	FPRINTF_MPI("Incorrect received value: %d != %d\n", *got, i*peer);
+	*error = 1;
+}
+
+/*
+ * Register NB int variables holding i*rank, tag each handle with its
+ * index, run exchange() on them with check_variable as the checker, then
+ * unregister every handle.
+ *
+ * Returns the value returned by exchange().
+ */
+int exchange_variable(int rank, int detached)
+{
+	int ret, i;
+	starpu_data_handle_t tab_handle[NB];
+	int value[NB];
 
+	FPRINTF_MPI("Exchanging variable data with detached=%d\n", detached);
+
+	for(i=0 ; i<NB ; i++)
+	{
+		/* the value the partner's check_variable() expects: index * sender rank */
+		value[i]=i*rank;
+		starpu_variable_data_register(&tab_handle[i], STARPU_MAIN_RAM, (uintptr_t)&value[i], sizeof(int));
+		starpu_data_set_tag(tab_handle[i], i);
+	}
+	ret = exchange(rank, tab_handle, check_variable, detached);
 	for(i=0 ; i<NB ; i++)
 		starpu_data_unregister(tab_handle[i]);
 
+	return ret;
+}
+
+/*
+ * Check the complex value stored in handle: the real part must equal
+ * i*partner + 12 and the imaginary part i*partner + 45, where the partner
+ * is rank+1 for an even rank and rank-1 for an odd one. On a mismatch a
+ * message is printed and *error is set to 1.
+ */
+void check_complex(starpu_data_handle_t handle, int i, int rank, int *error)
+{
+	double *re = starpu_complex_get_real(handle);
+	double *im = starpu_complex_get_imaginary(handle);
+	int peer;
+
+	if (rank%2 == 0)
+		peer = rank+1;
+	else
+		peer = rank-1;
+
+	if ((*re == ((i*peer)+12)) && (*im == ((i*peer)+45)))
+		return;
+
+	FPRINTF_MPI("Incorrect received value: %f != %d || %f != %d\n", *re, ((i*peer)+12), *im, ((i*peer)+45));
+	*error = 1;
+}
+
+/*
+ * Register NB complex values (real part idx*rank + 12, imaginary part
+ * idx*rank + 45), tag each handle with its index, run exchange() on them
+ * with check_complex as the checker, then unregister every handle.
+ *
+ * Returns the value returned by exchange().
+ */
+int exchange_complex(int rank, int detached)
+{
+	starpu_data_handle_t handle[NB];
+	double real[NB];
+	double imaginary[NB];
+	int status;
+	int idx;
+
+	FPRINTF_MPI("Exchanging complex data with detached=%d\n", detached);
+
+	idx = 0;
+	while (idx < NB)
+	{
+		real[idx] = (idx*rank)+12;
+		imaginary[idx] = (idx*rank)+45;
+		starpu_complex_data_register(&handle[idx], STARPU_MAIN_RAM, &real[idx], &imaginary[idx], 1);
+		starpu_data_set_tag(handle[idx], idx);
+		idx++;
+	}
+
+	status = exchange(rank, handle, check_complex, detached);
+
+	idx = 0;
+	while (idx < NB)
+	{
+		starpu_data_unregister(handle[idx]);
+		idx++;
+	}
+
+	return status;
+}
+
+/*
+ * Test driver: initialise MPI, StarPU and StarPU-MPI, then run the
+ * variable and complex exchanges, each first with detached=0 and then
+ * with detached=1, stopping at the first exchange that returns non-zero.
+ *
+ * Returns STARPU_TEST_SKIPPED when the number of processes is odd,
+ * otherwise the result of the last exchange that was run.
+ */
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(NULL, NULL);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	/* exchange() pairs each even rank with the next odd rank */
+	if (size%2 != 0)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need a even number of processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	/* each later exchange runs only if the previous one returned 0 */
+	ret = exchange_variable(rank, 0);
+	if (ret == 0)
+		ret = exchange_variable(rank, 1);
+	if (ret == 0)
+		ret = exchange_complex(rank, 0);
+	if (ret == 0)
+		ret = exchange_complex(rank, 1);
+
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 
 	MPI_Finalize();
 
-	return 0;
+	return ret;
 }