Преглед на файлове

nmad/: Port starpu-mpi-1.1 into a new folder nmad and build it with --enable-nmad
Add functions from starpu-mpi 1.3 (task_insert, select_node) to match the interface.

Guillaume Beauchamp преди 8 години
родител
ревизия
c263463c3c
променени са 100 файла, в които са добавени 13119 реда и са изтрити 1 реда
  1. 4 0
      Makefile.am
  2. 161 1
      configure.ac
  3. 5 0
      examples/stencil/Makefile.am
  4. 1 0
      nmad/.gitignore
  5. 31 0
      nmad/Makefile.am
  6. 193 0
      nmad/examples/Makefile.am
  7. 102 0
      nmad/examples/complex/mpi_complex.c
  8. 72 0
      nmad/examples/matrix_decomposition/mpi_cholesky.c
  9. 260 0
      nmad/examples/matrix_decomposition/mpi_cholesky_codelets.c
  10. 30 0
      nmad/examples/matrix_decomposition/mpi_cholesky_codelets.h
  11. 64 0
      nmad/examples/matrix_decomposition/mpi_cholesky_distributed.c
  12. 247 0
      nmad/examples/matrix_decomposition/mpi_cholesky_kernels.c
  13. 33 0
      nmad/examples/matrix_decomposition/mpi_cholesky_kernels.h
  14. 40 0
      nmad/examples/matrix_decomposition/mpi_cholesky_models.c
  15. 27 0
      nmad/examples/matrix_decomposition/mpi_cholesky_models.h
  16. 110 0
      nmad/examples/matrix_decomposition/mpi_decomposition_matrix.c
  17. 30 0
      nmad/examples/matrix_decomposition/mpi_decomposition_matrix.h
  18. 100 0
      nmad/examples/matrix_decomposition/mpi_decomposition_params.c
  19. 34 0
      nmad/examples/matrix_decomposition/mpi_decomposition_params.h
  20. 42 0
      nmad/examples/mpi_lu/mpi_lu-double.h
  21. 42 0
      nmad/examples/mpi_lu/mpi_lu-float.h
  22. 19 0
      nmad/examples/mpi_lu/pdlu.c
  23. 19 0
      nmad/examples/mpi_lu/pdlu_kernels.c
  24. 581 0
      nmad/examples/mpi_lu/plu_example.c
  25. 19 0
      nmad/examples/mpi_lu/plu_example_double.c
  26. 19 0
      nmad/examples/mpi_lu/plu_example_float.c
  27. 393 0
      nmad/examples/mpi_lu/plu_solve.c
  28. 19 0
      nmad/examples/mpi_lu/plu_solve_double.c
  29. 19 0
      nmad/examples/mpi_lu/plu_solve_float.c
  30. 19 0
      nmad/examples/mpi_lu/pslu.c
  31. 19 0
      nmad/examples/mpi_lu/pslu_kernels.c
  32. 870 0
      nmad/examples/mpi_lu/pxlu.c
  33. 68 0
      nmad/examples/mpi_lu/pxlu.h
  34. 442 0
      nmad/examples/mpi_lu/pxlu_kernels.c
  35. 32 0
      nmad/examples/mpi_lu/pxlu_kernels.h
  36. 19 0
      nmad/examples/mpi_lu/slu_kernels.c
  37. 106 0
      nmad/examples/perf.sh
  38. 258 0
      nmad/examples/stencil/stencil5.c
  39. 134 0
      nmad/include/starpu_mpi.h
  40. 29 0
      nmad/libstarpumpi.pc.in
  41. 58 0
      nmad/src/Makefile.am
  42. 1253 0
      nmad/src/starpu_mpi.c
  43. 292 0
      nmad/src/starpu_mpi_cache.c
  44. 55 0
      nmad/src/starpu_mpi_cache.h
  45. 69 0
      nmad/src/starpu_mpi_cache_stats.c
  46. 40 0
      nmad/src/starpu_mpi_cache_stats.h
  47. 162 0
      nmad/src/starpu_mpi_collective.c
  48. 245 0
      nmad/src/starpu_mpi_datatype.c
  49. 35 0
      nmad/src/starpu_mpi_datatype.h
  50. 116 0
      nmad/src/starpu_mpi_fxt.h
  51. 105 0
      nmad/src/starpu_mpi_helper.c
  52. 25 0
      nmad/src/starpu_mpi_private.c
  53. 173 0
      nmad/src/starpu_mpi_private.h
  54. 117 0
      nmad/src/starpu_mpi_select_node.c
  55. 36 0
      nmad/src/starpu_mpi_select_node.h
  56. 94 0
      nmad/src/starpu_mpi_stats.c
  57. 36 0
      nmad/src/starpu_mpi_stats.h
  58. 775 0
      nmad/src/starpu_mpi_task_insert.c
  59. 32 0
      nmad/src/starpu_mpi_task_insert.h
  60. 29 0
      nmad/starpumpi-1.0.pc.in
  61. 29 0
      nmad/starpumpi-1.1.pc.in
  62. 29 0
      nmad/starpumpi-1.2.pc.in
  63. 29 0
      nmad/starpumpi-1.3.pc.in
  64. 1 0
      nmad/tests/.gitignore
  65. 246 0
      nmad/tests/Makefile.am
  66. 146 0
      nmad/tests/block_interface.c
  67. 150 0
      nmad/tests/block_interface_pinned.c
  68. 110 0
      nmad/tests/cache.c
  69. 93 0
      nmad/tests/cache_disable.c
  70. 112 0
      nmad/tests/comm.c
  71. 333 0
      nmad/tests/datatypes.c
  72. 26 0
      nmad/tests/helper.h
  73. 140 0
      nmad/tests/insert_task.c
  74. 162 0
      nmad/tests/insert_task_block.c
  75. 150 0
      nmad/tests/insert_task_cache.c
  76. 142 0
      nmad/tests/insert_task_compute.c
  77. 116 0
      nmad/tests/insert_task_count.c
  78. 169 0
      nmad/tests/insert_task_owner.c
  79. 126 0
      nmad/tests/insert_task_owner2.c
  80. 107 0
      nmad/tests/insert_task_owner_data.c
  81. 144 0
      nmad/tests/insert_task_recv_cache.c
  82. 150 0
      nmad/tests/insert_task_sent_cache.c
  83. 132 0
      nmad/tests/matrix.c
  84. 140 0
      nmad/tests/matrix2.c
  85. 86 0
      nmad/tests/mpi_detached_tag.c
  86. 85 0
      nmad/tests/mpi_irecv.c
  87. 103 0
      nmad/tests/mpi_irecv_detached.c
  88. 86 0
      nmad/tests/mpi_isend.c
  89. 108 0
      nmad/tests/mpi_isend_detached.c
  90. 173 0
      nmad/tests/mpi_reduction.c
  91. 76 0
      nmad/tests/mpi_reduction_kernels.c
  92. 94 0
      nmad/tests/mpi_redux.c
  93. 191 0
      nmad/tests/mpi_scatter_gather.c
  94. 93 0
      nmad/tests/mpi_test.c
  95. 97 0
      nmad/tests/multiple_send.c
  96. 85 0
      nmad/tests/pingpong.c
  97. 133 0
      nmad/tests/ring.c
  98. 137 0
      nmad/tests/ring_async.c
  99. 131 0
      nmad/tests/ring_async_implicit.c
  100. 0 0
      nmad/tests/ring_kernel.cu

+ 4 - 0
Makefile.am

@@ -39,6 +39,10 @@ if USE_MPI
 SUBDIRS += mpi
 endif
 
+# Recurse into the New Madeleine flavour of libstarpumpi when configure was
+# run with --enable-nmad (USE_NMAD conditional defined in configure.ac).
+if USE_NMAD
+SUBDIRS += nmad
+endif
+
 if BUILD_EXAMPLES
 SUBDIRS += examples
 endif

+ 161 - 1
configure.ac

@@ -382,6 +382,157 @@ AC_ARG_ENABLE(maxmpidev, [AS_HELP_STRING([--enable-maxmpidev=<number>],
 AC_MSG_RESULT($nmaxmpidev)
 AC_DEFINE_UNQUOTED(STARPU_MAXMPIDEVS, [$nmaxmpidev], [maximum number of MPI devices])
 
+
+###############################################################################
+#                                                                             #
+#                                    New Madeleine                            #
+#                                                                             #
+###############################################################################
+
+AC_ARG_ENABLE(nmad, [AS_HELP_STRING([--enable-nmad],
+                              [Enable StarPU NMAD library generation (disable StarPU MPI)])],
+            [enable_nmad=$enableval],
+            [enable_nmad=no])
+
+
+#Check MPICC
+AC_ARG_WITH(mpicc, [AS_HELP_STRING([--with-mpicc[=<path to mpicc>]],
+           [Path of the mpicc compiler])],
+   [
+       if test x$withval = xyes; then
+           AC_MSG_ERROR(--with-mpicc must be given a pathname)
+       else
+           mpicc_path=$withval
+       fi
+   ],
+   [
+       if test x$enable_simgrid = xyes ; then
+           DEFAULT_MPICC=smpicc
+       else
+           DEFAULT_MPICC=mpicc
+       fi
+       # nothing was specified: default value is used
+       AC_PATH_PROG(mpicc_path, $DEFAULT_MPICC, [no], [$simgrid_dir/bin:$PATH])
+   ])
+
+# We test if the MPICC compiler exists
+if test ! -x $mpicc_path; then
+    #MPICC does not exists or is not executable
+    AC_MSG_RESULT(The mpicc compiler '$mpicc_path' does not have the execute permission)
+    use_nmad=no
+else
+    use_nmad=yes
+    if test x$enable_simgrid = xyes ; then
+        AC_ARG_WITH(smpirun, [AS_HELP_STRING([--with-smpirun[=<path to smpirun>]],
+                    [Path of the smpirun helper])],
+            [
+                if test x$withval = xyes; then
+                    AC_MSG_ERROR(--with-smpirun must be given a pathname)
+                else
+                    smpirun_path=$withval
+                fi
+            ],
+            [
+                # nothing was specified: default value is used
+                AC_PATH_PROG(smpirun_path, smpirun, [no], [$simgrid_dir/bin:$PATH])
+            ])
+
+    fi
+fi
+
+AC_MSG_CHECKING(mpicc path)
+AC_MSG_RESULT($mpicc_path)
+AC_SUBST(MPICC, $mpicc_path)
+
+
+#Check MPICXX/MPIC++
+AC_ARG_WITH(mpicxx, [AS_HELP_STRING([--with-mpicxx[=<path to mpicxx>]],
+           [Path of the mpicxx/mpic++ compiler])],
+   [
+       if test x$withval = xyes; then
+           AC_MSG_ERROR(--with-mpicxx must be given a pathname)
+       else
+           mpicxx_path=$withval
+       fi
+   ],
+   [
+       if test x$enable_simgrid = xyes ; then
+           DEFAULT_MPICXX=smpicxx
+       else
+           DEFAULT_MPICXX=mpicxx
+       fi
+       # nothing was specified: default value is used
+       AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
+
+       # try with mpic++ if mpicxx was not found
+       if test x$mpicxx_path = xno ; then
+            DEFAULT_MPICXX=mpic++
+            AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
+       fi
+   ])
+
+# We test if the MPICXX/MPIC++ compiler exists
+if test ! -x $mpicxx_path; then
+    #MPICXX/MPIC++ does not exists or is not executable
+    AC_MSG_RESULT(The mpicxx compiler '$mpicxx_path' does not have the execute permission)
+    use_mpicxx=no
+else
+    use_mpicxx=yes
+fi
+
+AC_MSG_CHECKING(mpicxx/mpic++ path)
+AC_MSG_RESULT($mpicxx_path)
+AC_SUBST(MPICXX, $mpicxx_path)
+
+
+# Pick the compiler used to link MPI-dependent parts.
+# NOTE(review): this tests $use_mpi, but the mpicc check above sets $use_nmad;
+# confirm which variable is intended here.
+if test x$use_mpi = xyes -a \( x$enable_nmad = xyes  \) ; then
+    cc_or_mpicc=$mpicc_path
+        # For some reason, libtool uses gcc instead of mpicc when linking
+        # libstarpumpi.
+        # On Darwin (and maybe other systems ?) the linker will fail (undefined
+        # references to MPI_*). We manually add the required flags to fix this
+        # issue.
+        AC_SUBST(MPICC_LDFLAGS, `$mpicc_path --showme:link`)
+else
+    cc_or_mpicc=$CC
+fi
+
+AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
+
+# NOTE(review): the mpi-pedantic-isend and maxmpidev option blocks below appear
+# to duplicate blocks already present earlier in configure.ac (see the hunk
+# context above); autoconf may warn about duplicate AC_ARG_ENABLE options --
+# confirm whether these copies should be dropped.
+AC_ARG_ENABLE(mpi-pedantic-isend, [AS_HELP_STRING([--enable-mpi-pedantic-isend],
+				   [Enable StarPU MPI pedantic isend])],
+				   enable_mpi_pedantic_isend=$enableval, enable_mpi_pedantic_isend=no)
+if  test x$enable_mpi_pedantic_isend = xyes; then
+	AC_DEFINE(STARPU_MPI_PEDANTIC_ISEND, [1], [enable StarPU MPI pedantic isend])
+fi
+
+AC_ARG_WITH(mpi-master-slave-multiple-thread, [AS_HELP_STRING([--with-mpi-master-slave-multiple-thread])],
+	[AC_DEFINE([STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD], [1], [Use multiple threads to communicate with slaves])])
+
+AC_MSG_CHECKING(whether the master-slave mode should be enabled)
+AC_MSG_RESULT($build_mpi_master_slave)
+AM_CONDITIONAL([STARPU_USE_MPI_MASTER_SLAVE], [test x$build_mpi_master_slave = xyes])
+
+AC_MSG_CHECKING(maximum number of MPI master-slave devices)
+AC_ARG_ENABLE(maxmpidev, [AS_HELP_STRING([--enable-maxmpidev=<number>],
+			[maximum number of MPI master-slave devices])],
+			nmaxmpidev=$enableval,
+            [
+                 nmaxmpidev=0
+            ])
+AC_MSG_RESULT($nmaxmpidev)
+AC_DEFINE_UNQUOTED(STARPU_MAXMPIDEVS, [$nmaxmpidev], [maximum number of MPI devices])
+
+# Building nmad displaces the classic mpi library: enable_mpi is forced off.
+if test x$use_mpi = xyes -a x$enable_nmad = xyes ; then
+    build_nmad_lib=yes
+    enable_mpi=no
+else
+    build_nmad_lib=no
+fi
+
+AM_CONDITIONAL(USE_NMAD, test x$build_nmad_lib = xyes)
+
+
 ###############################################################################
 #                                                                             #
 #                                LIBTOOLS                                     #
@@ -505,7 +656,7 @@ AC_MSG_RESULT($build_mpi_lib)
 
 AC_SUBST(USE_MPI, $build_mpi_lib)
 AM_CONDITIONAL(USE_MPI, test x$build_mpi_lib = xyes)
-if test x$build_mpi_lib = xyes; then
+if test x$build_mpi_lib = xyes || test x$build_nmad_lib = xyes; then
 	AC_DEFINE(STARPU_USE_MPI,[1],[whether the StarPU MPI library is available])
 else
 	running_mpi_check=no
@@ -3151,6 +3302,11 @@ AC_OUTPUT([
 	mpi/starpumpi-1.1.pc
 	mpi/starpumpi-1.2.pc
 	mpi/starpumpi-1.3.pc
+	nmad/libstarpumpi.pc
+	nmad/starpumpi-1.0.pc
+	nmad/starpumpi-1.1.pc
+	nmad/starpumpi-1.2.pc
+	nmad/starpumpi-1.3.pc
 	starpufft/Makefile
 	starpufft/src/Makefile
 	starpufft/tests/Makefile
@@ -3169,6 +3325,10 @@ AC_OUTPUT([
 	mpi/src/Makefile
 	mpi/tests/Makefile
 	mpi/examples/Makefile
+	nmad/Makefile
+	nmad/src/Makefile
+	nmad/tests/Makefile
+	nmad/examples/Makefile
 	starpu-top/StarPU-Top.pro
 	starpu-top/StarPU-Top-qwt-embed.pri
 	starpu-top/StarPU-Top-qwt-system.pri

+ 5 - 0
examples/stencil/Makefile.am

@@ -32,6 +32,11 @@ LIBS += $(top_builddir)/mpi/src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 AM_CPPFLAGS += -I$(top_srcdir)/mpi/include
 endif
 
+# When building against the nmad flavour, link its libstarpumpi and use its
+# headers instead of the mpi/ ones.
+if USE_NMAD
+LIBS += $(top_builddir)/nmad/src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+AM_CPPFLAGS += -I$(top_srcdir)/nmad/include
+endif
+
 CC = $(CC_OR_MPICC)
 
 if STARPU_USE_CUDA

+ 1 - 0
nmad/.gitignore

@@ -0,0 +1 @@
+/.deps

+ 31 - 0
nmad/Makefile.am

@@ -0,0 +1,31 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2013  Université de Bordeaux
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+SUBDIRS=src tests examples
+
+# Install one pkg-config file per supported starpumpi API version.
+pkgconfigdir = $(libdir)/pkgconfig
+pkgconfig_DATA = libstarpumpi.pc starpumpi-1.0.pc starpumpi-1.1.pc  starpumpi-1.2.pc starpumpi-1.3.pc
+
+# Public header installed under the versioned starpu include directory.
+versincludedir = $(includedir)/starpu/$(STARPU_EFFECTIVE_VERSION)
+versinclude_HEADERS = 					\
+	include/starpu_mpi.h
+
+# Run "make showcheck" in every subdirectory, propagating any failure.
+showcheck:
+	RET=0 ; \
+	for i in $(SUBDIRS) ; do \
+		make -C $$i showcheck || RET=1 ; \
+	done ; \
+	exit $$RET

+ 193 - 0
nmad/examples/Makefile.am

@@ -0,0 +1,193 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2013, 2016  Université de Bordeaux
+# Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+include $(top_srcdir)/starpu.mk
+
+# Compile and link every example in this directory with mpicc.
+CC=$(MPICC)
+CCLD=$(MPICC)
+
+if STARPU_HAVE_WINDOWS
+LOADER_BIN		=
+else
+loader_CPPFLAGS 	= 	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
+LOADER			=	loader
+# NOTE(review): LOADER_BIN points into mpi/tests/, not into the nmad tree --
+# looks like a stale copy from mpi/examples; confirm the intended path.
+LOADER_BIN		=	$(abs_top_builddir)/mpi/tests/$(LOADER)
+loader_SOURCES		=	../../tests/loader.c
+endif
+
+if STARPU_HAVE_AM111
+TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
+# NOTE(review): -np 2 here vs -np 4 in the fallback below -- confirm which
+# process count the examples expect.
+LOG_COMPILER	 	=	$(MPIEXEC) -np 2 $(LOADER_BIN)
+else
+TESTS_ENVIRONMENT 	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPIEXEC) -np 4
+endif
+
+# Examples double as the MPI test-suite unless simgrid is enabled or MPI
+# checking was disabled at configure time.
+if !STARPU_SIMGRID
+if STARPU_MPI_CHECK
+TESTS			=	$(starpu_mpi_EXAMPLES)
+endif
+endif
+
+check_PROGRAMS = $(LOADER) $(starpu_mpi_EXAMPLES)
+starpu_mpi_EXAMPLES =
+
+BUILT_SOURCES =
+
+CLEANFILES = *.gcno *.gcda *.linkinfo
+
+EXTRA_DIST = 					\
+	mpi_lu/mpi_lu-float.h		\
+	mpi_lu/mpi_lu-double.h		\
+	mpi_lu/plu_example.c		\
+	mpi_lu/plu_solve.c		\
+	mpi_lu/pxlu.h			\
+	mpi_lu/pxlu.c			\
+	mpi_lu/pxlu_kernels.h		\
+	mpi_lu/pxlu_kernels.c		\
+	matrix_decomposition/mpi_cholesky_codelets.h 	\
+	matrix_decomposition/mpi_cholesky_kernels.h	\
+	matrix_decomposition/mpi_cholesky_models.h 	\
+	matrix_decomposition/mpi_decomposition_params.h	\
+	matrix_decomposition/mpi_decomposition_matrix.h	\
+	../tests/helper.h
+
+examplebindir = $(libdir)/starpu/mpi
+
+examplebin_PROGRAMS =
+
+if STARPU_USE_CUDA
+NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -I$(top_builddir)/include/ $(HWLOC_CFLAGS)
+
+.cu.cubin:
+	$(MKDIR_P) `dirname $@`
+	$(NVCC) -cubin $< -o $@ $(NVCCFLAGS)
+
+.cu.o:
+	$(NVCC) $< -c -o $@ $(NVCCFLAGS)
+endif
+
+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
+LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include -I$(top_srcdir)/mpi/include
+AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS)
+
+###################
+# Stencil example #
+###################
+if BUILD_EXAMPLES
+examplebin_PROGRAMS +=				\
+	stencil/stencil5
+
+stencil_stencil5_LDADD =		\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la -lm
+
+starpu_mpi_EXAMPLES	+=	\
+	stencil/stencil5
+
+##################
+# MPI LU example #
+##################
+
+if !NO_BLAS_LIB
+
+examplebin_PROGRAMS += 			\
+	mpi_lu/plu_example_float	\
+	mpi_lu/plu_example_double
+
+mpi_lu_plu_example_float_LDADD =	\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
+	$(STARPU_LIBNUMA_LDFLAGS)				\
+	$(STARPU_BLAS_LDFLAGS) -lm
+
+mpi_lu_plu_example_float_SOURCES =	\
+	mpi_lu/plu_example_float.c	\
+	mpi_lu/plu_solve_float.c	\
+	mpi_lu/pslu_kernels.c		\
+	mpi_lu/pslu.c			\
+	$(top_srcdir)/examples/common/blas.c
+
+mpi_lu_plu_example_double_LDADD =	\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
+	$(STARPU_LIBNUMA_LDFLAGS)				\
+	$(STARPU_BLAS_LDFLAGS) -lm
+
+mpi_lu_plu_example_double_SOURCES =	\
+	mpi_lu/plu_example_double.c	\
+	mpi_lu/plu_solve_double.c  	\
+	mpi_lu/pdlu_kernels.c	    	\
+	mpi_lu/pdlu.c		    	\
+	$(top_srcdir)/examples/common/blas.c
+endif
+
+########################
+# MPI Cholesky example #
+########################
+
+if !NO_BLAS_LIB
+examplebin_PROGRAMS +=		\
+	matrix_decomposition/mpi_cholesky			\
+	matrix_decomposition/mpi_cholesky_distributed
+
+matrix_decomposition_mpi_cholesky_SOURCES	=		\
+	matrix_decomposition/mpi_cholesky.c		\
+	matrix_decomposition/mpi_cholesky_models.c		\
+	matrix_decomposition/mpi_cholesky_kernels.c	\
+	matrix_decomposition/mpi_cholesky_codelets.c	\
+	matrix_decomposition/mpi_decomposition_params.c	\
+	matrix_decomposition/mpi_decomposition_matrix.c	\
+	$(top_srcdir)/examples/common/blas.c
+
+matrix_decomposition_mpi_cholesky_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
+	$(STARPU_BLAS_LDFLAGS) -lm
+
+matrix_decomposition_mpi_cholesky_distributed_SOURCES =	\
+	matrix_decomposition/mpi_cholesky_distributed.c	\
+	matrix_decomposition/mpi_cholesky_models.c		\
+	matrix_decomposition/mpi_cholesky_kernels.c	\
+	matrix_decomposition/mpi_cholesky_codelets.c	\
+	matrix_decomposition/mpi_decomposition_params.c	\
+	matrix_decomposition/mpi_decomposition_matrix.c	\
+	$(top_srcdir)/examples/common/blas.c
+
+matrix_decomposition_mpi_cholesky_distributed_LDADD =	\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
+	$(STARPU_BLAS_LDFLAGS) -lm
+
+starpu_mpi_EXAMPLES +=				\
+	matrix_decomposition/mpi_cholesky			\
+	matrix_decomposition/mpi_cholesky_distributed
+endif
+
+###################
+# complex example #
+###################
+
+examplebin_PROGRAMS +=			\
+	complex/mpi_complex
+
+complex_mpi_complex_SOURCES =		\
+	complex/mpi_complex.c		\
+	$(top_srcdir)/examples/interface/complex_interface.c
+
+complex_mpi_complex_LDADD =		\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+
+starpu_mpi_EXAMPLES	+=			\
+	complex/mpi_complex
+endif
+
+

+ 102 - 0
nmad/examples/complex/mpi_complex.c

@@ -0,0 +1,102 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <interface/complex_interface.h>
+#include <interface/complex_codelet.h>
+
+/* CPU codelet: print the integer variable held by the first data handle. */
+void display_foo_codelet(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	int *foo = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	fprintf(stderr, "foo = %d\n", *foo);
+}
+
+/* Codelet wrapping display_foo_codelet: reads a single variable handle. */
+struct starpu_codelet foo_display =
+{
+	.cpu_funcs = {display_foo_codelet},
+	.nbuffers = 1,
+	.modes = {STARPU_R}
+};
+
+int main(int argc, char **argv)
+{
+	int rank, nodes;
+	int ret;
+	int compare;
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
+
+	if (nodes < 2)
+	{
+		fprintf(stderr, "This program needs at least 2 nodes (%d available)\n", nodes);
+		ret = 77;
+	}
+	else
+	{
+		starpu_data_handle_t handle;
+		starpu_data_handle_t handle2;
+
+		double real[2] = {4.0, 2.0};
+		double imaginary[2] = {7.0, 9.0};
+
+		double real2[2] = {14.0, 12.0};
+		double imaginary2[2] = {17.0, 19.0};
+
+		if (rank == 1)
+		{
+			real[0] = 0.0;
+			real[1] = 0.0;
+			imaginary[0] = 0.0;
+			imaginary[1] = 0.0;
+		}
+
+		starpu_complex_data_register(&handle, 0, real, imaginary, 2);
+		starpu_complex_data_register(&handle2, -1, real2, imaginary2, 2);
+
+		if (rank == 0)
+		{
+			int *compare_ptr = &compare;
+
+			starpu_insert_task(&cl_display, STARPU_VALUE, "node0 initial value", strlen("node0 initial value")+1, STARPU_R, handle, 0);
+			starpu_mpi_isend_detached(handle, 1, 10, MPI_COMM_WORLD, NULL, NULL);
+			starpu_mpi_irecv_detached(handle2, 1, 20, MPI_COMM_WORLD, NULL, NULL);
+
+			starpu_insert_task(&cl_display, STARPU_VALUE, "node0 received value", strlen("node0 received value")+1, STARPU_R, handle2, 0);
+			starpu_insert_task(&cl_compare, STARPU_R, handle, STARPU_R, handle2, STARPU_VALUE, &compare_ptr, sizeof(compare_ptr), 0);
+		}
+		else if (rank == 1)
+		{
+			starpu_mpi_irecv_detached(handle, 0, 10, MPI_COMM_WORLD, NULL, NULL);
+			starpu_insert_task(&cl_display, STARPU_VALUE, "node1 received value", strlen("node1 received value")+1, STARPU_R, handle, 0);
+			starpu_mpi_isend_detached(handle, 0, 20, MPI_COMM_WORLD, NULL, NULL);
+		}
+
+		starpu_task_wait_for_all();
+
+		starpu_data_unregister(handle);
+		starpu_data_unregister(handle2);
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	if (rank == 0) return !compare; else return ret;
+}

+ 72 - 0
nmad/examples/matrix_decomposition/mpi_cholesky.c

@@ -0,0 +1,72 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2012  Université de Bordeaux
+ * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "mpi_cholesky_models.h"
+#include "mpi_cholesky_codelets.h"
+#include "mpi_decomposition_matrix.h"
+#include "mpi_decomposition_params.h"
+
+/* Distributed tiled Cholesky driver: builds a Hilbert-based matrix,
+ * factorizes it across all MPI nodes, then checks the result locally. */
+int main(int argc, char **argv)
+{
+	/* create a simple symmetric positive definite matrix example
+	 *
+	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
+	 * */
+
+	float ***bmat;
+	int rank, nodes, ret;
+	double timing, flops;
+	int correctness;
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
+	starpu_cublas_init();
+
+	/* Sets the size/nblocks/display/noprio globals from the command line. */
+	parse_args(argc, argv, nodes);
+
+	matrix_init(&bmat, rank, nodes, 1);
+	matrix_display(bmat, rank);
+
+	dw_cholesky(bmat, size/nblocks, rank, nodes, &timing, &flops);
+
+	/* NOTE(review): MPI is shut down before the verification below; the
+	 * check appears purely local, but confirm this ordering is intended. */
+	starpu_mpi_shutdown();
+
+	matrix_display(bmat, rank);
+
+	dw_cholesky_check_computation(bmat, rank, nodes, &correctness, &flops);
+
+	matrix_free(&bmat, rank, nodes, 1);
+	starpu_cublas_shutdown();
+	starpu_shutdown();
+
+	/* Abort with a non-zero exit status if the factorization was wrong. */
+	assert(correctness);
+
+	if (rank == 0)
+	{
+		fprintf(stdout, "Computation time (in ms): %2.2f\n", timing/1000);
+		fprintf(stdout, "Synthetic GFlops : %2.2f\n", (flops/timing/1000.0f));
+	}
+
+	return 0;
+}

+ 260 - 0
nmad/examples/matrix_decomposition/mpi_cholesky_codelets.c

@@ -0,0 +1,260 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <common/blas.h>
+#include "mpi_decomposition_params.h"
+#include "mpi_decomposition_matrix.h"
+#include "mpi_cholesky_models.h"
+#include "mpi_cholesky_codelets.h"
+#include "mpi_cholesky_kernels.h"
+#include <sys/time.h>
+
+/*
+ *	Create the codelets
+ */
+
+/* Codelet for the diagonal-tile factorization step (u11) of the tiled
+ * Cholesky algorithm; works in place on a single tile. */
+static struct starpu_codelet cl11 =
+{
+	.cpu_funcs = {chol_cpu_codelet_update_u11},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {chol_cublas_codelet_update_u11},
+#endif
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
+	.model = &chol_model_11
+};
+
+/* Codelet for the panel step (u21): reads the factorized diagonal tile,
+ * updates one off-diagonal tile. */
+static struct starpu_codelet cl21 =
+{
+	.cpu_funcs = {chol_cpu_codelet_update_u21},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {chol_cublas_codelet_update_u21},
+#endif
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_RW},
+	.model = &chol_model_21
+};
+
+/* Codelet for the trailing-submatrix update step (u22): reads two tiles,
+ * updates a third. */
+static struct starpu_codelet cl22 =
+{
+	.cpu_funcs = {chol_cpu_codelet_update_u22},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {chol_cublas_codelet_update_u22},
+#endif
+	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_R, STARPU_RW},
+	.model = &chol_model_22
+};
+
+/*
+ *	code to bootstrap the factorization
+ *	and construct the DAG
+ */
+/* Build and execute the task DAG for the tiled Cholesky factorization of the
+ * block-distributed matrix matA (nblocks x nblocks tiles, leading dimension
+ * ld).  *timing and *flops are written on rank 0 only. */
+void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing, double *flops)
+{
+	double start;
+	double end;
+	starpu_data_handle_t **data_handles;
+	unsigned x,y,i,j,k;
+
+	/* create all the DAG nodes */
+
+	data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
+	for(x=0 ; x<nblocks ; x++) data_handles[x] = malloc(nblocks*sizeof(starpu_data_handle_t));
+
+	/* Register every tile: locally-owned tiles with their real buffer
+	 * (home node 0), remote tiles with no buffer (home node -1) so that
+	 * StarPU can allocate room for incoming copies. */
+	for(x = 0; x < nblocks ; x++)
+	{
+		for (y = 0; y < nblocks; y++)
+		{
+			int mpi_rank = my_distrib(x, y, nodes);
+			if (mpi_rank == rank)
+			{
+				//fprintf(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
+				starpu_matrix_data_register(&data_handles[x][y], 0, (uintptr_t)matA[x][y],
+						ld, size/nblocks, size/nblocks, sizeof(float));
+			}
+#warning TODO: make better test to only register what is needed
+			else
+			{
+				/* I don't own that index, but will need it for my computations */
+				//fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
+				starpu_matrix_data_register(&data_handles[x][y], -1, (uintptr_t)NULL,
+						ld, size/nblocks, size/nblocks, sizeof(float));
+			}
+			if (data_handles[x][y])
+			{
+				/* The MPI tag is the tile's column-major index. */
+				starpu_mpi_data_register(data_handles[x][y], (y*nblocks)+x, mpi_rank);
+			}
+		}
+	}
+
+	/* Synchronize all ranks before starting the timer. */
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+	start = starpu_timing_now();
+
+	/* Submit the whole DAG: for each step k, factorize the diagonal tile
+	 * (cl11), solve the panel (cl21), then update the trailing submatrix
+	 * (cl22).  Priorities favour the critical path unless noprio is set. */
+	for (k = 0; k < nblocks; k++)
+	{
+		int prio = STARPU_DEFAULT_PRIO;
+		if (!noprio) prio = STARPU_MAX_PRIO;
+
+		starpu_mpi_insert_task(MPI_COMM_WORLD, &cl11,
+				STARPU_PRIORITY, prio,
+				STARPU_RW, data_handles[k][k],
+				0);
+
+		for (j = k+1; j<nblocks; j++)
+		{
+			prio = STARPU_DEFAULT_PRIO;
+			if (!noprio&& (j == k+1)) prio = STARPU_MAX_PRIO;
+			starpu_mpi_insert_task(MPI_COMM_WORLD, &cl21,
+					STARPU_PRIORITY, prio,
+					STARPU_R, data_handles[k][k],
+					STARPU_RW, data_handles[k][j],
+					0);
+
+			/* The diagonal tile is no longer needed remotely. */
+			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][k]);
+
+			for (i = k+1; i<nblocks; i++)
+			{
+				if (i <= j)
+				{
+					prio = STARPU_DEFAULT_PRIO;
+					if (!noprio && (i == k + 1) && (j == k +1) ) prio = STARPU_MAX_PRIO;
+					starpu_mpi_insert_task(MPI_COMM_WORLD, &cl22,
+							STARPU_PRIORITY, prio,
+							STARPU_R, data_handles[k][i],
+							STARPU_R, data_handles[k][j],
+							STARPU_RW, data_handles[i][j],
+							0);
+				}
+			}
+
+			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][j]);
+		}
+	}
+
+	starpu_task_wait_for_all();
+
+	/* Unregister every tile (flushing results back) and free the table. */
+	for(x = 0; x < nblocks ; x++)
+	{
+		for (y = 0; y < nblocks; y++)
+		{
+			if (data_handles[x][y])
+				starpu_data_unregister(data_handles[x][y]);
+		}
+		free(data_handles[x]);
+	}
+	free(data_handles);
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+	end = starpu_timing_now();
+
+	if (rank == 0)
+	{
+		*timing = end - start;
+		/* Standard Cholesky flop count: n^3 / 3. */
+		*flops = (1.0f*size*size*size)/3.0f;
+	}
+}
+
+void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *correctness, double *flops)
+{
+	unsigned i,j,x,y;
+	float *rmat = malloc(size*size*sizeof(float));
+
+	for(x=0 ; x<nblocks ; x++)
+	{
+		for(y=0 ; y<nblocks ; y++)
+		{
+			for (i = 0; i < BLOCKSIZE; i++)
+			{
+				for (j = 0; j < BLOCKSIZE; j++)
+				{
+					rmat[j+(y*BLOCKSIZE)+(i+(x*BLOCKSIZE))*size] = matA[x][y][j +i*BLOCKSIZE];
+				}
+			}
+		}
+	}
+
+	fprintf(stderr, "[%d] compute explicit LLt ...\n", rank);
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i > j)
+			{
+				rmat[j+i*size] = 0.0f; // debug
+			}
+		}
+	}
+	float *test_mat = malloc(size*size*sizeof(float));
+	STARPU_ASSERT(test_mat);
+
+	STARPU_SSYRK("L", "N", size, size, 1.0f,
+			rmat, size, 0.0f, test_mat, size);
+
+	fprintf(stderr, "[%d] comparing results ...\n", rank);
+	if (display)
+	{
+		for (j = 0; j < size; j++)
+		{
+			for (i = 0; i < size; i++)
+			{
+				if (i <= j)
+				{
+					printf("%2.2f\t", test_mat[j +i*size]);
+				}
+				else
+				{
+					printf(".\t");
+				}
+			}
+			printf("\n");
+		}
+	}
+
+	*correctness = 1;
+	for(x = 0; x < nblocks ; x++)
+	{
+		for (y = 0; y < nblocks; y++)
+		{
+			int mpi_rank = my_distrib(x, y, nodes);
+			if (mpi_rank == rank)
+			{
+				for (i = (size/nblocks)*x ; i < (size/nblocks)*x+(size/nblocks); i++)
+				{
+					for (j = (size/nblocks)*y ; j < (size/nblocks)*y+(size/nblocks); j++)
+					{
+						if (i <= j)
+						{
+							float orig = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
+							float err = abs(test_mat[j +i*size] - orig);
+							if (err > 0.00001)
+							{
+								fprintf(stderr, "[%d] Error[%u, %u] --> %2.2f != %2.2f (err %2.2f)\n", rank, i, j, test_mat[j +i*size], orig, err);
+								*correctness = 0;
+								*flops = 0;
+								break;
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+	free(rmat);
+	free(test_mat);
+}

+ 30 - 0
nmad/examples/matrix_decomposition/mpi_cholesky_codelets.h

@@ -0,0 +1,30 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __MPI_CHOLESKY_CODELETS_H__
+#define __MPI_CHOLESKY_CODELETS_H__
+
+
+/*
+ *	code to bootstrap the factorization
+ *	and construct the DAG
+ */
+void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing, double *flops);
+
+/* Rebuild the dense matrix, recompute L*L^T and compare against the input;
+ * sets *correctness to 0 (and *flops to 0) on any mismatch. */
+void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *correctness, double *flops);
+
+#endif /* __MPI_CHOLESKY_CODELETS_H__ */

+ 64 - 0
nmad/examples/matrix_decomposition/mpi_cholesky_distributed.c

@@ -0,0 +1,64 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2011  Université de Bordeaux
+ * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "mpi_cholesky_models.h"
+#include "mpi_cholesky_codelets.h"
+#include "mpi_decomposition_matrix.h"
+#include "mpi_decomposition_params.h"
+
+/* Distributed Cholesky driver: every rank initialises StarPU/MPI, allocates
+ * only the tiles it owns (alloc_everywhere == 0), runs the factorisation and
+ * lets rank 0 report timing.  Unlike the non-distributed variant, no
+ * correctness check is run here. */
+int main(int argc, char **argv)
+{
+	/* create a simple definite positive symmetric matrix example
+	 *
+	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
+	 * */
+
+	float ***bmat;
+	int rank, nodes, ret;
+	double timing, flops;
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
+	starpu_cublas_init();
+
+	parse_args(argc, argv, nodes);
+
+	/* last argument 0: each rank allocates only the tiles it owns */
+	matrix_init(&bmat, rank, nodes, 0);
+
+	/* timing (µs) and flops are filled in by dw_cholesky */
+	dw_cholesky(bmat, size/nblocks, rank, nodes, &timing, &flops);
+
+	/* MPI layer must be shut down before the data is freed */
+	starpu_mpi_shutdown();
+
+	matrix_free(&bmat, rank, nodes, 0);
+	starpu_cublas_shutdown();
+	starpu_shutdown();
+
+	if (rank == 0)
+	{
+		fprintf(stdout, "Computation time (in ms): %2.2f\n", timing/1000);
+		fprintf(stdout, "Synthetic GFlops : %2.2f\n", (flops/timing/1000.0f));
+	}
+
+	return 0;
+}

+ 247 - 0
nmad/examples/matrix_decomposition/mpi_cholesky_kernels.c

@@ -0,0 +1,247 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2012-2014  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <math.h>
+#include "mpi_decomposition_params.h"
+#include "common/blas.h"
+#ifdef STARPU_USE_CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cublas.h>
+#ifdef STARPU_HAVE_MAGMA
+#include "magma.h"
+#include "magma_lapack.h"
+#endif
+#endif
+
+/*
+ * U22
+ */
+
+/* Trailing GEMM update (U22): center -= left * right^T.
+ * s selects the implementation: 0 = CPU BLAS, 1 = CUBLAS (the latter is only
+ * compiled in with STARPU_USE_CUDA; otherwise any s != 0 hits STARPU_ABORT). */
+static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	//printf("22\n");
+	float *left 	= (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+	float *right 	= (float *)STARPU_MATRIX_GET_PTR(descr[1]);
+	float *center 	= (float *)STARPU_MATRIX_GET_PTR(descr[2]);
+
+	/* NOTE(review): dx comes from NY and dy from NX — presumably deliberate
+	 * given the column-major storage used by BLAS; confirm against the data
+	 * registration. */
+	unsigned dx = STARPU_MATRIX_GET_NY(descr[2]);
+	unsigned dy = STARPU_MATRIX_GET_NX(descr[2]);
+	unsigned dz = STARPU_MATRIX_GET_NY(descr[0]);
+
+	unsigned ld21 = STARPU_MATRIX_GET_LD(descr[0]);
+	unsigned ld12 = STARPU_MATRIX_GET_LD(descr[1]);
+	unsigned ld22 = STARPU_MATRIX_GET_LD(descr[2]);
+
+#ifdef STARPU_USE_CUDA
+	cublasStatus st;
+#endif
+
+	switch (s)
+	{
+		case 0:
+			STARPU_SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21,
+				right, ld12, 1.0f, center, ld22);
+			break;
+#ifdef STARPU_USE_CUDA
+		case 1:
+#ifdef STARPU_HAVE_MAGMA
+			/* NOTE(review): the per-worker stream is only selected when
+			 * MAGMA is available; without it cublasSgemm runs on the
+			 * default stream — confirm this is intended. */
+			cublasSetKernelStream(starpu_cuda_get_local_stream());
+#endif
+			cublasSgemm('n', 't', dy, dx, dz,
+					-1.0f, left, ld21, right, ld12,
+					 1.0f, center, ld22);
+			st = cublasGetError();
+			if (STARPU_UNLIKELY(st != CUBLAS_STATUS_SUCCESS))
+				STARPU_CUBLAS_REPORT_ERROR(st);
+
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+}
+
+/* CPU entry point for the trailing-update GEMM task. */
+void chol_cpu_codelet_update_u22(void *descr[], void *_args)
+{
+	chol_common_cpu_codelet_update_u22(descr, 0, _args);
+}
+
+#ifdef STARPU_USE_CUDA
+/* CUDA entry point for the trailing-update GEMM task. */
+void chol_cublas_codelet_update_u22(void *descr[], void *_args)
+{
+	chol_common_cpu_codelet_update_u22(descr, 1, _args);
+}
+#endif// STARPU_USE_CUDA
+
+/*
+ * U21
+ */
+
+/* Panel TRSM update (U21): solve sub21 <- sub21 * inv(L11^T) in place, where
+ * sub11 holds the lower-triangular factor of the diagonal tile.
+ * s: 0 = CPU BLAS, 1 = CUBLAS. */
+static inline void chol_common_codelet_update_u21(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+//	printf("21\n");
+	float *sub11;
+	float *sub21;
+
+	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+	sub21 = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
+
+	unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]);
+	unsigned ld21 = STARPU_MATRIX_GET_LD(descr[1]);
+
+	/* NOTE(review): nx21 comes from NY and ny21 from NX — presumably the
+	 * column-major convention used throughout these kernels; confirm. */
+	unsigned nx21 = STARPU_MATRIX_GET_NY(descr[1]);
+	unsigned ny21 = STARPU_MATRIX_GET_NX(descr[1]);
+
+	switch (s)
+	{
+		case 0:
+			STARPU_STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
+			break;
+#ifdef STARPU_USE_CUDA
+		case 1:
+#ifdef STARPU_HAVE_MAGMA
+			cublasSetKernelStream(starpu_cuda_get_local_stream());
+#endif
+			cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+}
+
+/* CPU entry point for the panel TRSM task. */
+void chol_cpu_codelet_update_u21(void *descr[], void *_args)
+{
+	 chol_common_codelet_update_u21(descr, 0, _args);
+}
+
+#ifdef STARPU_USE_CUDA
+/* CUDA entry point for the panel TRSM task. */
+void chol_cublas_codelet_update_u21(void *descr[], void *_args)
+{
+	chol_common_codelet_update_u21(descr, 1, _args);
+}
+#endif
+
+/*
+ *	U11
+ */
+
+/* Diagonal factorisation (U11): Cholesky-factor the nx x nx tile sub11 in
+ * place.  s: 0 = CPU (MKL POTRF, or an unblocked hand-written loop),
+ * 1 = CUDA (MAGMA POTRF, or a per-column CUBLAS loop without MAGMA). */
+static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+//	printf("11\n");
+	float *sub11;
+
+	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+
+	unsigned nx = STARPU_MATRIX_GET_NY(descr[0]);
+	unsigned ld = STARPU_MATRIX_GET_LD(descr[0]);
+
+	unsigned z;
+
+	switch (s)
+	{
+		case 0:
+
+#ifdef STARPU_MKL
+			STARPU_SPOTRF("L", nx, sub11, ld);
+#else
+			/*
+			 *	- alpha 11 <- lambda 11 = sqrt(alpha11)
+			 *	- alpha 21 <- l 21	= alpha 21 / lambda 11
+			 *	- A22 <- A22 - l21 trans(l21)
+			 */
+
+			for (z = 0; z < nx; z++)
+			{
+				float lambda11;
+				lambda11 = sqrt(sub11[z+z*ld]);
+				sub11[z+z*ld] = lambda11;
+
+				STARPU_ASSERT(lambda11 != 0.0f);
+
+				/* scale the column below the diagonal... */
+				STARPU_SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
+
+				/* ...and rank-1 update the trailing submatrix */
+				STARPU_SSYR("L", nx - z - 1, -1.0f,
+							&sub11[(z+1)+z*ld], 1,
+							&sub11[(z+1)+(z+1)*ld], ld);
+			}
+#endif
+			break;
+#ifdef STARPU_USE_CUDA
+		case 1:
+#ifdef STARPU_HAVE_MAGMA
+			{
+				int ret;
+				int info;
+				ret = magma_spotrf_gpu(MagmaLower, nx, sub11, ld, &info);
+				if (ret != MAGMA_SUCCESS)
+				{
+					fprintf(stderr, "Error in Magma: %d\n", ret);
+					STARPU_ABORT();
+				}
+				/* NOTE(review): cudaThreadSynchronize() is deprecated in
+				 * favour of cudaDeviceSynchronize(); kept as-is. */
+				cudaError_t cures = cudaThreadSynchronize();
+				STARPU_ASSERT(!cures);
+			}
+#else
+			/* Unblocked right-looking factorisation, one column at a
+			 * time; the diagonal element is copied back to the host for
+			 * the square root. */
+			for (z = 0; z < nx; z++)
+			{
+				float lambda11;
+				cudaMemcpyAsync(&lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+				cudaStreamSynchronize(starpu_cuda_get_local_stream());
+
+				STARPU_ASSERT(lambda11 != 0.0f);
+
+				lambda11 = sqrt(lambda11);
+
+				/* n == 1, so the increment arguments are irrelevant here */
+				cublasSetVector(1, sizeof(float), &lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float));
+
+				cublasSscal(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
+
+				/* NOTE(review): uplo is 'U' here while the CPU path uses
+				 * "L" — confirm this matches the data layout used on the
+				 * device. */
+				cublasSsyr('U', nx - z - 1, -1.0f,
+							&sub11[(z+1)+z*ld], 1,
+							&sub11[(z+1)+(z+1)*ld], ld);
+			}
+
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+#endif
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+}
+
+
+/* CPU entry point for the diagonal POTRF task. */
+void chol_cpu_codelet_update_u11(void *descr[], void *_args)
+{
+	chol_common_codelet_update_u11(descr, 0, _args);
+}
+
+#ifdef STARPU_USE_CUDA
+/* CUDA entry point for the diagonal POTRF task. */
+void chol_cublas_codelet_update_u11(void *descr[], void *_args)
+{
+	chol_common_codelet_update_u11(descr, 1, _args);
+}
+#endif// STARPU_USE_CUDA

+ 33 - 0
nmad/examples/matrix_decomposition/mpi_cholesky_kernels.h

@@ -0,0 +1,33 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __MPI_CHOLESKY_KERNELS_H__
+#define __MPI_CHOLESKY_KERNELS_H__
+
+#include <starpu.h>
+
+/* CPU implementations of the three Cholesky tasks:
+ * u11 = diagonal POTRF, u21 = panel TRSM, u22 = trailing GEMM update. */
+void chol_cpu_codelet_update_u11(void **, void *);
+void chol_cpu_codelet_update_u21(void **, void *);
+void chol_cpu_codelet_update_u22(void **, void *);
+
+#ifdef STARPU_USE_CUDA
+/* CUBLAS/MAGMA counterparts of the CPU kernels above. */
+void chol_cublas_codelet_update_u11(void *descr[], void *_args);
+void chol_cublas_codelet_update_u21(void *descr[], void *_args);
+void chol_cublas_codelet_update_u22(void *descr[], void *_args);
+#endif
+
+#endif // __MPI_CHOLESKY_KERNELS_H__

+ 40 - 0
nmad/examples/matrix_decomposition/mpi_cholesky_models.c

@@ -0,0 +1,40 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_cholesky_models.h"
+
+/*
+ *	History-based performance models for the three Cholesky codelets
+ *	(11 = POTRF, 21 = TRSM, 22 = GEMM).  StarPU keys the recorded
+ *	timings on the symbol string and accumulates them across runs.
+ */
+
+struct starpu_perfmodel chol_model_11 =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "chol_model_11"
+};
+
+struct starpu_perfmodel chol_model_21 =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "chol_model_21"
+};
+
+struct starpu_perfmodel chol_model_22 =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "chol_model_22"
+};

+ 27 - 0
nmad/examples/matrix_decomposition/mpi_cholesky_models.h

@@ -0,0 +1,27 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DW_CHOLESKY_MODELS_H__
+#define __DW_CHOLESKY_MODELS_H__
+
+#include <starpu.h>
+
+/* History-based performance models for the three Cholesky codelets. */
+extern struct starpu_perfmodel chol_model_11;
+extern struct starpu_perfmodel chol_model_21;
+extern struct starpu_perfmodel chol_model_22;
+
+#endif // __DW_CHOLESKY_MODELS_H__

+ 110 - 0
nmad/examples/matrix_decomposition/mpi_decomposition_matrix.c

@@ -0,0 +1,110 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2012, 2015  Université de Bordeaux
+ * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "mpi_decomposition_matrix.h"
+#include "mpi_decomposition_params.h"
+#include "mpi_cholesky_codelets.h"
+
+/* Owner rank of tile (x, y): 2D block-cyclic distribution over a
+ * dblockx x dblocky process grid.  nb_nodes is unused but kept for
+ * interface compatibility. */
+int my_distrib(int x, int y, int nb_nodes)
+{
+	(void)nb_nodes;
+	int col = x % dblockx;
+	int row = y % dblocky;
+	return row * dblockx + col;
+}
+
+
+/* Print every tile (lower triangle only) when the -display flag was given.
+ * NOTE(review): blocks are indexed bmat[y][x] here while matrix_init fills
+ * bmat[x][y] — verify the intended orientation against the callers. */
+void matrix_display(float ***bmat, int rank)
+{
+	unsigned i,j,x,y;
+
+	if (display)
+	{
+		printf("[%d] Input :\n", rank);
+
+		for(y=0 ; y<nblocks ; y++)
+		{
+			for(x=0 ; x<nblocks ; x++)
+			{
+				printf("Block %u,%u :\n", x, y);
+				for (j = 0; j < BLOCKSIZE; j++)
+				{
+					for (i = 0; i < BLOCKSIZE; i++)
+					{
+						if (i <= j)
+						{
+							printf("%2.2f\t", bmat[y][x][j +i*BLOCKSIZE]);
+						}
+						else
+						{
+							printf(".\t");
+						}
+					}
+					printf("\n");
+				}
+			}
+		}
+	}
+}
+
+/* Allocate and fill the nblocks x nblocks tile array.  Each rank allocates
+ * only the tiles it owns according to my_distrib(), unless alloc_everywhere
+ * is non-zero (then every rank allocates every tile).  Tiles are filled with
+ * a Hilbert-like matrix plus a dominant diagonal so that the global matrix
+ * is symmetric positive definite.  Allocation failures now abort via
+ * STARPU_ASSERT instead of being silently dereferenced. */
+void matrix_init(float ****bmat, int rank, int nodes, int alloc_everywhere)
+{
+	unsigned i,j,x,y;
+
+	*bmat = malloc(nblocks * sizeof(float **));
+	STARPU_ASSERT(*bmat);
+	for(x=0 ; x<nblocks ; x++)
+	{
+		(*bmat)[x] = malloc(nblocks * sizeof(float *));
+		STARPU_ASSERT((*bmat)[x]);
+		for(y=0 ; y<nblocks ; y++)
+		{
+			int mpi_rank = my_distrib(x, y, nodes);
+			if (alloc_everywhere || (mpi_rank == rank))
+			{
+				/* starpu_malloc provides pinned memory for faster
+				 * GPU/MPI transfers */
+				starpu_malloc((void **)&(*bmat)[x][y], BLOCKSIZE*BLOCKSIZE*sizeof(float));
+				STARPU_ASSERT((*bmat)[x][y]);
+				for (i = 0; i < BLOCKSIZE; i++)
+				{
+					for (j = 0; j < BLOCKSIZE; j++)
+					{
+						(*bmat)[x][y][j +i*BLOCKSIZE] = (1.0f/(1.0f+(i+(x*BLOCKSIZE)+j+(y*BLOCKSIZE)))) + ((i+(x*BLOCKSIZE) == j+(y*BLOCKSIZE))?1.0f*size:0.0f);
+						//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
+					}
+				}
+			}
+		}
+	}
+}
+
+/* Release the tile array built by matrix_init().  The alloc_everywhere flag
+ * must match the one used at allocation time so exactly the tiles that were
+ * allocated get freed. */
+void matrix_free(float ****bmat, int rank, int nodes, int alloc_everywhere)
+{
+	unsigned bx, by;
+
+	for (bx = 0; bx < nblocks; bx++)
+	{
+		for (by = 0; by < nblocks; by++)
+		{
+			int owner = my_distrib(bx, by, nodes);
+			if (alloc_everywhere || owner == rank)
+			{
+				starpu_free((void *)(*bmat)[bx][by]);
+			}
+		}
+		free((*bmat)[bx]);
+	}
+	free(*bmat);
+}
+

+ 30 - 0
nmad/examples/matrix_decomposition/mpi_decomposition_matrix.h

@@ -0,0 +1,30 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2012  Université de Bordeaux
+ * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __MPI_CHOLESKY_MATRIX_H__
+#define __MPI_CHOLESKY_MATRIX_H__
+
+/* Returns the MPI node number where data indexes index is */
+int my_distrib(int x, int y, int nb_nodes);
+
+/* Print, allocate+fill, and free the nblocks x nblocks tile array.
+ * alloc_everywhere must be identical between matrix_init and matrix_free. */
+void matrix_display(float ***bmat, int rank);
+void matrix_init(float ****bmat, int rank, int nodes, int alloc_everywhere);
+void matrix_free(float ****bmat, int rank, int nodes, int alloc_everywhere);
+
+#endif /* __MPI_CHOLESKY_MATRIX_H__ */
+

+ 100 - 0
nmad/examples/matrix_decomposition/mpi_decomposition_params.c

@@ -0,0 +1,100 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2015  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+/* Problem geometry and option flags shared by the MPI Cholesky examples;
+ * set once by parse_args() before any task is submitted. */
+unsigned size = 4*960;	/* matrix order (rows == columns) */
+unsigned nblocks = 16;	/* tiles per dimension */
+unsigned nbigblocks = 2;
+unsigned noprio = 0;	/* 1: submit tasks without priorities */
+unsigned display = 0;	/* 1: print matrices */
+int dblockx = -1;	/* process-grid width; -1 = pick automatically */
+int dblocky = -1;	/* process-grid height; -1 = pick automatically */
+
+/* Parse command-line options into the shared globals, then derive any value
+ * not given explicitly: nblocks is clamped to size, and when no process grid
+ * was requested the most square dblockx x dblocky factorisation of the node
+ * count is chosen. */
+void parse_args(int argc, char **argv, int nodes)
+{
+	int i;
+
+	for (i = 1; i < argc; i++)
+	{
+		char *end;
+		if (strcmp(argv[i], "-size") == 0)
+			size = strtol(argv[++i], &end, 10);
+		else if (strcmp(argv[i], "-dblockx") == 0)
+			dblockx = strtol(argv[++i], &end, 10);
+		else if (strcmp(argv[i], "-dblocky") == 0)
+			dblocky = strtol(argv[++i], &end, 10);
+		else if (strcmp(argv[i], "-nblocks") == 0)
+			nblocks = strtol(argv[++i], &end, 10);
+		else if (strcmp(argv[i], "-nbigblocks") == 0)
+			nbigblocks = strtol(argv[++i], &end, 10);
+		else if (strcmp(argv[i], "-no-prio") == 0)
+			noprio = 1;
+		else if (strcmp(argv[i], "-display") == 0)
+			display = 1;
+		else if (strcmp(argv[i], "-h") == 0)
+			printf("usage : %s [-display] [-size size] [-nblocks nblocks]\n", argv[0]);
+	}
+
+	/* cannot have more tiles per dimension than matrix rows */
+	if (nblocks > size)
+		nblocks = size;
+
+	if (dblockx == -1 || dblocky == -1)
+	{
+		int f;
+		dblockx = nodes;
+		dblocky = 1;
+		for (f = sqrt(nodes); f > 1; f--)
+		{
+			if (nodes % f == 0)
+			{
+				dblockx = nodes / f;
+				dblocky = f;
+				break;
+			}
+		}
+	}
+}
+

+ 34 - 0
nmad/examples/matrix_decomposition/mpi_decomposition_params.h

@@ -0,0 +1,34 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __MPI_CHOLESKY_PARAMS_H__
+#define __MPI_CHOLESKY_PARAMS_H__
+
+/* Size (in elements) of one square tile of the matrix. */
+#define BLOCKSIZE       (size/nblocks)
+
+extern unsigned size;
+extern unsigned nblocks;
+extern unsigned nbigblocks;
+extern unsigned noprio;
+extern unsigned display;
+/* BUG FIX: these were declared "extern unsigned" while the definitions in
+ * mpi_decomposition_params.c are "int dblockx = -1;" — a declaration whose
+ * type differs from its definition is undefined behavior, and the code
+ * relies on comparing them against -1.  Declare them as int. */
+extern int dblockx;
+extern int dblocky;
+
+void parse_args(int argc, char **argv, int nodes);
+
+#endif // __MPI_CHOLESKY_PARAMS_H__

+ 42 - 0
nmad/examples/mpi_lu/mpi_lu-double.h

@@ -0,0 +1,42 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/* Double-precision instantiation of the generic pxlu sources: scalar type,
+ * MPI datatype, symbol prefix, and the BLAS/CUBLAS routines to bind. */
+#define TYPE double
+#define MPI_TYPE	MPI_DOUBLE
+
+#define STARPU_PLU(name)       starpu_pdlu_##name
+
+#define CUBLAS_GEMM	cublasDgemm
+#define CUBLAS_TRSM	cublasDtrsm
+#define CUBLAS_SCAL	cublasDscal
+#define CUBLAS_GER	cublasDger
+#define CUBLAS_SWAP	cublasDswap
+#define CUBLAS_IAMAX	cublasIdamax
+
+#define CPU_GEMM	STARPU_DGEMM
+#define CPU_GEMV	STARPU_DGEMV
+#define CPU_TRSM	STARPU_DTRSM
+#define CPU_SCAL	STARPU_DSCAL
+#define CPU_GER		STARPU_DGER
+#define CPU_SWAP	STARPU_DSWAP
+
+#define CPU_TRMM	STARPU_DTRMM
+#define CPU_AXPY	STARPU_DAXPY
+#define CPU_ASUM	STARPU_DASUM
+#define CPU_IAMAX	STARPU_IDAMAX
+
+/* Pivots with magnitude below this are treated as numerically zero.
+ * ("THRESHHOLD" is a historical misspelling shared with the pxlu sources.) */
+#define PIVOT_THRESHHOLD	10e-10

+ 42 - 0
nmad/examples/mpi_lu/mpi_lu-float.h

@@ -0,0 +1,42 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/* Single-precision instantiation of the generic pxlu sources: scalar type,
+ * MPI datatype, symbol prefix, and the BLAS/CUBLAS routines to bind. */
+#define TYPE float
+#define MPI_TYPE	MPI_FLOAT
+
+#define STARPU_PLU(name)       starpu_pslu_##name
+
+#define CUBLAS_GEMM	cublasSgemm
+#define CUBLAS_TRSM	cublasStrsm
+#define CUBLAS_SCAL	cublasSscal
+#define CUBLAS_GER	cublasSger
+#define CUBLAS_SWAP	cublasSswap
+#define CUBLAS_IAMAX	cublasIsamax
+
+#define CPU_GEMM	STARPU_SGEMM
+#define CPU_GEMV	STARPU_SGEMV
+#define CPU_TRSM	STARPU_STRSM
+#define CPU_SCAL	STARPU_SSCAL
+#define CPU_GER		STARPU_SGER
+#define CPU_SWAP	STARPU_SSWAP
+
+#define CPU_TRMM	STARPU_STRMM
+#define CPU_AXPY	STARPU_SAXPY
+#define CPU_ASUM	STARPU_SASUM
+#define CPU_IAMAX	STARPU_ISAMAX
+
+/* Pivots with magnitude below this are treated as numerically zero; looser
+ * than the double-precision threshold, as expected for float. */
+#define PIVOT_THRESHHOLD	10e-5

+ 19 - 0
nmad/examples/mpi_lu/pdlu.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-double.h"
+#include "pxlu.c"

+ 19 - 0
nmad/examples/mpi_lu/pdlu_kernels.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-double.h"
+#include "pxlu_kernels.c"

+ 581 - 0
nmad/examples/mpi_lu/plu_example.c

@@ -0,0 +1,581 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2011, 2013  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include <math.h>
+#include <starpu.h>
+
+#include "pxlu.h"
+//#include "pxlu_kernels.h"
+
+#ifdef STARPU_HAVE_LIBNUMA
+#include <numaif.h>
+#endif
+
+static unsigned long size = 4096;
+static unsigned nblocks = 16;
+static unsigned check = 0;
+static int p = 1;
+static int q = 1;
+static unsigned display = 0;
+
+#ifdef STARPU_HAVE_LIBNUMA
+static unsigned numa = 0;
+#endif
+
+static size_t allocated_memory = 0;
+static size_t allocated_memory_extra = 0;
+
+static starpu_data_handle_t *dataA_handles;
+static TYPE **dataA;
+
+/* In order to implement the distributed LU decomposition, we allocate
+ * temporary buffers */
+#ifdef SINGLE_TMP11
+static starpu_data_handle_t tmp_11_block_handle;
+static TYPE *tmp_11_block;
+#else
+static starpu_data_handle_t *tmp_11_block_handles;
+static TYPE **tmp_11_block;
+#endif
+#ifdef SINGLE_TMP1221
+static starpu_data_handle_t *tmp_12_block_handles;
+static TYPE **tmp_12_block;
+static starpu_data_handle_t *tmp_21_block_handles;
+static TYPE **tmp_21_block;
+#else
+static starpu_data_handle_t *(tmp_12_block_handles[2]);
+static TYPE **(tmp_12_block[2]);
+static starpu_data_handle_t *(tmp_21_block_handles[2]);
+static TYPE **(tmp_21_block[2]);
+#endif
+
+int get_block_rank(unsigned i, unsigned j);
+
+/* Parse command-line options (run identically on every rank; the rank
+ * argument is only used so that warnings are printed once, on rank 0).
+ * NOTE(review): value-taking options ("-size", "-nblocks", "-p", "-q")
+ * read argv[++i] without checking that a value actually follows — a
+ * trailing bare option dereferences past argv. Acceptable for an
+ * example, but worth confirming. */
+static void parse_args(int rank, int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++) {
+		if (strcmp(argv[i], "-size") == 0) {
+			char *argptr;
+			size = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-nblocks") == 0) {
+			char *argptr;
+			nblocks = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-check") == 0) {
+			check = 1;
+		}
+
+		if (strcmp(argv[i], "-display") == 0) {
+			display = 1;
+		}
+
+		if (strcmp(argv[i], "-numa") == 0) {
+#ifdef STARPU_HAVE_LIBNUMA
+			numa = 1;
+#else
+			if (rank == 0)
+				fprintf(stderr, "Warning: libnuma is not available\n");
+#endif
+		}
+
+		/* p and q define the 2D processor grid; p*q must equal the
+		 * MPI world size (checked in main). */
+		if (strcmp(argv[i], "-p") == 0) {
+			char *argptr;
+			p = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-q") == 0) {
+			char *argptr;
+			q = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0) {
+			fprintf(stderr,"usage: %s [-size n] [-nblocks b] [-check] [-display] [-numa] [-p p] [-q q]\n", argv[0]);
+			fprintf(stderr,"\np * q must be equal to the number of MPI nodes\n");
+			exit(0);
+		}
+	}
+}
+
+/* Expose the -display command-line flag to the other compilation units
+ * of this example (e.g. plu_solve.c). */
+unsigned STARPU_PLU(display_flag)(void)
+{
+	return display;
+}
+
+/* Fill one (psize/pnblocks) x (psize/pnblocks) tile with uniform random
+ * values produced by starpu_drand48(). The tile is stored contiguously,
+ * one row (of length block_size) after another. */
+static void fill_block_with_random(TYPE *blockptr, unsigned psize, unsigned pnblocks)
+{
+	const unsigned block_size = (psize/pnblocks);
+
+	unsigned i, j;
+	for (i = 0; i < block_size; i++)
+	     for (j = 0; j < block_size; j++)
+	     {
+		  blockptr[j+i*block_size] = (TYPE)starpu_drand48();
+	     }
+}
+
+#ifdef SINGLE_TMP11
+/* Single shared temporary buffer for the current diagonal (11) block. */
+starpu_data_handle_t STARPU_PLU(get_tmp_11_block_handle)(void)
+{
+	return tmp_11_block_handle;
+}
+#else
+/* One temporary diagonal-block buffer per iteration k. */
+starpu_data_handle_t STARPU_PLU(get_tmp_11_block_handle)(unsigned k)
+{
+	return tmp_11_block_handles[k];
+}
+#endif
+
+#ifdef SINGLE_TMP1221
+/* One temporary buffer per column j for the received 12 blocks. */
+starpu_data_handle_t STARPU_PLU(get_tmp_12_block_handle)(unsigned j)
+{
+	return tmp_12_block_handles[j];
+}
+
+/* One temporary buffer per row i for the received 21 blocks. */
+starpu_data_handle_t STARPU_PLU(get_tmp_21_block_handle)(unsigned i)
+{
+	return tmp_21_block_handles[i];
+}
+#else
+/* Double-buffered variant: two sets of buffers alternated on the parity
+ * of the iteration number k, so iteration k+1 can start receiving while
+ * iteration k still uses its buffers. */
+starpu_data_handle_t STARPU_PLU(get_tmp_12_block_handle)(unsigned j, unsigned k)
+{
+	return tmp_12_block_handles[k%2][j];
+}
+
+starpu_data_handle_t STARPU_PLU(get_tmp_21_block_handle)(unsigned i, unsigned k)
+{
+	return tmp_21_block_handles[k%2][i];
+}
+#endif
+
+/* The factored diagonal (11) block is broadcast to every rank, so the
+ * temporary buffer is unconditionally needed (parameters kept for
+ * symmetry with the 12/21 predicates below). */
+static unsigned tmp_11_block_is_needed(int rank, unsigned pnblocks, unsigned k)
+{
+	return 1;
+}
+
+/* A temporary 12 buffer for index j is needed iff this rank owns at
+ * least one block (i, j) with i in [1, pnblocks) — i.e. it will consume
+ * the received 12 block in some update. */
+static unsigned tmp_12_block_is_needed(int rank, unsigned pnblocks, unsigned j)
+{
+	unsigned i;
+	for (i = 1; i < pnblocks; i++)
+	{
+		if (get_block_rank(i, j) == rank)
+			return 1;
+	}
+
+	return 0;
+}
+
+/* Mirror of tmp_12_block_is_needed: a temporary 21 buffer for index i is
+ * needed iff this rank owns at least one block (i, j) with j in
+ * [1, pnblocks). */
+static unsigned tmp_21_block_is_needed(int rank, unsigned pnblocks, unsigned i)
+{
+	unsigned j;
+	for (j = 1; j < pnblocks; j++)
+	{
+		if (get_block_rank(i, j) == rank)
+			return 1;
+	}
+
+	return 0;
+}
+
+/* Allocate, fill and register with StarPU every matrix block owned by
+ * this rank (2D block-cyclic layout, see get_block_rank()), then
+ * allocate the temporary 11/12/21 buffers used to receive blocks from
+ * remote ranks during the distributed factorization. Updates the
+ * allocated_memory/allocated_memory_extra counters as it goes. */
+static void init_matrix(int rank)
+{
+#ifdef STARPU_HAVE_LIBNUMA
+	if (numa)
+	{
+		fprintf(stderr, "Using INTERLEAVE policy\n");
+		/* Interleave allocations over NUMA nodes 0 and 1.
+		 * NOTE(review): maxnode is passed as 3 for a 2-bit mask —
+		 * confirm against set_mempolicy(2) semantics. */
+		unsigned long nodemask = ((1<<0)|(1<<1));
+		int ret = set_mempolicy(MPOL_INTERLEAVE, &nodemask, 3);
+		if (ret)
+			perror("set_mempolicy failed");
+	}
+#endif
+
+	/* Allocate a grid of data handles, not all of them have to be allocated later on */
+	dataA_handles = calloc(nblocks*nblocks, sizeof(starpu_data_handle_t));
+	dataA = calloc(nblocks*nblocks, sizeof(TYPE *));
+	allocated_memory_extra += nblocks*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *));
+
+	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
+
+	/* Allocate all the blocks that belong to this mpi node */
+	unsigned long i,j;
+	for (j = 0; j < nblocks; j++)
+	{
+		for (i = 0; i < nblocks; i++)
+		{
+			TYPE **blockptr = &dataA[j+i*nblocks];
+//			starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i];
+			starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i];
+
+			if (get_block_rank(i, j) == rank)
+			{
+				/* This blocks should be treated by the current MPI process */
+				/* Allocate and fill it */
+				starpu_malloc((void **)blockptr, blocksize);
+				allocated_memory += blocksize;
+
+				//fprintf(stderr, "Rank %d : fill block (i = %d, j = %d)\n", rank, i, j);
+				fill_block_with_random(*blockptr, size, nblocks);
+				//fprintf(stderr, "Rank %d : fill block (i = %d, j = %d)\n", rank, i, j);
+				if (i == j)
+				{
+					/* Boost the diagonal entries, presumably to keep
+					 * the factorization numerically stable without
+					 * pivoting — TODO confirm. */
+					unsigned tmp;
+					for (tmp = 0; tmp < size/nblocks; tmp++)
+					{
+						(*blockptr)[tmp*((size/nblocks)+1)] += (TYPE)10*nblocks;
+					}
+				}
+
+				/* Register it to StarPU */
+				starpu_matrix_data_register(handleptr, 0,
+					(uintptr_t)*blockptr, size/nblocks,
+					size/nblocks, size/nblocks, sizeof(TYPE));
+			}
+			else {
+				/* Remote block: poison the slots so that any
+				 * accidental local use faults loudly. */
+				*blockptr = STARPU_POISON_PTR;
+				*handleptr = STARPU_POISON_PTR;
+			}
+		}
+	}
+
+	/* Allocate the temporary buffers required for the distributed algorithm */
+
+	unsigned k;
+
+	/* tmp buffer 11 */
+#ifdef SINGLE_TMP11
+	starpu_malloc((void **)&tmp_11_block, blocksize);
+	allocated_memory_extra += blocksize;
+	starpu_matrix_data_register(&tmp_11_block_handle, 0, (uintptr_t)tmp_11_block,
+			size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
+#else
+	tmp_11_block_handles = calloc(nblocks, sizeof(starpu_data_handle_t));
+	tmp_11_block = calloc(nblocks, sizeof(TYPE *));
+	allocated_memory_extra += nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *));
+
+	for (k = 0; k < nblocks; k++)
+	{
+		if (tmp_11_block_is_needed(rank, nblocks, k))
+		{
+			starpu_malloc((void **)&tmp_11_block[k], blocksize);
+			allocated_memory_extra += blocksize;
+			STARPU_ASSERT(tmp_11_block[k]);
+
+			starpu_matrix_data_register(&tmp_11_block_handles[k], 0,
+				(uintptr_t)tmp_11_block[k],
+				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
+		}
+	}
+#endif
+
+	/* tmp buffers 12 and 21 */
+#ifdef SINGLE_TMP1221
+	tmp_12_block_handles = calloc(nblocks, sizeof(starpu_data_handle_t));
+	tmp_21_block_handles = calloc(nblocks, sizeof(starpu_data_handle_t));
+	tmp_12_block = calloc(nblocks, sizeof(TYPE *));
+	tmp_21_block = calloc(nblocks, sizeof(TYPE *));
+
+	allocated_memory_extra += 2*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *));
+#else
+	/* Double-buffered variant: two full sets, alternated on k%2. */
+	for (i = 0; i < 2; i++) {
+		tmp_12_block_handles[i] = calloc(nblocks, sizeof(starpu_data_handle_t));
+		tmp_21_block_handles[i] = calloc(nblocks, sizeof(starpu_data_handle_t));
+		tmp_12_block[i] = calloc(nblocks, sizeof(TYPE *));
+		tmp_21_block[i] = calloc(nblocks, sizeof(TYPE *));
+
+		allocated_memory_extra += 2*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *));
+	}
+#endif
+
+	for (k = 0; k < nblocks; k++)
+	{
+#ifdef SINGLE_TMP1221
+		if (tmp_12_block_is_needed(rank, nblocks, k))
+		{
+			starpu_malloc((void **)&tmp_12_block[k], blocksize);
+			allocated_memory_extra += blocksize;
+			STARPU_ASSERT(tmp_12_block[k]);
+
+			starpu_matrix_data_register(&tmp_12_block_handles[k], 0,
+				(uintptr_t)tmp_12_block[k],
+				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
+		}
+
+		if (tmp_21_block_is_needed(rank, nblocks, k))
+		{
+			starpu_malloc((void **)&tmp_21_block[k], blocksize);
+			allocated_memory_extra += blocksize;
+			STARPU_ASSERT(tmp_21_block[k]);
+
+			starpu_matrix_data_register(&tmp_21_block_handles[k], 0,
+				(uintptr_t)tmp_21_block[k],
+				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
+		}
+#else
+	for (i = 0; i < 2; i++) {
+		if (tmp_12_block_is_needed(rank, nblocks, k))
+		{
+			starpu_malloc((void **)&tmp_12_block[i][k], blocksize);
+			allocated_memory_extra += blocksize;
+			STARPU_ASSERT(tmp_12_block[i][k]);
+
+			starpu_matrix_data_register(&tmp_12_block_handles[i][k], 0,
+				(uintptr_t)tmp_12_block[i][k],
+				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
+		}
+
+		if (tmp_21_block_is_needed(rank, nblocks, k))
+		{
+			starpu_malloc((void **)&tmp_21_block[i][k], blocksize);
+			allocated_memory_extra += blocksize;
+			STARPU_ASSERT(tmp_21_block[i][k]);
+
+			starpu_matrix_data_register(&tmp_21_block_handles[i][k], 0,
+				(uintptr_t)tmp_21_block[i][k],
+				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
+		}
+	}
+#endif
+	}
+
+	//display_all_blocks(nblocks, size/nblocks);
+}
+
+/* Local pointer of block (i, j); STARPU_POISON_PTR if this rank does
+ * not own the block (see init_matrix()). */
+TYPE *STARPU_PLU(get_block)(unsigned i, unsigned j)
+{
+	return dataA[j+i*nblocks];
+}
+
+/* Owner rank of block (i, j) in the 2D block-cyclic distribution over
+ * the p x q processor grid. */
+int get_block_rank(unsigned i, unsigned j)
+{
+	/* Take a 2D block cyclic distribution */
+	/* NB: p (resp. q) is for "direction" i (resp. j) */
+	return (j % q) * p + (i % p);
+}
+
+/* StarPU handle of block (i, j); STARPU_POISON_PTR if not locally owned. */
+starpu_data_handle_t STARPU_PLU(get_block_handle)(unsigned i, unsigned j)
+{
+	return dataA_handles[j+i*nblocks];
+}
+
+/* Debug dump of the block-to-rank mapping plus local pointers/handles.
+ * No-op unless -display was given. Note: the rank==0 filter is commented
+ * out, so every rank prints its own view. */
+static void display_grid(int rank, unsigned pnblocks)
+{
+	if (!display)
+		return;
+
+	//if (rank == 0)
+	{
+		fprintf(stderr, "2D grid layout (Rank %d): \n", rank);
+
+		unsigned i, j;
+		for (j = 0; j < pnblocks; j++)
+		{
+			for (i = 0; i < pnblocks; i++)
+			{
+				TYPE *blockptr = STARPU_PLU(get_block)(i, j);
+				starpu_data_handle_t handle = STARPU_PLU(get_block_handle)(i, j);
+
+				fprintf(stderr, "%d (data %p handle %p)", get_block_rank(i, j), blockptr, handle);
+			}
+			fprintf(stderr, "\n");
+		}
+	}
+}
+
+/* Driver: initialize MPI (serialized thread support) and StarPU, build
+ * the distributed matrix, run the LU factorization, report min/max/avg
+ * timings and synthetic GFlop/s on rank 0, and optionally (-check)
+ * verify || A - LU ||. */
+int main(int argc, char **argv)
+{
+	int rank;
+	int world_size;
+
+	/*
+	 *	Initialization
+	 */
+	int thread_support;
+	if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS) {
+		fprintf(stderr,"MPI_Init_thread failed\n");
+		exit(1);
+	}
+	/* Degraded thread support is tolerated with a warning only. */
+	if (thread_support == MPI_THREAD_FUNNELED)
+		fprintf(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n");
+	if (thread_support < MPI_THREAD_FUNNELED)
+		fprintf(stderr,"Warning: MPI does not have thread support!\n");
+
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+
+	starpu_srand48((long int)time(NULL));
+
+	parse_args(rank, argc, argv);
+
+	int ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	/* We disable sequential consistency in this example */
+	starpu_data_set_default_sequential_consistency_flag(0);
+
+	starpu_mpi_init(NULL, NULL, 0);
+
+	/* The processor grid must cover the communicator exactly. */
+	STARPU_ASSERT(p*q == world_size);
+
+	starpu_cublas_init();
+
+	int barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
+	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
+
+	/*
+	 * 	Problem Init
+	 */
+
+	init_matrix(rank);
+
+	fprintf(stderr, "Rank %d: allocated (%d + %d) MB = %d MB\n", rank,
+                        (int)(allocated_memory/(1024*1024)),
+			(int)(allocated_memory_extra/(1024*1024)),
+                        (int)((allocated_memory+allocated_memory_extra)/(1024*1024)));
+
+	display_grid(rank, nblocks);
+
+	TYPE *a_r = NULL;
+//	STARPU_PLU(display_data_content)(a_r, size);
+
+	TYPE *x, *y;
+
+	if (check)
+	{
+		/* x is only filled with meaningful data on rank 0; the
+		 * reconstructed matrix a_r is gathered there too. */
+		x = calloc(size, sizeof(TYPE));
+		STARPU_ASSERT(x);
+
+		y = calloc(size, sizeof(TYPE));
+		STARPU_ASSERT(y);
+
+		if (rank == 0)
+		{
+			unsigned ind;
+			for (ind = 0; ind < size; ind++)
+				x[ind] = (TYPE)starpu_drand48();
+		}
+
+		a_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);
+
+		if (rank == 0)
+			STARPU_PLU(display_data_content)(a_r, size);
+
+//		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
+	}
+
+	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
+	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
+
+	/* Run the distributed factorization; returns elapsed time in us. */
+	double timing = STARPU_PLU(plu_main)(nblocks, rank, world_size);
+
+	/*
+	 * 	Report performance
+	 */
+
+	int reduce_ret;
+	double min_timing = timing;
+	double max_timing = timing;
+	double sum_timing = timing;
+
+	reduce_ret = MPI_Reduce(&timing, &min_timing, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);
+
+	reduce_ret = MPI_Reduce(&timing, &max_timing, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);
+
+	reduce_ret = MPI_Reduce(&timing, &sum_timing, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);
+
+	if (rank == 0)
+	{
+		fprintf(stderr, "Computation took: %f ms\n", max_timing/1000);
+		fprintf(stderr, "\tMIN : %f ms\n", min_timing/1000);
+		fprintf(stderr, "\tMAX : %f ms\n", max_timing/1000);
+		fprintf(stderr, "\tAVG : %f ms\n", sum_timing/(world_size*1000));
+
+		/* Standard LU flop count: 2n^3/3. */
+		unsigned n = size;
+		double flop = (2.0f*n*n*n)/3.0f;
+		fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/max_timing/1000.0f));
+	}
+
+	/*
+	 *	Test Result Correctness
+	 */
+
+	if (check)
+	{
+		/*
+		 *	Compute || A - LU ||
+		 */
+
+		STARPU_PLU(compute_lu_matrix)(size, nblocks, a_r);
+
+#if 0
+		/* Dead code: y2 is not declared here, so this block would
+		 * not compile if enabled. Kept as a sketch of the
+		 * || Ax - LUx || check. */
+		/*
+		 *	Compute || Ax - LUx ||
+		 */
+
+		unsigned ind;
+
+		y2 = calloc(size, sizeof(TYPE));
+		STARPU_ASSERT(y);
+
+		if (rank == 0)
+		{
+			for (ind = 0; ind < size; ind++)
+			{
+				y2[ind] = (TYPE)0.0;
+			}
+		}
+
+		STARPU_PLU(compute_lux)(size, x, y2, nblocks, rank);
+
+		/* Compute y2 = y2 - y */
+		CPU_AXPY(size, -1.0, y, 1, y2, 1);
+
+		TYPE err = CPU_ASUM(size, y2, 1);
+		int max = CPU_IAMAX(size, y2, 1);
+
+		fprintf(stderr, "(A - LU)X Avg error : %e\n", err/(size*size));
+		fprintf(stderr, "(A - LU)X Max error : %e\n", y2[max]);
+#endif
+	}
+
+	/*
+	 * 	Termination
+	 */
+
+	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
+	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
+
+	starpu_cublas_shutdown();
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+#if 0
+	/* MPI_Finalize is disabled — presumably starpu_mpi_shutdown()
+	 * finalizes MPI itself; confirm against the StarPU-MPI docs. */
+	MPI_Finalize();
+#endif
+
+	return 0;
+}

+ 19 - 0
nmad/examples/mpi_lu/plu_example_double.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-double.h"
+#include "plu_example.c"

+ 19 - 0
nmad/examples/mpi_lu/plu_example_float.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-float.h"
+#include "plu_example.c"

+ 393 - 0
nmad/examples/mpi_lu/plu_solve.c

@@ -0,0 +1,393 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <math.h>
+#include "pxlu.h"
+
+/*
+ *	Various useful functions
+ */
+
+/* Frobenius norm of an n x n matrix stored contiguously:
+ * sqrt(sum of squared magnitudes of all entries).
+ * NOTE(review): fabsl (long double) on a double value is harmless but
+ * fabs would be the matching function. */
+static double frobenius_norm(TYPE *v, unsigned n)
+{
+	double sum2 = 0.0;
+
+	/* compute sqrt(Sum(|x|^2)) */
+
+	unsigned i,j;
+	for (j = 0; j < n; j++)
+		for (i = 0; i < n; i++)
+		{
+			double a = fabsl((double)v[i+n*j]);
+			sum2 += a*a;
+		}
+
+	return sqrt(sum2);
+}
+
+/* Dump a blocksize x blocksize tile to stderr, one matrix line per text
+ * line. No-op unless the -display flag was given. */
+void STARPU_PLU(display_data_content)(TYPE *data, unsigned blocksize)
+{
+	if (!STARPU_PLU(display_flag)())
+		return;
+
+	fprintf(stderr, "DISPLAY BLOCK\n");
+
+	unsigned i, j;
+	for (j = 0; j < blocksize; j++)
+	{
+		for (i = 0; i < blocksize; i++)
+		{
+			fprintf(stderr, "%f ", data[j+i*blocksize]);
+		}
+		fprintf(stderr, "\n");
+	}
+
+	fprintf(stderr, "****\n");
+}
+
+/* Copy the "upper" factor out of a packed LU tile into outblock:
+ * the diagonal is set to 1 (unit diagonal) and the entries with li > lj
+ * are copied verbatim; the remaining entries of outblock are left
+ * untouched (callers pass a zeroed buffer). */
+void STARPU_PLU(extract_upper)(unsigned block_size, TYPE *inblock, TYPE *outblock)
+{
+	unsigned li, lj;
+	for (lj = 0; lj < block_size; lj++)
+	{
+		/* Upper block diag is 1 */
+		outblock[lj*(block_size + 1)] = (TYPE)1.0;
+
+		for (li = lj + 1; li < block_size; li++)
+		{
+			outblock[lj + li*block_size] = inblock[lj + li*block_size];
+		}
+	}
+}
+
+/* Copy the "lower" factor out of a packed LU tile into outblock:
+ * entries with li <= lj (diagonal included) are copied verbatim; the
+ * rest of outblock is left untouched (callers pass a zeroed buffer). */
+void STARPU_PLU(extract_lower)(unsigned block_size, TYPE *inblock, TYPE *outblock)
+{
+	unsigned li, lj;
+	for (lj = 0; lj < block_size; lj++)
+	{
+		for (li = 0; li <= lj; li++)
+		{
+			outblock[lj + li*block_size] = inblock[lj + li*block_size];
+		}
+	}
+}
+
+/*
+ *	Compute Ax = y
+ */
+
+/* sub_y += block_data * sub_x for one block_size x block_size tile,
+ * via the BLAS GEMV wrapper (beta = 1.0, so contributions accumulate).
+ * The fprintf is debug tracing — it fires on every call. */
+static void STARPU_PLU(compute_ax_block)(unsigned block_size, TYPE *block_data, TYPE *sub_x, TYPE *sub_y)
+{
+	fprintf(stderr, "block data %p sub x %p sub y %p\n", block_data, sub_x, sub_y);
+	CPU_GEMV("N", block_size, block_size, 1.0, block_data, block_size, sub_x, 1, 1.0, sub_y, 1);
+}
+
+/* As compute_ax_block(), but for a diagonal tile: only the upper factor
+ * (with unit diagonal) of the packed LU tile participates. Works on a
+ * zero-initialized temporary copy so block_data is not modified. */
+static void STARPU_PLU(compute_ax_block_upper)(unsigned size, unsigned nblocks,
+				 TYPE *block_data, TYPE *sub_x, TYPE *sub_y)
+{
+	unsigned block_size = size/nblocks;
+
+	/* Take a copy of the upper part of the diagonal block */
+	TYPE *upper_block_copy = calloc((block_size)*(block_size), sizeof(TYPE));
+	STARPU_PLU(extract_upper)(block_size, block_data, upper_block_copy);
+
+	STARPU_PLU(compute_ax_block)(block_size, upper_block_copy, sub_x, sub_y);
+
+	free(upper_block_copy);
+}
+
+/* As compute_ax_block(), but for a diagonal tile: only the lower factor
+ * of the packed LU tile participates. Works on a zero-initialized
+ * temporary copy so block_data is not modified. */
+static void STARPU_PLU(compute_ax_block_lower)(unsigned size, unsigned nblocks,
+				 TYPE *block_data, TYPE *sub_x, TYPE *sub_y)
+{
+	/* Take a copy of the lower part of the diagonal block */
+	TYPE *lower_block_copy = calloc((block_size)*(block_size), sizeof(TYPE));
+	STARPU_PLU(extract_lower)(block_size, block_data, lower_block_copy);
+
+	STARPU_PLU(compute_ax_block)(size/nblocks, lower_block_copy, sub_x, sub_y);
+
+	free(lower_block_copy);
+}
+
+/* Compute y = L * (U * x), i.e. (LU)x, on the distributed factored
+ * matrix. Each rank accumulates the contribution of its own blocks into
+ * a private buffer yi; partial vectors are summed onto rank 0 with
+ * MPI_Reduce. x and y need only be meaningful on rank 0; x is
+ * overwritten with U*x as an intermediate. */
+void STARPU_PLU(compute_lux)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, int rank)
+{
+	/* Create temporary buffers where all MPI processes are going to
+	 * compute Ui x = yi where Ai is the matrix containing the blocks of U
+	 * affected to process i, and 0 everywhere else. We then have y as the
+	 * sum of all yi. */
+	TYPE *yi = calloc(size, sizeof(TYPE));
+
+	fprintf(stderr, "Compute LU\n");
+
+	unsigned block_size = size/nblocks;
+
+	/* Compute UiX = Yi */
+	unsigned long i,j;
+	for (j = 0; j < nblocks; j++)
+	{
+		if (get_block_rank(j, j) == rank)
+		{
+			/* Diagonal tile: only its upper (unit-diagonal) factor. */
+			TYPE *block_data = STARPU_PLU(get_block)(j, j);
+			TYPE *sub_x = &x[j*(block_size)];
+			TYPE *sub_yi = &yi[j*(block_size)];
+
+			STARPU_PLU(compute_ax_block_upper)(size, nblocks, block_data, sub_x, sub_yi);
+		}
+
+		for (i = j + 1; i < nblocks; i++)
+		{
+			if (get_block_rank(i, j) == rank)
+			{
+				/* That block belongs to the current MPI process */
+				TYPE *block_data = STARPU_PLU(get_block)(i, j);
+				TYPE *sub_x = &x[i*(block_size)];
+				TYPE *sub_yi = &yi[j*(block_size)];
+
+				STARPU_PLU(compute_ax_block)(size/nblocks, block_data, sub_x, sub_yi);
+			}
+		}
+	}
+
+	/* Grab Sum Yi in X */
+	MPI_Reduce(yi, x, size, MPI_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
+	memset(yi, 0, size*sizeof(TYPE));
+
+//	unsigned ind;
+//	if (rank == 0)
+//	{
+//		fprintf(stderr, "INTERMEDIATE\n");
+//		for (ind = 0; ind < STARPU_MIN(10, size); ind++)
+//		{
+//			fprintf(stderr, "x[%d] = %f\n", ind, (float)x[ind]);
+//		}
+//		fprintf(stderr, "****\n");
+//	}
+
+	/* Everyone needs x */
+	int bcst_ret;
+	/* BUG FIX: broadcast the data buffer x itself, not &x (the address
+	 * of the local pointer variable), which would read/overwrite stack
+	 * memory instead of the vector. */
+	bcst_ret = MPI_Bcast(x, size, MPI_TYPE, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(bcst_ret == MPI_SUCCESS);
+
+	/* Compute LiX = Yi (with X = UX) */
+	for (j = 0; j < nblocks; j++)
+	{
+		if (j > 0)
+		for (i = 0; i < j; i++)
+		{
+			if (get_block_rank(i, j) == rank)
+			{
+				/* That block belongs to the current MPI process */
+				TYPE *block_data = STARPU_PLU(get_block)(i, j);
+				TYPE *sub_x = &x[i*(block_size)];
+				TYPE *sub_yi = &yi[j*(block_size)];
+
+				STARPU_PLU(compute_ax_block)(size/nblocks, block_data, sub_x, sub_yi);
+			}
+		}
+
+		if (get_block_rank(j, j) == rank)
+		{
+			/* Diagonal tile: only its lower factor. */
+			TYPE *block_data = STARPU_PLU(get_block)(j, j);
+			TYPE *sub_x = &x[j*(block_size)];
+			TYPE *sub_yi = &yi[j*(block_size)];
+
+			STARPU_PLU(compute_ax_block_lower)(size, nblocks, block_data, sub_x, sub_yi);
+		}
+	}
+
+	/* Grab Sum Yi in Y */
+	MPI_Reduce(yi, y, size, MPI_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
+
+	free(yi);
+}
+
+
+
+/*
+ *	Allocate a contiguous matrix on node 0 and fill it with the whole
+ *	content of the matrix distributed accross all nodes.
+ */
+
+/* Gather the whole distributed matrix into one size x size buffer.
+ * Collective: every rank must call it. Remote blocks are sent to rank 0
+ * with plain MPI point-to-point messages; on ranks != 0 the returned
+ * buffer stays (mostly) zero. Caller frees the result.
+ * NOTE(review): on ranks that are neither 0 nor the block owner,
+ * `block` stays uninitialized — harmless today because it is only read
+ * under `rank == 0`, but fragile. */
+TYPE *STARPU_PLU(reconstruct_matrix)(unsigned size, unsigned nblocks)
+{
+//	fprintf(stderr, "RECONSTRUCT MATRIX size %d nblocks %d\n", size, nblocks);
+
+	TYPE *bigmatrix = calloc(size*size, sizeof(TYPE));
+
+	unsigned block_size = size/nblocks;
+
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+	unsigned bi, bj;
+	for (bj = 0; bj < nblocks; bj++)
+	for (bi = 0; bi < nblocks; bi++)
+	{
+		TYPE *block;
+
+		int block_rank = get_block_rank(bi, bj);
+
+		if (block_rank == 0)
+		{
+			block = STARPU_PLU(get_block)(bi, bj);
+		}
+		else {
+			MPI_Status status;
+
+			if (rank == 0)
+			{
+				/* Receive the remote block into a scratch buffer. */
+				block = calloc(block_size*block_size, sizeof(TYPE));
+
+				int ret = MPI_Recv(block, block_size*block_size, MPI_TYPE, block_rank, 0, MPI_COMM_WORLD, &status);
+				STARPU_ASSERT(ret == MPI_SUCCESS);
+			}
+			else if (rank == block_rank) {
+				block = STARPU_PLU(get_block)(bi, bj);
+				int ret = MPI_Send(block, block_size*block_size, MPI_TYPE, 0, 0, MPI_COMM_WORLD);
+				STARPU_ASSERT(ret == MPI_SUCCESS);
+			}
+		}
+
+		if (rank == 0)
+		{
+			/* Scatter the tile into its place in the full matrix. */
+			unsigned j, i;
+			for (j = 0; j < block_size; j++)
+			for (i = 0; i < block_size; i++)
+			{
+				bigmatrix[(j + bj*block_size)+(i+bi*block_size)*size] =
+									block[j+i*block_size];
+			}
+
+			if (get_block_rank(bi, bj) != 0)
+				free(block);
+		}
+	}
+
+	return bigmatrix;
+}
+
+/* x and y must be valid (at least) on 0 */
+/* Compute y = A*x on the distributed (unfactored) matrix. x is
+ * broadcast from rank 0; every rank accumulates the contribution of its
+ * own blocks into a private buffer yi, and the partial vectors are
+ * summed onto rank 0 into y. Collective: all ranks must call it. */
+void STARPU_PLU(compute_ax)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, int rank)
+{
+	unsigned block_size = size/nblocks;
+
+	/* Send x to everyone */
+	int bcst_ret;
+	/* BUG FIX: broadcast the data buffer x itself, not &x (the address
+	 * of the local pointer variable), which would read/overwrite stack
+	 * memory instead of the vector. */
+	bcst_ret = MPI_Bcast(x, size, MPI_TYPE, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(bcst_ret == MPI_SUCCESS);
+
+	/* Create temporary buffers where all MPI processes are going to
+	 * compute Ai x = yi where Ai is the matrix containing the blocks of A
+	 * affected to process i, and 0 everywhere else. We then have y as the
+	 * sum of all yi. */
+	TYPE *yi = calloc(size, sizeof(TYPE));
+
+	/* Compute Aix = yi */
+	unsigned long i,j;
+	for (j = 0; j < nblocks; j++)
+	{
+		for (i = 0; i < nblocks; i++)
+		{
+			if (get_block_rank(i, j) == rank)
+			{
+				/* That block belongs to the current MPI process */
+				TYPE *block_data = STARPU_PLU(get_block)(i, j);
+				TYPE *sub_x = &x[i*block_size];
+				TYPE *sub_yi = &yi[j*block_size];
+
+				STARPU_PLU(compute_ax_block)(block_size, block_data, sub_x, sub_yi);
+			}
+		}
+	}
+
+	/* Compute the Sum of all yi = y */
+	MPI_Reduce(yi, y, size, MPI_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
+
+	fprintf(stderr, "RANK %d - FOO 1 y[0] %f\n", rank, y[0]);
+
+	free(yi);
+}
+
+/* Verify the factorization: gather the factored matrix, split it on
+ * rank 0 into the lower factor L (diagonal kept) and the unit-diagonal
+ * upper factor U, recompute L*U with BLAS TRMM, subtract the saved
+ * original matrix Asaved, and report average / max entry error and the
+ * relative Frobenius residual. Collective (reconstruct_matrix), but all
+ * reporting happens on rank 0.
+ * NOTE(review): the two malloc results are not checked before use. */
+void STARPU_PLU(compute_lu_matrix)(unsigned size, unsigned nblocks, TYPE *Asaved)
+{
+	TYPE *all_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);
+
+	unsigned display = STARPU_PLU(display_flag)();
+
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+	if (rank == 0)
+	{
+		TYPE *L = malloc((size_t)size*size*sizeof(TYPE));
+		TYPE *U = malloc((size_t)size*size*sizeof(TYPE));
+
+		memset(L, 0, size*size*sizeof(TYPE));
+		memset(U, 0, size*size*sizeof(TYPE));
+
+		/* only keep the lower part */
+		unsigned i, j;
+		for (j = 0; j < size; j++)
+		{
+			for (i = 0; i < j; i++)
+			{
+				L[j+i*size] = all_r[j+i*size];
+			}
+
+			/* diag i = j */
+			L[j+j*size] = all_r[j+j*size];
+			U[j+j*size] = 1.0;
+
+			for (i = j+1; i < size; i++)
+			{
+				U[j+i*size] = all_r[j+i*size];
+			}
+		}
+
+		STARPU_PLU(display_data_content)(L, size);
+		STARPU_PLU(display_data_content)(U, size);
+
+		/* now A_err = L, compute L*U */
+		CPU_TRMM("R", "U", "N", "U", size, size, 1.0f, U, size, L, size);
+
+		if (display)
+			fprintf(stderr, "\nLU\n");
+
+		STARPU_PLU(display_data_content)(L, size);
+
+		/* compute "LU - A" in L*/
+		CPU_AXPY(size*size, -1.0, Asaved, 1, L, 1);
+
+		/* err: 1-norm of the error; max: index of its largest entry. */
+		TYPE err = CPU_ASUM(size*size, L, 1);
+		int max = CPU_IAMAX(size*size, L, 1);
+
+		if (display)
+			fprintf(stderr, "DISPLAY ERROR\n");
+
+		STARPU_PLU(display_data_content)(L, size);
+
+		fprintf(stderr, "(A - LU) Avg error : %e\n", err/(size*size));
+		fprintf(stderr, "(A - LU) Max error : %e\n", L[max]);
+
+		double residual = frobenius_norm(L, size);
+		double matnorm = frobenius_norm(Asaved, size);
+
+		fprintf(stderr, "||A-LU|| / (||A||*N) : %e\n", residual/(matnorm*size));
+	}
+}

+ 19 - 0
nmad/examples/mpi_lu/plu_solve_double.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-double.h"
+#include "plu_solve.c"

+ 19 - 0
nmad/examples/mpi_lu/plu_solve_float.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-float.h"
+#include "plu_solve.c"

+ 19 - 0
nmad/examples/mpi_lu/pslu.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-float.h"
+#include "pxlu.c"

+ 19 - 0
nmad/examples/mpi_lu/pslu_kernels.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-float.h"
+#include "pxlu_kernels.c"

+ 870 - 0
nmad/examples/mpi_lu/pxlu.c

@@ -0,0 +1,870 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "pxlu.h"
+#include "pxlu_kernels.h"
+#include <sys/time.h>
+
+#define MPI_TAG11(k)	((1U << 16) | (k))
+#define MPI_TAG12(k, j)	((2U << 16) | (k)<<8 | (j))
+#define MPI_TAG21(k, i)	((3U << 16) | (i)<<8 | (k))
+
+// 11 21
+// 12 22
+
+#define TAG11(k)	((starpu_tag_t)( (1ULL<<50) | (unsigned long long)(k)))
+#define TAG12(k,j)	((starpu_tag_t)(((2ULL<<50) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(j))))
+#define TAG21(k,i)	((starpu_tag_t)(((3ULL<<50) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(i))))
+#define TAG22(k,i,j)	((starpu_tag_t)(((4ULL<<50) | ((unsigned long long)(k)<<32) 	\
+					| ((unsigned long long)(i)<<16)	\
+					| (unsigned long long)(j))))
+#define TAG11_SAVE(k)	((starpu_tag_t)( (5ULL<<50) | (unsigned long long)(k)))
+#define TAG12_SAVE(k,j)	((starpu_tag_t)(((6ULL<<50) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(j))))
+#define TAG21_SAVE(k,i)	((starpu_tag_t)(((7ULL<<50) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(i))))
+
+#define TAG11_SAVE_PARTIAL(k)	((starpu_tag_t)( (8ULL<<50) | (unsigned long long)(k)))
+#define TAG12_SAVE_PARTIAL(k,j)	((starpu_tag_t)(((9ULL<<50) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(j))))
+#define TAG21_SAVE_PARTIAL(k,i)	((starpu_tag_t)(((10ULL<<50) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(i))))
+
+#define STARPU_TAG_INIT	((starpu_tag_t)(11ULL<<50))
+
+//#define VERBOSE_INIT	1
+
+//#define DEBUG	1
+
+static unsigned no_prio = 0;
+
+static unsigned nblocks = 0;
+static int rank = -1;
+static int world_size = -1;
+
+struct callback_arg {
+	unsigned i, j, k;
+};
+
+/*
+ *	Various
+ */
+
+static struct debug_info *create_debug_info(unsigned i, unsigned j, unsigned k)
+{
+	struct debug_info *info = malloc(sizeof(struct debug_info));
+
+	info->i = i;
+	info->j = j;
+	info->k = k;
+
+	return info;
+}
+
+static struct starpu_task *create_task(starpu_tag_t id)
+{
+	struct starpu_task *task = starpu_task_create();
+		task->cl_arg = NULL;
+
+	task->use_tag = 1;
+	task->tag_id = id;
+
+	return task;
+}
+
+/* Send handle to every node appearing in the mask, and unlock tag once the
+ * transfers are done. */
+static void send_data_to_mask(starpu_data_handle_t handle, int *rank_mask, int mpi_tag, starpu_tag_t tag)
+{
+	unsigned cnt = 0;
+
+	STARPU_ASSERT(handle != STARPU_POISON_PTR);
+
+	int rank_array[world_size];
+	MPI_Comm comm_array[world_size];
+	int mpi_tag_array[world_size];
+	starpu_data_handle_t handle_array[world_size];
+
+	int r;
+	for (r = 0; r < world_size; r++)
+	{
+		if (rank_mask[r]) {
+			rank_array[cnt] = r;
+
+			comm_array[cnt] = MPI_COMM_WORLD;
+			mpi_tag_array[cnt] = mpi_tag;
+			handle_array[cnt] = handle;
+			cnt++;
+		}
+	}
+
+	if (cnt == 0)
+	{
+		/* In case there is no message to send, we release the tag at
+		 * once */
+		starpu_tag_notify_from_apps(tag);
+	}
+	else {
+		starpu_mpi_isend_array_detached_unlock_tag(cnt, handle_array,
+				rank_array, mpi_tag_array, comm_array, tag);
+	}
+}
+
+/* Initiate a receive request once all dependencies are fulfilled and unlock
+ * tag 'unlocked_tag' once it's done. */
+
/* Argument bundle for callback_receive_when_done(); heap-allocated by
 * receive_when_deps_are_done() and freed by the callback. */
struct recv_when_done_callback_arg {
	int source;		/* MPI rank the block is received from */
	int mpi_tag;		/* MPI message tag for the transfer */
	starpu_data_handle_t handle;	/* destination (temporary) buffer */
	starpu_tag_t unlocked_tag;	/* StarPU tag notified when the recv completes */
};

/* Fired once all local consumers of the temporary buffer are done: post the
 * detached MPI receive, which will notify 'unlocked_tag' on completion. */
static void callback_receive_when_done(void *_arg)
{
	struct recv_when_done_callback_arg *arg = _arg;

	starpu_mpi_irecv_detached_unlock_tag(arg->handle, arg->source,
			arg->mpi_tag, MPI_COMM_WORLD, arg->unlocked_tag);

	/* The argument was allocated by receive_when_deps_are_done(); this
	 * callback owns it and must release it. */
	free(arg);
}

/* Post a receive of 'handle' from 'source' only after the 'ndeps' tags in
 * 'deps_tags' are all released (they guard reuse of the temporary buffer).
 * 'partial_tag' names the synchronization task used to wait for those deps;
 * 'unlocked_tag' is notified once the data has actually arrived.
 * With ndeps == 0 the receive is posted immediately. */
static void receive_when_deps_are_done(unsigned ndeps, starpu_tag_t *deps_tags,
				int source, int mpi_tag,
				starpu_data_handle_t handle,
				starpu_tag_t partial_tag,
				starpu_tag_t unlocked_tag)
{
	STARPU_ASSERT(handle != STARPU_POISON_PTR);

	struct recv_when_done_callback_arg *arg =
		malloc(sizeof(struct recv_when_done_callback_arg));
	
	arg->source = source;
	arg->mpi_tag = mpi_tag;
	arg->handle = handle;
	arg->unlocked_tag = unlocked_tag;

	if (ndeps == 0)
	{
		/* Nothing guards the buffer: receive right away (the callback
		 * frees 'arg'). */
		callback_receive_when_done(arg);
		return;
	}

	/* Empty synchronization task whose callback posts the receive once
	 * every dependency tag is released. */
	starpu_create_sync_task(partial_tag, ndeps, deps_tags,
					callback_receive_when_done, arg);
}
+
+/*
+ *	Task 11 (diagonal factorization)
+ */
+
/* Reception side for the diagonal block 11_k: this node does not compute it,
 * so the factorized block is received over MPI into a temporary buffer once
 * every local consumer of that buffer's previous content is done. */
static void create_task_11_recv(unsigned k)
{
	/* The current node is not computing that task, so we receive the block
	 * with MPI */

	/* We don't issue a MPI receive request until everyone using the
	 * temporary buffer is done : 11_(k-1) can be used by 12_(k-1)j and
	 * 21(k-1)i with i,j >= k */
	unsigned ndeps = 0;
	starpu_tag_t tag_array[2*nblocks];
	
#ifdef SINGLE_TMP11
	/* A single temporary 11 buffer is shared across iterations, so the
	 * local users of iteration k-1 must have finished before reuse. */
	unsigned i, j;
	if (k > 0)
	for (i = (k-1)+1; i < nblocks; i++)
	{
		if (rank == get_block_rank(i, k-1))
			tag_array[ndeps++] = TAG21(k-1, i);
	}

	if (k > 0)
	for (j = (k-1)+1; j < nblocks; j++)
	{
		if (rank == get_block_rank(k-1, j))
			tag_array[ndeps++] = TAG12(k-1, j);
	}
#endif
	
	int source = get_block_rank(k, k);
#ifdef SINGLE_TMP11
	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_11_block_handle)();
#else
	/* One temporary buffer per iteration: no reuse dependencies needed. */
	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_11_block_handle)(k);
#endif
	int mpi_tag = MPI_TAG11(k);
	starpu_tag_t partial_tag = TAG11_SAVE_PARTIAL(k);
	starpu_tag_t unlocked_tag = TAG11_SAVE(k);

//	fprintf(stderr, "NODE %d - 11 (%d) - recv when done ndeps %d - tag array %lx\n", rank, k, ndeps, tag_array[0]);
	receive_when_deps_are_done(ndeps, tag_array, source, mpi_tag, block_handle, partial_tag, unlocked_tag);
}

/* Mark in 'rank_mask' (size world_size, zeroed here) every node that owns a
 * block updated from 11_k, i.e. the k-th row and k-th column tails. */
static void find_nodes_using_11(unsigned k, int *rank_mask)
{
	memset(rank_mask, 0, world_size*sizeof(int));

	/* Block 11_k is used to compute 12_kj + 12ki with i,j > k */
	unsigned i;
	for (i = k+1; i < nblocks; i++)
	{
		int r = get_block_rank(i, k);
		rank_mask[r] = 1;
	}

	unsigned j;
	for (j = k+1; j < nblocks; j++)
	{
		int r = get_block_rank(k, j);
		rank_mask[r] = 1;
	}
}

/* Completion callback of the local 11_k task: broadcast the factorized
 * diagonal block to every remote node that needs it. */
static void callback_task_11_real(void *_arg)
{
	struct callback_arg *arg = _arg;

	unsigned k = arg->k;

	/* Find all the nodes potentially requiring this block */
	int rank_mask[world_size];
	find_nodes_using_11(k, rank_mask);
	/* Do not send the block to ourselves. */
	rank_mask[rank] = 0;

	/* Send the block to those nodes */
	starpu_data_handle_t block_handle = STARPU_PLU(get_block_handle)(k, k);
	starpu_tag_t tag = TAG11_SAVE(k);
	int mpi_tag = MPI_TAG11(k);
	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
	
	free(arg);
}

/* Submit the local diagonal factorization task 11_k (getrf on block (k,k)).
 * Depends on 22_(k-1)kk, or on the global INIT tag for k == 0. */
static void create_task_11_real(unsigned k)
{
	struct starpu_task *task = create_task(TAG11(k));

	task->cl = &STARPU_PLU(cl11);

	task->cl_arg = create_debug_info(k, k, k);

	/* which sub-data is manipulated ? */
	task->handles[0] = STARPU_PLU(get_block_handle)(k, k);

	struct callback_arg *arg = malloc(sizeof(struct callback_arg));
		arg->k = k;

	task->callback_func = callback_task_11_real;
	task->callback_arg = arg;

	/* this is an important task */
	if (!no_prio)
		task->priority = STARPU_MAX_PRIO;

	/* enforce dependencies ... */
	if (k > 0) {
		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
	}
	else {
		starpu_tag_declare_deps(TAG11(k), 1, STARPU_TAG_INIT);
	}

	int ret = starpu_task_submit(task);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
}

/* Dispatch for step k of the diagonal: run 11_k locally if this node owns
 * block (k,k), otherwise post a receive if any local block consumes it. */
static void create_task_11(unsigned k)
{
	if (get_block_rank(k, k) == rank)
	{
#ifdef VERBOSE_INIT
		fprintf(stderr, "CREATE real task 11(%d) (TAG11_SAVE(%d) = %lx) on node %d\n", k, k, TAG11_SAVE(k), rank);
#endif
		create_task_11_real(k);
	}
	else {
		/* We don't handle the task, but perhaps we have to generate MPI transfers. */
		int rank_mask[world_size];
		find_nodes_using_11(k, rank_mask);
		
		if (rank_mask[rank])
		{
#ifdef VERBOSE_INIT
			fprintf(stderr, "create RECV task 11(%d) on node %d\n", k, rank);
#endif
			create_task_11_recv(k);
		}
		else {
#ifdef VERBOSE_INIT
			fprintf(stderr, "Node %d needs not 11(%d)\n", rank, k);
#endif
		}
	}
}
+
+
+
+/*
+ *	Task 12 (Update lower left (TRSM))
+ */
+
/* Reception side for block 12_kj: posted only once local 22 consumers of the
 * temporary buffer's previous content are done.  Note: without
 * SINGLE_TMP1221 there are two buffers per (j) rotated across iterations,
 * hence the k-2 generation in the dependency tags. */
static void create_task_12_recv(unsigned k, unsigned j)
{
	unsigned i;

	/* The current node is not computing that task, so we receive the block
	 * with MPI */

	/* We don't issue a MPI receive request until everyone using the
	 * temporary buffer is done : 12_(k-1)j can be used by 22_(k-1)ij with
	 * i >= k */
	unsigned ndeps = 0;
	starpu_tag_t tag_array[nblocks];
	
#ifdef SINGLE_TMP1221
	if (k > 0)
	for (i = (k-1)+1; i < nblocks; i++)
#else
	if (k > 1)
	for (i = (k-2)+1; i < nblocks; i++)
#endif
	{
		if (rank == get_block_rank(i, j))
#ifdef SINGLE_TMP1221
			tag_array[ndeps++] = TAG22(k-1, i, j);
#else
			tag_array[ndeps++] = TAG22(k-2, i, j);
#endif
	}
	
	int source = get_block_rank(k, j);
#ifdef SINGLE_TMP1221
	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_12_block_handle)(j);
#else
	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_12_block_handle)(j,k);
#endif
	int mpi_tag = MPI_TAG12(k, j);
	starpu_tag_t partial_tag = TAG12_SAVE_PARTIAL(k, j);
	starpu_tag_t unlocked_tag = TAG12_SAVE(k, j);

	receive_when_deps_are_done(ndeps, tag_array, source, mpi_tag, block_handle, partial_tag, unlocked_tag);
}

/* Mark in 'rank_mask' every node owning a 22_kij block (i > k) that is
 * updated from 12_kj. */
static void find_nodes_using_12(unsigned k, unsigned j, int *rank_mask)
{
	memset(rank_mask, 0, world_size*sizeof(int));

	/* Block 12_kj is used to compute 22_kij with i > k */
	unsigned i;
	for (i = k+1; i < nblocks; i++)
	{
		int r = get_block_rank(i, j);
		rank_mask[r] = 1;
	}
}

/* Completion callback of the local 12_kj task: broadcast the updated block
 * to the remote nodes that consume it. */
static void callback_task_12_real(void *_arg)
{
	struct callback_arg *arg = _arg;

	unsigned k = arg->k;
	unsigned j = arg->j;

	/* Find all the nodes potentially requiring this block */
	int rank_mask[world_size];
	find_nodes_using_12(k, j, rank_mask);
	rank_mask[rank] = 0;

	/* Send the block to those nodes */
	starpu_data_handle_t block_handle = STARPU_PLU(get_block_handle)(k, j);
	starpu_tag_t tag = TAG12_SAVE(k, j);
	int mpi_tag = MPI_TAG12(k, j);
	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
	
	free(arg);
}

/* Submit the local TRSM task 12_kj on block (k,j), using the 11_k diagonal
 * block (local handle or received temporary).
 * NOTE(review): cl21 is deliberately used here (see the #warning below); the
 * 12/21 codelets are swapped consistently with pxlu_kernels.c — do not
 * "fix" one side without the other. */
static void create_task_12_real(unsigned k, unsigned j)
{
	struct starpu_task *task = create_task(TAG12(k, j));
	
#warning temporary fix :/
//	task->cl = &STARPU_PLU(cl12);
	task->cl = &STARPU_PLU(cl21);

	task->cl_arg = create_debug_info(j, j, k);

	unsigned diag_block_is_local = (get_block_rank(k, k) == rank);

	starpu_tag_t tag_11_dep; 

	/* which sub-data is manipulated ? */
	starpu_data_handle_t diag_block;
	if (diag_block_is_local)
	{
		diag_block = STARPU_PLU(get_block_handle)(k, k);
		tag_11_dep = TAG11(k);
	}
	else 
	{
#ifdef SINGLE_TMP11
		diag_block = STARPU_PLU(get_tmp_11_block_handle)();
#else
		diag_block = STARPU_PLU(get_tmp_11_block_handle)(k);
#endif
		/* Depend on the completed reception instead of the local task. */
		tag_11_dep = TAG11_SAVE(k);
	}

	task->handles[0] = diag_block; 
	task->handles[1] = STARPU_PLU(get_block_handle)(k, j); 

	STARPU_ASSERT(get_block_rank(k, j) == rank);

	STARPU_ASSERT(task->handles[0] != STARPU_POISON_PTR);
	STARPU_ASSERT(task->handles[1] != STARPU_POISON_PTR);

	struct callback_arg *arg = malloc(sizeof(struct callback_arg));
		arg->j = j;
		arg->k = k;

	task->callback_func = callback_task_12_real;
	task->callback_arg = arg;

	/* The block adjacent to the diagonal is on the critical path. */
	if (!no_prio && (j == k+1)) {
		task->priority = STARPU_MAX_PRIO;
	}

	/* enforce dependencies ... */
	if (k > 0) {
		starpu_tag_declare_deps(TAG12(k, j), 2, tag_11_dep, TAG22(k-1, k, j));
	}
	else {
		starpu_tag_declare_deps(TAG12(k, j), 1, tag_11_dep);
	}

	int ret = starpu_task_submit(task);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
}

/* Dispatch for 12_kj: run locally if this node owns block (k,j), otherwise
 * post a receive when a local 22 block consumes it. */
static void create_task_12(unsigned k, unsigned j)
{
	if (get_block_rank(k, j) == rank)
	{
#ifdef VERBOSE_INIT
		fprintf(stderr, "CREATE real task 12(k = %d, j = %d) on node %d\n", k, j, rank);
#endif
		create_task_12_real(k, j);
	}
	else {
		/* We don't handle the task, but perhaps we have to generate MPI transfers. */
		int rank_mask[world_size];
		find_nodes_using_12(k, j, rank_mask);
		
		if (rank_mask[rank])
		{
#ifdef VERBOSE_INIT
			fprintf(stderr, "create RECV task 12(k = %d, j = %d) on node %d\n", k, j, rank);
#endif
			create_task_12_recv(k, j);
		}
		else {
#ifdef VERBOSE_INIT
			fprintf(stderr, "Node %d needs not 12(k=%d, i=%d)\n", rank, k, j);
#endif
		}
	}
}
+
+/*
+ *	Task 21 (Update upper right (TRSM))
+ */
+
/* Reception side for block 21_ki: mirror of create_task_12_recv(), guarding
 * the temporary 21 buffer against reuse before local 22 consumers finish. */
static void create_task_21_recv(unsigned k, unsigned i)
{
	unsigned j;

	/* The current node is not computing that task, so we receive the block
	 * with MPI */

	/* We don't issue a MPI receive request until everyone using the
	 * temporary buffer is done : 21_(k-1)i can be used by 22_(k-1)ij with
	 * j >= k */
	unsigned ndeps = 0;
	starpu_tag_t tag_array[nblocks];
	
#ifdef SINGLE_TMP1221
	if (k > 0)
	for (j = (k-1)+1; j < nblocks; j++)
#else
	/* Two rotated buffers per (i): the generation before last matters. */
	if (k > 1)
	for (j = (k-2)+1; j < nblocks; j++)
#endif
	{
		if (rank == get_block_rank(i, j))
#ifdef SINGLE_TMP1221
			tag_array[ndeps++] = TAG22(k-1, i, j);
#else
			tag_array[ndeps++] = TAG22(k-2, i, j);
#endif
	}

	int source = get_block_rank(i, k);
#ifdef SINGLE_TMP1221
	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_21_block_handle)(i);
#else
	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_21_block_handle)(i, k);
#endif
	int mpi_tag = MPI_TAG21(k, i);
	starpu_tag_t partial_tag = TAG21_SAVE_PARTIAL(k, i);
	starpu_tag_t unlocked_tag = TAG21_SAVE(k, i);

//	fprintf(stderr, "NODE %d - 21 (%d, %d) - recv when done ndeps %d - tag array %lx\n", rank, k, i, ndeps, tag_array[0]);
	receive_when_deps_are_done(ndeps, tag_array, source, mpi_tag, block_handle, partial_tag, unlocked_tag);
}

/* Mark in 'rank_mask' every node owning a 22_kij block (j > k) that is
 * updated from 21_ki. */
static void find_nodes_using_21(unsigned k, unsigned i, int *rank_mask)
{
	memset(rank_mask, 0, world_size*sizeof(int));

	/* Block 21_ki is used to compute 22_kij with j > k */
	unsigned j;
	for (j = k+1; j < nblocks; j++)
	{
		int r = get_block_rank(i, j);
		rank_mask[r] = 1;
	}
}

/* Completion callback of the local 21_ki task: broadcast the updated block
 * to the remote nodes that consume it. */
static void callback_task_21_real(void *_arg)
{
	struct callback_arg *arg = _arg;

	unsigned k = arg->k;
	unsigned i = arg->i;

	/* Find all the nodes potentially requiring this block */
	int rank_mask[world_size];
	find_nodes_using_21(k, i, rank_mask);
	rank_mask[rank] = 0;

	/* Send the block to those nodes */
	starpu_data_handle_t block_handle = STARPU_PLU(get_block_handle)(i, k);
	starpu_tag_t tag = TAG21_SAVE(k, i);
	int mpi_tag = MPI_TAG21(k, i);
	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
	
	free(arg);
}

/* Submit the local TRSM task 21_ki on block (i,k), using the 11_k diagonal
 * block (local handle or received temporary).
 * NOTE(review): cl12 is deliberately used here (see the #warning below),
 * matching the swapped 12/21 codelets in pxlu_kernels.c. */
static void create_task_21_real(unsigned k, unsigned i)
{
	struct starpu_task *task = create_task(TAG21(k, i));

#warning temporary fix 
//	task->cl = &STARPU_PLU(cl21);
	task->cl = &STARPU_PLU(cl12);

	task->cl_arg = create_debug_info(i, i, k);

	unsigned diag_block_is_local = (get_block_rank(k, k) == rank);

	starpu_tag_t tag_11_dep; 
	
	/* which sub-data is manipulated ? */
	starpu_data_handle_t diag_block;
	if (diag_block_is_local)
	{
		diag_block = STARPU_PLU(get_block_handle)(k, k);
		tag_11_dep = TAG11(k);
	}
	else 
	{
#ifdef SINGLE_TMP11
		diag_block = STARPU_PLU(get_tmp_11_block_handle)();
#else
		diag_block = STARPU_PLU(get_tmp_11_block_handle)(k);
#endif
		/* Depend on the completed reception instead of the local task. */
		tag_11_dep = TAG11_SAVE(k);
	}

	task->handles[0] = diag_block; 
	task->handles[1] = STARPU_PLU(get_block_handle)(i, k);

	STARPU_ASSERT(task->handles[0] != STARPU_POISON_PTR);
	STARPU_ASSERT(task->handles[1] != STARPU_POISON_PTR);

	struct callback_arg *arg = malloc(sizeof(struct callback_arg));
		arg->i = i;
		arg->k = k;

	task->callback_func = callback_task_21_real;
	task->callback_arg = arg;

	/* The block adjacent to the diagonal is on the critical path. */
	if (!no_prio && (i == k+1)) {
		task->priority = STARPU_MAX_PRIO;
	}

	/* enforce dependencies ... */
	if (k > 0) {
		starpu_tag_declare_deps(TAG21(k, i), 2, tag_11_dep, TAG22(k-1, i, k));
	}
	else {
		starpu_tag_declare_deps(TAG21(k, i), 1, tag_11_dep);
	}

	int ret = starpu_task_submit(task);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
}

/* Dispatch for 21_ki: run locally if this node owns block (i,k), otherwise
 * post a receive when a local 22 block consumes it. */
static void create_task_21(unsigned k, unsigned i)
{
	if (get_block_rank(i, k) == rank)
	{
#ifdef VERBOSE_INIT
		fprintf(stderr, "CREATE real task 21(k = %d, i = %d) on node %d\n", k, i, rank);
#endif
		create_task_21_real(k, i);
	}
	else {
		/* We don't handle the task, but perhaps we have to generate MPI transfers. */
		int rank_mask[world_size];
		find_nodes_using_21(k, i, rank_mask);
		
		if (rank_mask[rank])
		{
#ifdef VERBOSE_INIT
			fprintf(stderr, "create RECV task 21(k = %d, i = %d) on node %d\n", k, i, rank);
#endif
			create_task_21_recv(k, i);
		}
		else {
#ifdef VERBOSE_INIT
			fprintf(stderr, "Node %d needs not 21(k=%d, i=%d)\n", rank, k,i);
#endif
		}
	}
}
+
+/*
+ *	Task 22 (GEMM)
+ */
+
/* Submit the local GEMM update 22_kij on block (i,j), consuming the 21_ki
 * and 12_kj blocks (each either a local handle or a received temporary).
 * NOTE(review): handles[0]/handles[1] are intentionally swapped (block12
 * first) to match the swapped 12/21 codelets — see the #warning below. */
static void create_task_22_real(unsigned k, unsigned i, unsigned j)
{
//	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j));

	struct starpu_task *task = create_task(TAG22(k, i, j));

	task->cl = &STARPU_PLU(cl22);

	task->cl_arg = create_debug_info(i, j, k);

	/* which sub-data is manipulated ? */

	/* produced by TAG21_SAVE(k, i) */ 
	unsigned block21_is_local = (get_block_rank(i, k) == rank);
	starpu_tag_t tag_21_dep;

	starpu_data_handle_t block21;
	if (block21_is_local)
	{
		block21 = STARPU_PLU(get_block_handle)(i, k);
		tag_21_dep = TAG21(k, i);
	}
	else 
	{
#ifdef SINGLE_TMP1221
		block21 = STARPU_PLU(get_tmp_21_block_handle)(i);
#else
		block21 = STARPU_PLU(get_tmp_21_block_handle)(i, k);
#endif
		/* Depend on the completed reception instead of the local task. */
		tag_21_dep = TAG21_SAVE(k, i);
	}

	/* produced by TAG12_SAVE(k, j) */
	unsigned block12_is_local = (get_block_rank(k, j) == rank);
	starpu_tag_t tag_12_dep;

	starpu_data_handle_t block12;
	if (block12_is_local)
	{
	//	block12 = STARPU_PLU(get_block_handle)(j, k);
		block12 = STARPU_PLU(get_block_handle)(k, j);
		tag_12_dep = TAG12(k, j);
	}
	else 
	{
#ifdef SINGLE_TMP1221
		block12 = STARPU_PLU(get_tmp_12_block_handle)(j);
#else
		block12 = STARPU_PLU(get_tmp_12_block_handle)(j, k);
#endif
		tag_12_dep = TAG12_SAVE(k, j);
	}



#warning temporary fix :/
	//task->handles[0] = block21;
	task->handles[0] = block12;

	//task->handles[1] = block12;
	task->handles[1] = block21;

	/* produced by TAG22(k-1, i, j) */
	task->handles[2] = STARPU_PLU(get_block_handle)(i, j);

	STARPU_ASSERT(task->handles[0] != STARPU_POISON_PTR);
	STARPU_ASSERT(task->handles[1] != STARPU_POISON_PTR);
	STARPU_ASSERT(task->handles[2] != STARPU_POISON_PTR);

	/* The trailing block next to the diagonal is on the critical path. */
	if (!no_prio && (i == k + 1) && (j == k +1) ) {
		task->priority = STARPU_MAX_PRIO;
	}

	/* enforce dependencies ... */
	if (k > 0) {
		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), tag_12_dep, tag_21_dep);
	}
	else {
		starpu_tag_declare_deps(TAG22(k, i, j), 2, tag_12_dep, tag_21_dep);
	}

	int ret = starpu_task_submit(task);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
}

/* Dispatch for 22_kij: only the node owning block (i,j) runs the update;
 * all inputs are pushed to it by the 11/12/21 broadcast callbacks. */
static void create_task_22(unsigned k, unsigned i, unsigned j)
{
	if (get_block_rank(i, j) == rank)
	{
	//	fprintf(stderr, "CREATE real task 22(k = %d, i = %d, j = %d) on node %d\n", k, i, j, rank);
		create_task_22_real(k, i, j);
	}
//	else {
//		fprintf(stderr, "Node %d needs not 22(k=%d, i=%d, j = %d)\n", rank, k,i,j);
//	}
}
+
+static void wait_tag_and_fetch_handle(starpu_tag_t tag, starpu_data_handle_t handle)
+{
+	STARPU_ASSERT(handle != STARPU_POISON_PTR);
+
+	starpu_tag_wait(tag);
+//	fprintf(stderr, "Rank %d : tag %lx is done\n", rank, tag);
+
+	starpu_data_acquire(handle, STARPU_R);
+
+//	starpu_data_unregister(handle);
+}
+
+static void wait_termination(void)
+{
+	unsigned k, i, j;
+	for (k = 0; k < nblocks; k++)
+	{
+		/* Wait task 11k if needed */
+		if (get_block_rank(k, k) == rank)
+		{
+			starpu_data_handle_t diag_block = STARPU_PLU(get_block_handle)(k, k);
+			wait_tag_and_fetch_handle(TAG11_SAVE(k), diag_block);
+		}
+		
+
+		for (i = k + 1; i < nblocks; i++)
+		{
+			/* Wait task 21ki if needed */
+			if (get_block_rank(i, k) == rank)
+			{
+				starpu_data_handle_t block21 = STARPU_PLU(get_block_handle)(i, k);
+				//starpu_data_handle_t block21 = STARPU_PLU(get_block_handle)(k, i);
+				//fprintf(stderr, "BLOCK21 i %d k %d -> handle %p\n", i, k, block21);
+				wait_tag_and_fetch_handle(TAG21_SAVE(k, i), block21);
+			}
+		}
+
+		for (j = k + 1; j < nblocks; j++)
+		{
+			/* Wait task 12kj if needed */
+			if (get_block_rank(k, j) == rank)
+			{
+				//starpu_data_handle_t block12 = STARPU_PLU(get_block_handle)(j, k);
+				starpu_data_handle_t block12 = STARPU_PLU(get_block_handle)(k, j);
+				//fprintf(stderr, "BLOCK12 j %d k %d -> handle %p\n", j, k, block12);
+				wait_tag_and_fetch_handle(TAG12_SAVE(k, j), block12);
+			}
+		}
+	}	
+}
+
+/*
+ *	code to bootstrap the factorization 
+ */
+
/* Entry point of the distributed LU factorization: build the whole task DAG
 * for an _nblocks x _nblocks grid on this node, release the INIT tag to
 * start execution, and wait for all locally-owned results.
 * Returns the elapsed time as measured by starpu_timing_now() (the
 * commented trace below divides by 1000 to print milliseconds). */
double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
{
	double start;
	double end;

	/* Publish the problem geometry in the file-scope globals used by all
	 * the task-creation helpers. */
	nblocks = _nblocks;
	rank = _rank;
	world_size = _world_size;

	/* create all the DAG nodes */
	unsigned i,j,k;

	for (k = 0; k < nblocks; k++)
	{
		create_task_11(k);

		for (i = k+1; i<nblocks; i++)
		{
			create_task_12(k, i);
			create_task_21(k, i);
		}

		for (i = k+1; i<nblocks; i++)
		{
			for (j = k+1; j<nblocks; j++)
			{
				create_task_22(k, i, j);
			}
		}
	}

	/* Make sure every node has submitted its DAG before timing starts. */
	int barrier_ret = starpu_mpi_barrier(MPI_COMM_WORLD);
	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);

	/* schedule the codelet */
	start = starpu_timing_now();

	/* Releasing INIT unlocks the 11_0 task and thus the whole DAG. */
	starpu_tag_notify_from_apps(STARPU_TAG_INIT);

	wait_termination();
	
	end = starpu_timing_now();

	double timing = end - start;
	
//	fprintf(stderr, "RANK %d -> took %f ms\n", rank, timing/1000);
	
	return timing;
}

+ 68 - 0
nmad/examples/mpi_lu/pxlu.h

@@ -0,0 +1,68 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2012, 2014  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __PXLU_H__
+#define __PXLU_H__
+
+#include <starpu.h>
+#include <common/blas.h>
+#include <starpu_mpi.h>
+#ifdef STARPU_USE_CUDA
+#include <cublas.h>
+#endif
+
+#define BLAS3_FLOP(n1,n2,n3)    \
+        (2*((uint64_t)n1)*((uint64_t)n2)*((uint64_t)n3))
+
+//#define SINGLE_TMP11	1
+//#define SINGLE_TMP1221	1
+
/* Block coordinates attached to each task as cl_arg for verbose tracing. */
struct debug_info {
	unsigned i;
	unsigned j;
	unsigned k;
};

/* Build and run the distributed LU DAG; returns the elapsed time (see
 * pxlu.c). */
double STARPU_PLU(plu_main)(unsigned nblocks, int rank, int world_size);

/* Verification helpers: rebuild the full matrix from the distributed blocks
 * and recompute/check the LU factors (defined in the example's support
 * code). */
TYPE *STARPU_PLU(reconstruct_matrix)(unsigned size, unsigned nblocks);
void STARPU_PLU(compute_lu_matrix)(unsigned size, unsigned nblocks, TYPE *Asaved);

unsigned STARPU_PLU(display_flag)(void);

void STARPU_PLU(compute_ax)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, int rank);
void STARPU_PLU(compute_lux)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, int rank);
/* Accessors for the locally-registered block (i,j). */
starpu_data_handle_t STARPU_PLU(get_block_handle)(unsigned i, unsigned j);
TYPE *STARPU_PLU(get_block)(unsigned i, unsigned j);
/* Temporary reception buffers; their signatures depend on whether a single
 * shared buffer (SINGLE_TMP11 / SINGLE_TMP1221) or per-iteration buffers
 * are used. */
#ifdef SINGLE_TMP11
starpu_data_handle_t STARPU_PLU(get_tmp_11_block_handle)(void);
#else
starpu_data_handle_t STARPU_PLU(get_tmp_11_block_handle)(unsigned k);
#endif
#ifdef SINGLE_TMP1221
starpu_data_handle_t STARPU_PLU(get_tmp_12_block_handle)(unsigned j);
starpu_data_handle_t STARPU_PLU(get_tmp_21_block_handle)(unsigned i);
#else
starpu_data_handle_t STARPU_PLU(get_tmp_12_block_handle)(unsigned j, unsigned k);
starpu_data_handle_t STARPU_PLU(get_tmp_21_block_handle)(unsigned i, unsigned k);
#endif

void STARPU_PLU(display_data_content)(TYPE *data, unsigned blocksize);

/* Owner rank of block (i,j) in the data distribution. */
int get_block_rank(unsigned i, unsigned j);

#endif // __PXLU_H__

+ 442 - 0
nmad/examples/mpi_lu/pxlu_kernels.c

@@ -0,0 +1,442 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2012  Université de Bordeaux
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "pxlu.h"
+#include "pxlu_kernels.h"
+#include <math.h>
+
+///#define VERBOSE_KERNELS	1
+
+/*
+ * U22
+ */
+
/* GEMM update kernel (22): center -= right * left, dispatched to the CPU
 * BLAS or CUBLAS implementation by 's' (0 = CPU, 1 = CUDA). */
static inline void STARPU_PLU(common_u22)(void *descr[],
				int s, STARPU_ATTRIBUTE_UNUSED void *_args)
{
	TYPE *right 	= (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
	TYPE *left 	= (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
	TYPE *center 	= (TYPE *)STARPU_MATRIX_GET_PTR(descr[2]);

	unsigned dx = STARPU_MATRIX_GET_NX(descr[2]);
	unsigned dy = STARPU_MATRIX_GET_NY(descr[2]);
	unsigned dz = STARPU_MATRIX_GET_NY(descr[0]);

	unsigned ld12 = STARPU_MATRIX_GET_LD(descr[0]);
	unsigned ld21 = STARPU_MATRIX_GET_LD(descr[1]);
	unsigned ld22 = STARPU_MATRIX_GET_LD(descr[2]);

#ifdef VERBOSE_KERNELS
	struct debug_info *info = _args;

	int rank;
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	fprintf(stderr, "KERNEL 22 %d - k = %d i = %d j = %d\n", rank, info->k, info->i, info->j);
#endif

#ifdef STARPU_USE_CUDA
	cublasStatus status;
	cudaError_t cures;
#endif

	switch (s) {
		case 0:
			CPU_GEMM("N", "N", dy, dx, dz,
				(TYPE)-1.0, right, ld21, left, ld12,
				(TYPE)1.0, center, ld22);
			break;

#ifdef STARPU_USE_CUDA
		case 1:
			/* NOTE(review): dx/dy are passed in the opposite order
			 * to the CPU call above — confirm this is the intended
			 * m/n convention for the CUBLAS wrapper. */
			CUBLAS_GEMM('n', 'n', dx, dy, dz,
				(TYPE)-1.0, right, ld21, left, ld12,
				(TYPE)1.0f, center, ld22);

			status = cublasGetError();
			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
				STARPU_CUBLAS_REPORT_ERROR(status);

			/* The legacy CUBLAS call is asynchronous: wait for it. */
			if (STARPU_UNLIKELY((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess))
				STARPU_CUDA_REPORT_ERROR(cures);

			break;
#endif
		default:
			STARPU_ABORT();
			break;
	}
#ifdef VERBOSE_KERNELS
	fprintf(stderr, "KERNEL 22 %d - k = %d i = %d j = %d done\n", rank, info->k, info->i, info->j);
#endif
}

/* CPU entry point for the 22 codelet. */
static void STARPU_PLU(cpu_u22)(void *descr[], void *_args)
{
	STARPU_PLU(common_u22)(descr, 0, _args);
}

#ifdef STARPU_USE_CUDA
/* CUDA entry point for the 22 codelet. */
static void STARPU_PLU(cublas_u22)(void *descr[], void *_args)
{
	STARPU_PLU(common_u22)(descr, 1, _args);
}
#endif// STARPU_USE_CUDA

/* History-based performance model; the symbol encodes the BLAS backend so
 * calibrations are not mixed across libraries. */
static struct starpu_perfmodel STARPU_PLU(model_22) = {
	.type = STARPU_HISTORY_BASED,
#ifdef STARPU_ATLAS
	.symbol = STARPU_PLU_STR(lu_model_22_atlas)
#elif defined(STARPU_GOTO)
	.symbol = STARPU_PLU_STR(lu_model_22_goto)
#else
	.symbol = STARPU_PLU_STR(lu_model_22)
#endif
};

/* Codelet for the trailing update: reads the 12 and 21 blocks, updates the
 * 22 block in place. */
struct starpu_codelet STARPU_PLU(cl22) = {
	.where = STARPU_CPU|STARPU_CUDA,
	.cpu_funcs = {STARPU_PLU(cpu_u22)},
#ifdef STARPU_USE_CUDA
	.cuda_funcs = {STARPU_PLU(cublas_u22)},
#endif
	.nbuffers = 3,
	.modes = {STARPU_R, STARPU_R, STARPU_RW},
	.model = &STARPU_PLU(model_22)
};
+
+
+/*
+ * U12
+ */
+
/* TRSM kernel (12): solve L11 * U12 = A12 for U12 in place, dispatched to
 * CPU BLAS or CUBLAS by 's' (0 = CPU, 1 = CUDA).
 * NOTE(review): due to the "temporary fix" in pxlu.c, this codelet is
 * submitted for the 21-type updates — the debug traces below are relabelled
 * accordingly. */
static inline void STARPU_PLU(common_u12)(void *descr[],
				int s, STARPU_ATTRIBUTE_UNUSED void *_args)
{
	TYPE *sub11;
	TYPE *sub12;

	sub11 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
	sub12 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);

	unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]);
	unsigned ld12 = STARPU_MATRIX_GET_LD(descr[1]);

	unsigned nx12 = STARPU_MATRIX_GET_NX(descr[1]);
	unsigned ny12 = STARPU_MATRIX_GET_NY(descr[1]);

#ifdef VERBOSE_KERNELS
	struct debug_info *info = _args;

	int rank;
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
#warning fixed debugging according to other tweak
	//fprintf(stderr, "KERNEL 12 %d - k = %d i %d\n", rank, info->k, info->i);
	fprintf(stderr, "KERNEL 21 %d - k = %d i %d\n", rank, info->k, info->j);

	//fprintf(stderr, "INPUT 12 U11\n");
	fprintf(stderr, "INPUT 21 U11\n");
	STARPU_PLU(display_data_content)(sub11, nx12);
	//fprintf(stderr, "INPUT 12 U12\n");
	fprintf(stderr, "INPUT 21 U21\n");
	STARPU_PLU(display_data_content)(sub12, nx12);
#endif

#ifdef STARPU_USE_CUDA
	cublasStatus status;
	cudaError_t cures;
#endif

	/* solve L11 U12 = A12 (find U12) */
	switch (s) {
		case 0:
			CPU_TRSM("L", "L", "N", "N", nx12, ny12,
					(TYPE)1.0, sub11, ld11, sub12, ld12);
			break;
#ifdef STARPU_USE_CUDA
		case 1:
			/* NOTE(review): nx/ny swapped versus the CPU call —
			 * presumably the m/n convention of the wrapper;
			 * confirm. */
			CUBLAS_TRSM('L', 'L', 'N', 'N', ny12, nx12,
					(TYPE)1.0, sub11, ld11, sub12, ld12);

			status = cublasGetError();
			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
				STARPU_CUBLAS_REPORT_ERROR(status);

			/* The legacy CUBLAS call is asynchronous: wait for it. */
			if (STARPU_UNLIKELY((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess))
				STARPU_CUDA_REPORT_ERROR(cures);

			break;
#endif
		default:
			STARPU_ABORT();
			break;
	}

#ifdef VERBOSE_KERNELS
	//fprintf(stderr, "OUTPUT 12 U12\n");
	fprintf(stderr, "OUTPUT 21 U21\n");
	STARPU_PLU(display_data_content)(sub12, nx12);
#endif
}

/* CPU entry point for the 12 codelet. */
static void STARPU_PLU(cpu_u12)(void *descr[], void *_args)
{
	STARPU_PLU(common_u12)(descr, 0, _args);
}

#ifdef STARPU_USE_CUDA
/* CUDA entry point for the 12 codelet. */
static void STARPU_PLU(cublas_u12)(void *descr[], void *_args)
{
	STARPU_PLU(common_u12)(descr, 1, _args);
}
#endif // STARPU_USE_CUDA

/* History-based performance model; the symbol encodes the BLAS backend. */
static struct starpu_perfmodel STARPU_PLU(model_12) = {
	.type = STARPU_HISTORY_BASED,
#ifdef STARPU_ATLAS
	.symbol = STARPU_PLU_STR(lu_model_12_atlas)
#elif defined(STARPU_GOTO)
	.symbol = STARPU_PLU_STR(lu_model_12_goto)
#else
	.symbol = STARPU_PLU_STR(lu_model_12)
#endif
};

/* Codelet: reads the diagonal block, updates the panel block in place. */
struct starpu_codelet STARPU_PLU(cl12) = {
	.where = STARPU_CPU|STARPU_CUDA,
	.cpu_funcs = {STARPU_PLU(cpu_u12)},
#ifdef STARPU_USE_CUDA
	.cuda_funcs = {STARPU_PLU(cublas_u12)},
#endif
	.nbuffers = 2,
	.modes = {STARPU_R, STARPU_RW},
	.model = &STARPU_PLU(model_12)
};
+
+
+/*
+ * U21
+ */
+
+/* Common implementation of the U21 update: a right-side triangular solve
+ * (TRSM) that overwrites the panel block sub21 using the factorized
+ * diagonal block sub11.  descr[0] is read-only, descr[1] is read-write;
+ * s selects the backend (0 = CPU BLAS, 1 = CUBLAS). */
+static inline void STARPU_PLU(common_u21)(void *descr[],
+				int s, STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	TYPE *sub11;
+	TYPE *sub21;
+
+	sub11 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
+	sub21 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
+
+	unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]);
+	unsigned ld21 = STARPU_MATRIX_GET_LD(descr[1]);
+
+	unsigned nx21 = STARPU_MATRIX_GET_NX(descr[1]);
+	unsigned ny21 = STARPU_MATRIX_GET_NY(descr[1]);
+
+#ifdef VERBOSE_KERNELS
+	struct debug_info *info = _args;
+
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	/* NOTE(review): the traces below deliberately print "12" labels while
+	 * this is the u21 kernel (see the #warning): the 12/21 naming was
+	 * swapped elsewhere — confirm against the matching tweak before
+	 * renaming anything here. */
+#warning fixed debugging according to other tweak
+	//fprintf(stderr, "KERNEL 21 %d (k = %d, i = %d)\n", rank, info->k, info->i);
+	fprintf(stderr, "KERNEL 12 %d (k = %d, j = %d)\n", rank, info->k, info->j);
+
+	//fprintf(stderr, "INPUT 21 U11\n");
+	fprintf(stderr, "INPUT 12 U11\n");
+	STARPU_PLU(display_data_content)(sub11, nx21);
+	//fprintf(stderr, "INPUT 21 U21\n");
+	fprintf(stderr, "INPUT 12 U12\n");
+	STARPU_PLU(display_data_content)(sub21, nx21);
+#endif
+
+#ifdef STARPU_USE_CUDA
+	cublasStatus status;
+#endif
+
+
+	switch (s) {
+		case 0:
+			/* Right side, upper triangular, no transpose, unit diagonal. */
+			CPU_TRSM("R", "U", "N", "U", nx21, ny21,
+					(TYPE)1.0, sub11, ld11, sub21, ld21);
+			break;
+#ifdef STARPU_USE_CUDA
+		case 1:
+			/* NOTE(review): dimensions are (ny21, nx21) here but
+			 * (nx21, ny21) on the CPU path — presumably compensating for
+			 * a row/column-major convention difference; confirm the two
+			 * paths compute the same update. */
+			CUBLAS_TRSM('R', 'U', 'N', 'U', ny21, nx21,
+					(TYPE)1.0, sub11, ld11, sub21, ld21);
+
+			status = cublasGetError();
+			if (status != CUBLAS_STATUS_SUCCESS)
+				STARPU_CUBLAS_REPORT_ERROR(status);
+
+			/* CUBLAS calls are asynchronous: wait for completion before
+			 * StarPU hands the data to dependent tasks. */
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+
+#ifdef VERBOSE_KERNELS
+	//fprintf(stderr, "OUTPUT 21 U11\n");
+	fprintf(stderr, "OUTPUT 12 U11\n");
+	STARPU_PLU(display_data_content)(sub11, nx21);
+	//fprintf(stderr, "OUTPUT 21 U21\n");
+	fprintf(stderr, "OUTPUT 12 U12\n");
+	STARPU_PLU(display_data_content)(sub21, nx21);
+#endif
+}
+
+/* CPU entry point: run the common U21 kernel with the CPU BLAS backend. */
+static void STARPU_PLU(cpu_u21)(void *descr[], void *arg)
+{
+	STARPU_PLU(common_u21)(descr, 0, arg);
+}
+
+#ifdef STARPU_USE_CUDA
+/* CUDA entry point: run the common U21 kernel with the CUBLAS backend. */
+static void STARPU_PLU(cublas_u21)(void *descr[], void *arg)
+{
+	STARPU_PLU(common_u21)(descr, 1, arg);
+}
+#endif /* STARPU_USE_CUDA */
+
+/* History-based performance model for the U21 kernel; the symbol is
+ * suffixed with the BLAS backend so calibrations are kept separate. */
+static struct starpu_perfmodel STARPU_PLU(model_21) = {
+	.type = STARPU_HISTORY_BASED,
+#ifdef STARPU_ATLAS
+	.symbol = STARPU_PLU_STR(lu_model_21_atlas)
+#elif defined(STARPU_GOTO)
+	.symbol = STARPU_PLU_STR(lu_model_21_goto)
+#else
+	.symbol = STARPU_PLU_STR(lu_model_21)
+#endif
+};
+
+/* Codelet for the U21 (TRSM) update: buffer 0 is the factorized diagonal
+ * block (read-only), buffer 1 is the panel block being updated
+ * (read-write).  Only advertise CUDA in .where when CUDA support is
+ * compiled in, so the mask never claims an implementation missing from
+ * .cuda_funcs. */
+struct starpu_codelet STARPU_PLU(cl21) = {
+	.where = STARPU_CPU
+#ifdef STARPU_USE_CUDA
+		|STARPU_CUDA
+#endif
+		,
+	.cpu_funcs = {STARPU_PLU(cpu_u21)},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {STARPU_PLU(cublas_u21)},
+#endif
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_RW},
+	.model = &STARPU_PLU(model_21)
+};
+
+
+/*
+ *	U11
+ */
+
+/* Common implementation of the U11 kernel: unblocked, non-pivoting LU
+ * factorization of the diagonal block, performed in place on descr[0].
+ * s selects the backend (0 = CPU, 1 = CUDA). */
+static inline void STARPU_PLU(common_u11)(void *descr[],
+				int s, STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	TYPE *sub11;
+
+	sub11 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
+
+	unsigned long nx = STARPU_MATRIX_GET_NX(descr[0]);
+	unsigned long ld = STARPU_MATRIX_GET_LD(descr[0]);
+
+	unsigned long z;
+
+#ifdef VERBOSE_KERNELS
+	struct debug_info *info = _args;
+
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	fprintf(stderr, "KERNEL 11 %d - k = %d\n", rank, info->k);
+#endif
+
+	switch (s) {
+		case 0:
+			for (z = 0; z < nx; z++)
+			{
+				TYPE pivot;
+				pivot = sub11[z+z*ld];
+				/* No pivoting: a zero diagonal entry cannot be factorized. */
+				STARPU_ASSERT(pivot != 0.0);
+
+				/* Scale the pivot row, then apply a rank-1 update to the
+				 * trailing (nx-z-1) x (nx-z-1) submatrix. */
+				CPU_SCAL(nx - z - 1, (1.0/pivot), &sub11[z+(z+1)*ld], ld);
+
+				CPU_GER(nx - z - 1, nx - z - 1, -1.0,
+						&sub11[(z+1)+z*ld], 1,
+						&sub11[z+(z+1)*ld], ld,
+						&sub11[(z+1) + (z+1)*ld],ld);
+			}
+			break;
+#ifdef STARPU_USE_CUDA
+		case 1:
+			for (z = 0; z < nx; z++)
+			{
+				TYPE pivot;
+				/* The pivot lives in device memory: copy it back to the
+				 * host to test it (one stream sync per column). */
+				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+				cudaStreamSynchronize(starpu_cuda_get_local_stream());
+
+				STARPU_ASSERT(pivot != 0.0);
+
+				CUBLAS_SCAL(nx - z - 1, 1.0/pivot, &sub11[z+(z+1)*ld], ld);
+
+				CUBLAS_GER(nx - z - 1, nx - z - 1, -1.0,
+						&sub11[(z+1)+z*ld], 1,
+						&sub11[z+(z+1)*ld], ld,
+						&sub11[(z+1) + (z+1)*ld],ld);
+			}
+
+			/* Wait for the asynchronous CUBLAS work before returning. */
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+#ifdef VERBOSE_KERNELS
+	/* Same message as on entry: marks kernel completion in the trace. */
+	fprintf(stderr, "KERNEL 11 %d - k = %d\n", rank, info->k);
+#endif
+}
+
+/* CPU entry point: run the common U11 kernel with the CPU backend. */
+static void STARPU_PLU(cpu_u11)(void *descr[], void *arg)
+{
+	STARPU_PLU(common_u11)(descr, 0, arg);
+}
+
+#ifdef STARPU_USE_CUDA
+/* CUDA entry point: run the common U11 kernel with the CUDA backend. */
+static void STARPU_PLU(cublas_u11)(void *descr[], void *arg)
+{
+	STARPU_PLU(common_u11)(descr, 1, arg);
+}
+#endif /* STARPU_USE_CUDA */
+
+/* History-based performance model for the U11 kernel; the symbol is
+ * suffixed with the BLAS backend so calibrations are kept separate. */
+static struct starpu_perfmodel STARPU_PLU(model_11) = {
+	.type = STARPU_HISTORY_BASED,
+#ifdef STARPU_ATLAS
+	.symbol = STARPU_PLU_STR(lu_model_11_atlas)
+#elif defined(STARPU_GOTO)
+	.symbol = STARPU_PLU_STR(lu_model_11_goto)
+#else
+	.symbol = STARPU_PLU_STR(lu_model_11)
+#endif
+};
+
+/* Codelet for the U11 diagonal-block factorization: a single read-write
+ * buffer holding the block to factorize in place.  Only advertise CUDA
+ * in .where when CUDA support is compiled in, so the mask never claims
+ * an implementation missing from .cuda_funcs. */
+struct starpu_codelet STARPU_PLU(cl11) = {
+	.where = STARPU_CPU
+#ifdef STARPU_USE_CUDA
+		|STARPU_CUDA
+#endif
+		,
+	.cpu_funcs = {STARPU_PLU(cpu_u11)},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {STARPU_PLU(cublas_u11)},
+#endif
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
+	.model = &STARPU_PLU(model_11)
+};

+ 32 - 0
nmad/examples/mpi_lu/pxlu_kernels.h

@@ -0,0 +1,32 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2012, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __PXLU_KERNELS_H__
+#define __PXLU_KERNELS_H__
+
+#include <starpu.h>
+
+/* Build the performance-model symbol string from the STARPU_PLU()
+ * name-mangling macro. */
+#define str(s) #s
+#define xstr(s)        str(s)
+#define STARPU_PLU_STR(name)  xstr(STARPU_PLU(name))
+
+/* Codelets defined in pxlu_kernels.c.  'extern' is required: without it
+ * each translation unit including this header creates a tentative
+ * definition, which fails to link with -fno-common toolchains. */
+extern struct starpu_codelet STARPU_PLU(cl11);
+extern struct starpu_codelet STARPU_PLU(cl12);
+extern struct starpu_codelet STARPU_PLU(cl21);
+extern struct starpu_codelet STARPU_PLU(cl22);
+
+#endif // __PXLU_KERNELS_H__

+ 19 - 0
nmad/examples/mpi_lu/slu_kernels.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-float.h"
+#include "xlu_kernels.c"

+ 106 - 0
nmad/examples/perf.sh

@@ -0,0 +1,106 @@
+#!/bin/bash
+
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+# 
+# Copyright (C) 2010  Université de Bordeaux
+# Copyright (C) 2010  Centre National de la Recherche Scientifique
+# 
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+# 
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# 
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+# 4G x np = 4 * (k*1K) ^ 2
+# A G * np = 4 * k^2 * 1M
+# A * 250 * np = k^2
+# A = 6
+# k = sqrt(1500*np)
+# np = 1 => k = 32
+# np = 2 => k = 48
+# np = 3 => k = 64 
+# np = 4 => k = 64
+
+# Problem size
+NBLOCKS=16
+BLOCKSIZE=1024
+SIZE=$(($NBLOCKS*$BLOCKSIZE))
+
+echo "JOB ID ${PBS_JOBID}"
+
+nnodes=$(cat machinefile.${PBS_JOBID}|wc -l)
+echo "got $nnodes mpi nodes"
+
+# Calibrate
+ncalibrate=0
+for i in `seq 1 $ncalibrate`
+do
+echo "STARPU_CALIBRATE $i/$ncalibrate"
+STARPU_CALIBRATE=1 STARPU_SCHED="dmda" STARPU_PREFETCH=1 mpirun -machinefile machinefile.${PBS_JOBID} -np $nnodes ./mpi_lu/plu_example_float -p 2 -q 2 -nblocks 32 -size $((32*$BLOCKSIZE)) -numa
+done
+
+# func NGPUS NP P Q NBLOCKS
+# Runs one LU decomposition with the given process grid (p x q) and
+# problem size; output goes to ./log (per run) and ./log.all (appended).
+func()
+{
+ngpus=$1
+np=$2
+p=$3
+q=$4
+nblocks=$5
+
+# Banner: the first redirection truncates the per-run log
+echo "*******************************************"> log
+echo "*************** NGPUS $ngpus - np $np - nblocks $nblocks **************">> log
+echo "*******************************************">> log
+cat log
+cat log >> log.all
+
+# CPU workers disabled (STARPU_NCPUS=0): measure GPU-only performance
+STARPU_NCPUS=0 STARPU_NCUDA=$ngpus STARPU_SCHED="dmda" STARPU_PREFETCH=1 mpirun -machinefile machinefile.${PBS_JOBID} -np $np ./mpi_lu/plu_example_float -p $p -q $q -nblocks $nblocks -size $(($nblocks * $BLOCKSIZE)) -numa > log.out 2> log.err
+cat log.out > log
+cat log.err >> log
+cat log
+cat log >> log.all
+}
+
+rm -f log.all
+
+#how many time do we repeat each experiment ?
+nloops=3
+
+# Per-node memory budget in MB; larger problems are skipped below
+per_node_max_memory=7000
+
+# Sweep node counts, problem sizes and GPUs per node
+for np in 1 2 4
+do
+	for nblocks in 16 32 48 64 80
+	do
+		for ngpus_per_node in 1 2 3 4
+		do
+			for loop in `seq 1 $nloops`
+			do
+				# Compute p and q from np
+				case $np in
+				  1) p=1; q=1;;
+				  2) p=2; q=1;;
+				  4) p=2; q=2;;
+				  *) echo -n "does not support $np nodes yet";;
+				esac
+
+				# Does the problem fit into memory ?
+				# 4 bytes per float element, spread across np nodes (MB)
+				matrix_size=$(($nblocks * $BLOCKSIZE))
+				per_node_memory=$(($((4*$matrix_size*$matrix_size/(1024*1024))) / $np))
+
+				echo "NP $np P $p Q $q SIZE $per_node_memory NBLOCKS $nblocks"
+
+				if test $per_node_memory -ge $per_node_max_memory; then
+						echo "Problem is too large !"
+				else
+					func $ngpus_per_node $np $p $q $nblocks
+					echo "go !"
+				fi
+			done
+		done
+	done
+done

+ 258 - 0
nmad/examples/stencil/stencil5.c

@@ -0,0 +1,258 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2015              Université Bordeaux
+ * Copyright (C) 2011, 2012, 2013, 2014, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+#define FPRINTF_MPI(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) { \
+    						int _disp_rank; MPI_Comm_rank(MPI_COMM_WORLD, &_disp_rank);       \
+                                                fprintf(ofile, "[%d][starpu_mpi][%s] " fmt , _disp_rank, __starpu_func__ ,## __VA_ARGS__); \
+                                                fflush(ofile); }} while(0);
+
+/* 5-point stencil kernel: replace the centre value by the average of
+ * itself and its four direct neighbours. */
+void stencil5_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	float *center = (float *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	float *west   = (float *)STARPU_VARIABLE_GET_PTR(descr[1]);
+	float *east   = (float *)STARPU_VARIABLE_GET_PTR(descr[2]);
+	float *south  = (float *)STARPU_VARIABLE_GET_PTR(descr[3]);
+	float *north  = (float *)STARPU_VARIABLE_GET_PTR(descr[4]);
+
+	*center = (*center + *west + *east + *south + *north) / 5;
+}
+
+/* Codelet wrapping the CPU stencil kernel: one read-write centre value
+ * followed by four read-only neighbours. */
+struct starpu_codelet stencil5_cl =
+{
+	.cpu_funcs = {stencil5_cpu},
+	.nbuffers = 5,
+	.modes = {STARPU_RW, STARPU_R, STARPU_R, STARPU_R, STARPU_R}
+};
+
+#ifdef STARPU_QUICK_CHECK
+#  define NITER_DEF	100
+#  define X         	5
+#  define Y         	5
+#else
+#  define NITER_DEF	100
+#  define X         	20
+#  define Y         	20
+#endif
+
+int display = 0;	/* -display: dump the matrix before and after */
+int niter = NITER_DEF;	/* -iter <n>: number of stencil iterations per phase */
+
+/* Returns the MPI node number where data indexes index is */
+int my_distrib(int x, int y, int nb_nodes)
+{
+	/* Block distrib */
+	return ((int)(x / sqrt(nb_nodes) + (y / sqrt(nb_nodes)) * sqrt(nb_nodes))) % nb_nodes;
+}
+
+/* Shifted distribution, for migration example */
+int my_distrib2(int x, int y, int nb_nodes)
+{
+	return (my_distrib(x, y, nb_nodes) + 1) % nb_nodes;
+}
+
+/* Parse command-line options:
+ *   -iter <n>   set the number of stencil iterations per phase
+ *   -display    print the matrix before and after the computation
+ * Unknown options are silently ignored. */
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-iter") == 0)
+		{
+			/* Guard against "-iter" given as the last argument: the
+			 * original code read argv[argc] (out of bounds). */
+			if (i + 1 < argc)
+			{
+				char *argptr;
+				niter = strtol(argv[++i], &argptr, 10);
+			}
+		}
+		else if (strcmp(argv[i], "-display") == 0)
+		{
+			display = 1;
+		}
+	}
+}
+
+/* Distributed 5-point stencil demo: runs niter iterations with an
+ * initial block distribution, migrates the data to a shifted
+ * distribution, runs niter more iterations, then gathers the data back.
+ * Returns 0 on success (initialization failures abort). */
+int main(int argc, char **argv)
+{
+	int my_rank, size, x, y, loop;
+	float mean=0;
+	float matrix[X][Y];
+	starpu_data_handle_t data_handles[X][Y];
+
+	int ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	/* Check the MPI-layer initialization too (its return value was
+	 * previously ignored, unlike starpu_init's). */
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	parse_args(argc, argv);
+
+	/* Initial data values */
+	starpu_srand48((long int)time(NULL));
+	for(x = 0; x < X; x++)
+	{
+		for (y = 0; y < Y; y++)
+		{
+			matrix[x][y] = (float)starpu_drand48();
+			mean += matrix[x][y];
+		}
+	}
+	mean /= (X*Y);
+
+	if (display)
+	{
+		FPRINTF_MPI(stdout, "mean=%2.2f\n", mean);
+		for(x = 0; x < X; x++)
+		{
+			fprintf(stdout, "[%d] ", my_rank);
+			for (y = 0; y < Y; y++)
+			{
+				fprintf(stdout, "%2.2f ", matrix[x][y]);
+			}
+			fprintf(stdout, "\n");
+		}
+	}
+
+	/* Initial distribution: register only what this rank owns or needs
+	 * as a direct neighbour; the MPI tag encodes the (x, y) coordinate. */
+	for(x = 0; x < X; x++)
+	{
+		for (y = 0; y < Y; y++)
+		{
+			int mpi_rank = my_distrib(x, y, size);
+			if (mpi_rank == my_rank)
+			{
+				//fprintf(stderr, "[%d] Owning data[%d][%d]\n", my_rank, x, y);
+				starpu_variable_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[x][y]), sizeof(float));
+			}
+			else if (my_rank == my_distrib(x+1, y, size) || my_rank == my_distrib(x-1, y, size)
+				 || my_rank == my_distrib(x, y+1, size) || my_rank == my_distrib(x, y-1, size))
+			{
+				/* I don't own that index, but will need it for my computations */
+				//fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", my_rank, x, y);
+				starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(float));
+			}
+			else
+			{
+				/* I know it's useless to allocate anything for this */
+				data_handles[x][y] = NULL;
+			}
+			if (data_handles[x][y])
+			{
+				starpu_mpi_data_register(data_handles[x][y], (y*X)+x, mpi_rank);
+			}
+		}
+	}
+
+	/* First computation with initial distribution; interior points only
+	 * (the border is never updated). */
+	for(loop=0 ; loop<niter; loop++)
+	{
+		for (x = 1; x < X-1; x++)
+		{
+			for (y = 1; y < Y-1; y++)
+			{
+				starpu_mpi_insert_task(MPI_COMM_WORLD, &stencil5_cl, STARPU_RW, data_handles[x][y],
+						       STARPU_R, data_handles[x-1][y], STARPU_R, data_handles[x+1][y],
+						       STARPU_R, data_handles[x][y-1], STARPU_R, data_handles[x][y+1],
+						       0);
+			}
+		}
+	}
+	fprintf(stderr, "Waiting ...\n");
+	starpu_task_wait_for_all();
+
+	/* Now migrate data to a new distribution */
+
+	/* First register newly needed data */
+	for(x = 0; x < X; x++)
+	{
+		for (y = 0; y < Y; y++)
+		{
+			int mpi_rank = my_distrib2(x, y, size);
+			if (!data_handles[x][y] && (mpi_rank == my_rank
+				 || my_rank == my_distrib2(x+1, y, size) || my_rank == my_distrib2(x-1, y, size)
+				 || my_rank == my_distrib2(x, y+1, size) || my_rank == my_distrib2(x, y-1, size)))
+			{
+				/* Register newly-needed data */
+				starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(float));
+				starpu_mpi_data_register(data_handles[x][y], (y*X)+x, mpi_rank);
+			}
+			if (data_handles[x][y] && mpi_rank != starpu_mpi_data_get_rank(data_handles[x][y]))
+			{
+				/* Migrate the data */
+				starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, data_handles[x][y], mpi_rank, NULL, NULL);
+				/* And register new rank of the matrix */
+				starpu_mpi_data_set_rank(data_handles[x][y], mpi_rank);
+			}
+		}
+	}
+
+	/* Second computation with new distribution */
+	for(loop=0 ; loop<niter; loop++)
+	{
+		for (x = 1; x < X-1; x++)
+		{
+			for (y = 1; y < Y-1; y++)
+			{
+				starpu_mpi_insert_task(MPI_COMM_WORLD, &stencil5_cl, STARPU_RW, data_handles[x][y],
+						       STARPU_R, data_handles[x-1][y], STARPU_R, data_handles[x+1][y],
+						       STARPU_R, data_handles[x][y-1], STARPU_R, data_handles[x][y+1],
+						       0);
+			}
+		}
+	}
+	fprintf(stderr, "Waiting ...\n");
+	starpu_task_wait_for_all();
+
+
+	/* Unregister data */
+	for(x = 0; x < X; x++)
+	{
+		for (y = 0; y < Y; y++)
+		{
+			if (data_handles[x][y])
+			{
+				int mpi_rank = my_distrib(x, y, size);
+				/* Get back data to original place where the user-provided buffer is. */
+				starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, data_handles[x][y], mpi_rank, NULL, NULL);
+				/* Register original rank of the matrix (although useless) */
+				starpu_mpi_data_set_rank(data_handles[x][y], mpi_rank);
+				/* And unregister it */
+				starpu_data_unregister(data_handles[x][y]);
+			}
+		}
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	/* NOTE: matrix[][] only holds up-to-date values for the entries this
+	 * rank registered and fetched back above; other entries still show
+	 * their initial random values. */
+	if (display)
+	{
+		fprintf(stdout, "[%d] mean=%2.2f\n", my_rank, mean);
+		for(x = 0; x < X; x++)
+		{
+			fprintf(stdout, "[%d] ", my_rank);
+			for (y = 0; y < Y; y++)
+			{
+				fprintf(stdout, "%2.2f ", matrix[x][y]);
+			}
+			fprintf(stdout, "\n");
+		}
+	}
+
+	return 0;
+}

+ 134 - 0
nmad/include/starpu_mpi.h

@@ -0,0 +1,134 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2012, 2014-2017  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
+ * Copyright (C) 2016  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_H__
+#define __STARPU_MPI_H__
+
+#include <starpu.h>
+
+#if defined(STARPU_USE_MPI)
+
+#include <mpi.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+typedef void *starpu_mpi_req;
+
+int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *req, int dest, int mpi_tag, MPI_Comm comm);
+int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *req, int source, int mpi_tag, MPI_Comm comm);
+int starpu_mpi_send(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm);
+int starpu_mpi_recv(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, MPI_Status *status);
+int starpu_mpi_isend_detached(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg);
+int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg);
+int starpu_mpi_issend(starpu_data_handle_t data_handle, starpu_mpi_req *req, int dest, int mpi_tag, MPI_Comm comm);
+int starpu_mpi_issend_detached(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg);
+int starpu_mpi_wait(starpu_mpi_req *req, MPI_Status *status);
+int starpu_mpi_test(starpu_mpi_req *req, int *flag, MPI_Status *status);
+int starpu_mpi_barrier(MPI_Comm comm);
+
+int starpu_mpi_irecv_detached_sequential_consistency(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg, int sequential_consistency);
+
+int starpu_mpi_init_comm(int *argc, char ***argv, int initialize_mpi, MPI_Comm comm);
+int starpu_mpi_init(int *argc, char ***argv, int initialize_mpi);
+int starpu_mpi_initialize(void) STARPU_DEPRECATED;
+int starpu_mpi_initialize_extended(int *rank, int *world_size) STARPU_DEPRECATED;
+int starpu_mpi_shutdown(void);
+
+struct starpu_task *starpu_mpi_task_build(MPI_Comm comm, struct starpu_codelet *codelet, ...);
+int starpu_mpi_task_post_build(MPI_Comm comm, struct starpu_codelet *codelet, ...);
+int starpu_mpi_task_insert(MPI_Comm comm, struct starpu_codelet *codelet, ...);
+/* the function starpu_mpi_insert_task has the same semantics as starpu_mpi_task_insert, it is kept to avoid breaking old codes */
+int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...);
+
+void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle, int node);
+void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg);
+void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle);
+
+int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg);
+int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg);
+
+int starpu_mpi_isend_detached_unlock_tag(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm, starpu_tag_t tag);
+int starpu_mpi_irecv_detached_unlock_tag(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, starpu_tag_t tag);
+
+int starpu_mpi_isend_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *dest, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag);
+int starpu_mpi_irecv_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *source, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag);
+
+void starpu_mpi_comm_amounts_retrieve(size_t *comm_amounts);
+
+void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle);
+void starpu_mpi_cache_flush_all_data(MPI_Comm comm);
+
+int starpu_mpi_cached_receive(starpu_data_handle_t data_handle);
+int starpu_mpi_cached_send(starpu_data_handle_t data_handle, int dest);
+
+int starpu_mpi_comm_size(MPI_Comm comm, int *size);
+int starpu_mpi_comm_rank(MPI_Comm comm, int *rank);
+int starpu_mpi_world_rank(void);
+int starpu_mpi_world_size(void);
+
+int starpu_mpi_get_communication_tag(void);
+void starpu_mpi_set_communication_tag(int tag);
+
+void starpu_mpi_data_register_comm(starpu_data_handle_t data_handle, int tag, int rank, MPI_Comm comm);
+#define starpu_mpi_data_register(data_handle, tag, rank) starpu_mpi_data_register_comm(data_handle, tag, rank, MPI_COMM_WORLD)
+
+void starpu_mpi_data_set_rank_comm(starpu_data_handle_t handle, int rank, MPI_Comm comm);
+#define starpu_mpi_data_set_rank(handle, rank) starpu_mpi_data_set_rank_comm(handle, rank, MPI_COMM_WORLD)
+void starpu_mpi_data_set_tag(starpu_data_handle_t handle, int tag);
+#define starpu_data_set_rank starpu_mpi_data_set_rank
+#define starpu_data_set_tag starpu_mpi_data_set_tag
+
+int starpu_mpi_data_get_rank(starpu_data_handle_t handle);
+int starpu_mpi_data_get_tag(starpu_data_handle_t handle);
+#define starpu_data_get_rank starpu_mpi_data_get_rank
+#define starpu_data_get_tag starpu_mpi_data_get_tag
+
+void starpu_mpi_data_migrate(MPI_Comm comm, starpu_data_handle_t handle, int new_rank);
+
+/* Node-selection policies for starpu_mpi_task_insert() execution-node
+ * choice. */
+#define STARPU_MPI_NODE_SELECTION_CURRENT_POLICY -1
+#define STARPU_MPI_NODE_SELECTION_MOST_R_DATA    0
+
+typedef int (*starpu_mpi_select_node_policy_func_t)(int me, int nb_nodes, struct starpu_data_descr *descr, int nb_data);
+int starpu_mpi_node_selection_register_policy(starpu_mpi_select_node_policy_func_t policy_func);
+int starpu_mpi_node_selection_unregister_policy(int policy);
+
+/* '(void)' gives these no-argument functions a real prototype; the
+ * previous '()' declared unspecified arguments (CERT DCL20-C). */
+int starpu_mpi_node_selection_get_current_policy(void);
+int starpu_mpi_node_selection_set_current_policy(int policy);
+
+int starpu_mpi_cache_is_enabled(void);
+int starpu_mpi_cache_set(int enabled);
+
+int starpu_mpi_wait_for_all(MPI_Comm comm);
+
+/* User-provided MPI datatype construction/destruction hooks for a
+ * registered data handle. */
+typedef void (*starpu_mpi_datatype_allocate_func_t)(starpu_data_handle_t, MPI_Datatype *);
+typedef void (*starpu_mpi_datatype_free_func_t)(MPI_Datatype *);
+int starpu_mpi_datatype_register(starpu_data_handle_t handle, starpu_mpi_datatype_allocate_func_t allocate_datatype_func, starpu_mpi_datatype_free_func_t free_datatype_func);
+int starpu_mpi_datatype_unregister(starpu_data_handle_t handle);
+
+int starpu_mpi_pre_submit_hook_register(void (*f)(struct starpu_task *));
+int starpu_mpi_pre_submit_hook_unregister(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // STARPU_USE_MPI
+#endif // __STARPU_MPI_H__

+ 29 - 0
nmad/libstarpumpi.pc.in

@@ -0,0 +1,29 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2011, 2016  Université de Bordeaux
+# Copyright (C) 2010, 2011, 2012  CNRS
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: starpumpi
+Description: offers MPI support for heterogeneous multicore architecture
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ -DSTARPU_USE_DEPRECATED_API
+Libs: -L${libdir} -lstarpumpi-@STARPU_EFFECTIVE_VERSION@
+Libs.private: @LDFLAGS@ @LIBS@ @STARPU_EXPORTED_LIBS@
+Requires: libstarpu
+Requires.private:

+ 58 - 0
nmad/src/Makefile.am

@@ -0,0 +1,58 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2012  Université de Bordeaux
+# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+CC=$(MPICC)
+CCLD=$(MPICC)
+
+BUILT_SOURCES =
+
+CLEANFILES = *.gcno *.gcda *.linkinfo
+
+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS)
+LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/src/ -I$(top_builddir)/src -I$(top_builddir)/include -I$(top_srcdir)/mpi/include -I$(top_srcdir)/mpi/src
+AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS)
+
+lib_LTLIBRARIES = libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+
+libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_LIBADD = $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la
+libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_LDFLAGS = $(ldflags) -no-undefined					\
+  -version-info $(LIBSTARPUMPI_INTERFACE_CURRENT):$(LIBSTARPUMPI_INTERFACE_REVISION):$(LIBSTARPUMPI_INTERFACE_AGE) \
+  $(MPICC_LDFLAGS) $(FXT_LDFLAGS)
+noinst_HEADERS =					\
+	starpu_mpi_private.h				\
+	starpu_mpi_fxt.h				\
+	starpu_mpi_stats.h				\
+	starpu_mpi_datatype.h				\
+	starpu_mpi_cache.h				\
+	starpu_mpi_cache_stats.h			\
+	starpu_mpi_collective.c				\
+	starpu_mpi_select_node.h
+
+libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_SOURCES =	\
+	starpu_mpi.c					\
+	starpu_mpi_helper.c				\
+	starpu_mpi_datatype.c				\
+	starpu_mpi_task_insert.c			\
+	starpu_mpi_collective.c				\
+	starpu_mpi_stats.c				\
+	starpu_mpi_private.c				\
+	starpu_mpi_cache.c				\
+	starpu_mpi_select_node.c			\
+	starpu_mpi_cache_stats.c
+
+showcheck:
+	-cat /dev/null

Файловите разлики са ограничени, защото са твърде много
+ 1253 - 0
nmad/src/starpu_mpi.c


+ 292 - 0
nmad/src/starpu_mpi_cache.c

@@ -0,0 +1,292 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012, 2013, 2014, 2015  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011-2014  Université de Bordeaux
+ * Copyright (C) 2014 INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <common/uthash.h>
+#include <datawizard/coherency.h>
+
+#include <starpu_mpi_cache.h>
+#include <starpu_mpi_cache_stats.h>
+#include <starpu_mpi_private.h>
+
+/* Hash-table entry tracking one data handle per peer; entries are keyed
+ * by the handle pointer itself (HASH_FIND_PTR on 'data'). */
+struct _starpu_data_entry
+{
+	UT_hash_handle hh;	/* uthash bookkeeping */
+	void *data;		/* the starpu_data_handle_t this entry tracks */
+};
+
+/* Per-destination-rank tables of handles already sent. */
+static struct _starpu_data_entry **_cache_sent_data = NULL;
+/* Per-owner-rank tables of handles already received. */
+static struct _starpu_data_entry **_cache_received_data = NULL;
+/* Whether we are allowed to keep copies of remote data. */
+int _starpu_cache_enabled=1;
+
+/* Returns 1 if the StarPU-MPI communication cache is active, 0 otherwise.
+ * '(void)' gives the definition a proper prototype; '()' declared
+ * unspecified arguments (CERT DCL20-C). */
+int starpu_mpi_cache_is_enabled(void)
+{
+	return _starpu_cache_enabled==1;
+}
+
+/* Runtime switch for the communication cache.  Enabling just raises the
+ * flag; disabling first flushes and frees everything currently cached.
+ * Always returns 0.
+ * NOTE(review): only the exact value 1 enables — any other non-zero
+ * value disables; confirm callers only ever pass 0 or 1. */
+int starpu_mpi_cache_set(int enabled)
+{
+	if (enabled == 1)
+	{
+		_starpu_cache_enabled = 1;
+	}
+	else
+	{
+		if (_starpu_cache_enabled)
+		{
+			// We need to clean the cache
+			int world_size;
+			starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
+			MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+			_starpu_mpi_cache_free(world_size);
+		}
+		_starpu_cache_enabled = 0;
+	}
+	return 0;
+}
+
+/* Allocates the per-rank cache tables for 'comm' and initializes the
+ * statistics module.  Honours the STARPU_MPI_CACHE environment variable
+ * (0 disables the cache; unset defaults to enabled). */
+void _starpu_mpi_cache_init(MPI_Comm comm)
+{
+	int nb_nodes;
+	int i;
+
+	_starpu_cache_enabled = starpu_get_env_number("STARPU_MPI_CACHE");
+	if (_starpu_cache_enabled == -1)
+	{
+		_starpu_cache_enabled = 1;
+	}
+
+	if (_starpu_cache_enabled == 0)
+	{
+		if (!_starpu_silent) fprintf(stderr,"Warning: StarPU MPI Communication cache is disabled\n");
+		return;
+	}
+
+	MPI_Comm_size(comm, &nb_nodes);
+	_STARPU_MPI_DEBUG(2, "Initialising htable for cache\n");
+	/* One hash table per peer rank, in each direction.  Fail fast on an
+	 * allocation failure instead of dereferencing NULL later. */
+	_cache_sent_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
+	STARPU_ASSERT(_cache_sent_data);
+	for(i=0 ; i<nb_nodes ; i++) _cache_sent_data[i] = NULL;
+	_cache_received_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
+	STARPU_ASSERT(_cache_received_data);
+	for(i=0 ; i<nb_nodes ; i++) _cache_received_data[i] = NULL;
+	_starpu_mpi_cache_stats_init(comm);
+}
+
+/* Removes and frees every entry of both directions' hash tables,
+ * updating the reception statistics.  The table arrays themselves are
+ * kept (they are released by _starpu_mpi_cache_free). */
+static
+void _starpu_mpi_cache_empty_tables(int world_size)
+{
+	int i;
+
+	if (_starpu_cache_enabled == 0) return;
+
+	_STARPU_MPI_DEBUG(2, "Clearing htable for cache\n");
+
+	for(i=0 ; i<world_size ; i++)
+	{
+		struct _starpu_data_entry *entry, *tmp;
+		HASH_ITER(hh, _cache_sent_data[i], entry, tmp)
+		{
+			HASH_DEL(_cache_sent_data[i], entry);
+			free(entry);
+		}
+		HASH_ITER(hh, _cache_received_data[i], entry, tmp)
+		{
+			HASH_DEL(_cache_received_data[i], entry);
+			_starpu_mpi_cache_stats_dec(i, (starpu_data_handle_t) entry->data);
+			free(entry);
+		}
+	}
+}
+
+/* Releases every cache entry and the per-rank table arrays themselves.
+ * The pointers are reset to NULL so a later re-initialization or a
+ * stray access fails cleanly instead of using freed memory. */
+void _starpu_mpi_cache_free(int world_size)
+{
+	if (_starpu_cache_enabled == 0) return;
+
+	_starpu_mpi_cache_empty_tables(world_size);
+	free(_cache_sent_data);
+	_cache_sent_data = NULL;
+	free(_cache_received_data);
+	_cache_received_data = NULL;
+	_starpu_mpi_cache_stats_free();
+}
+
+/* Forgets that 'data' was ever sent, for every destination rank of its
+ * communicator.
+ * NOTE(review): unlike the flush functions this does not test
+ * _starpu_cache_enabled — presumably callers only invoke it while the
+ * cache is on; confirm, since the tables are NULL/freed otherwise. */
+void _starpu_mpi_cache_sent_data_clear(starpu_data_handle_t data)
+{
+	int n, size;
+	MPI_Comm comm = ((struct _starpu_mpi_data *) data->mpi_data)->comm;
+
+	MPI_Comm_size(comm, &size);
+
+	for(n=0 ; n<size ; n++)
+	{
+		struct _starpu_data_entry *already_sent;
+		HASH_FIND_PTR(_cache_sent_data[n], &data, already_sent);
+		if (already_sent)
+		{
+			_STARPU_MPI_DEBUG(2, "Clearing send cache for data %p\n", data);
+			HASH_DEL(_cache_sent_data[n], already_sent);
+			free(already_sent);
+		}
+	}
+}
+
+/* Drops our cached copy of 'data' (received from its owner rank) and
+ * invalidates the local replicate, so the next access fetches a fresh
+ * version.  No-op when the handle is not in the receive cache. */
+void _starpu_mpi_cache_received_data_clear(starpu_data_handle_t data)
+{
+	int mpi_rank = starpu_mpi_data_get_rank(data);
+	struct _starpu_data_entry *already_received;
+
+	HASH_FIND_PTR(_cache_received_data[mpi_rank], &data, already_received);
+	if (already_received)
+	{
+#ifdef STARPU_DEVEL
+#  warning TODO: Somebody else will write to the data, so discard our cached copy if any. starpu_mpi could just remember itself.
+#endif
+		_STARPU_MPI_DEBUG(2, "Clearing receive cache for data %p\n", data);
+		HASH_DEL(_cache_received_data[mpi_rank], already_received);
+		_starpu_mpi_cache_stats_dec(mpi_rank, data);
+		free(already_received);
+		starpu_data_invalidate_submit(data);
+	}
+}
+
+/* Flush every entry of both caches; local replicates of data owned by
+ * another node are invalidated on the way out. */
+void starpu_mpi_cache_flush_all_data(MPI_Comm comm)
+{
+	int nb_nodes, node;
+	int owner, my_rank;
+
+	if (_starpu_cache_enabled == 0) return;
+
+	MPI_Comm_size(comm, &nb_nodes);
+	MPI_Comm_rank(comm, &my_rank);
+
+	for(node=0 ; node<nb_nodes ; node++)
+	{
+		struct _starpu_data_entry *e, *next;
+		HASH_ITER(hh, _cache_sent_data[node], e, next)
+		{
+			owner = starpu_mpi_data_get_rank((starpu_data_handle_t) e->data);
+			if (owner != my_rank && owner != -1)
+				starpu_data_invalidate_submit((starpu_data_handle_t) e->data);
+			HASH_DEL(_cache_sent_data[node], e);
+			free(e);
+		}
+		HASH_ITER(hh, _cache_received_data[node], e, next)
+		{
+			owner = starpu_mpi_data_get_rank((starpu_data_handle_t) e->data);
+			if (owner != my_rank && owner != -1)
+				starpu_data_invalidate_submit((starpu_data_handle_t) e->data);
+			HASH_DEL(_cache_received_data[node], e);
+			_starpu_mpi_cache_stats_dec(node, (starpu_data_handle_t) e->data);
+			free(e);
+		}
+	}
+}
+
+/* Remove any send/receive cache entry referring to data_handle, on every
+ * node slot.  Does not invalidate the data itself (the public
+ * starpu_mpi_cache_flush takes care of that). */
+void _starpu_mpi_cache_flush(starpu_data_handle_t data_handle)
+{
+	struct _starpu_data_entry *avail;
+	int i, nb_nodes;
+	MPI_Comm comm = ((struct _starpu_mpi_data *) data_handle->mpi_data)->comm;
+
+	if (_starpu_cache_enabled == 0) return;
+
+	/* The previous version also queried MPI_Comm_rank and the data owner
+	 * here, but never used either value; both have been removed. */
+	MPI_Comm_size(comm, &nb_nodes);
+
+	for(i=0 ; i<nb_nodes ; i++)
+	{
+		HASH_FIND_PTR(_cache_sent_data[i], &data_handle, avail);
+		if (avail)
+		{
+			_STARPU_MPI_DEBUG(2, "Clearing send cache for data %p\n", data_handle);
+			HASH_DEL(_cache_sent_data[i], avail);
+			free(avail);
+		}
+		HASH_FIND_PTR(_cache_received_data[i], &data_handle, avail);
+		if (avail)
+		{
+			/* Fixed: this message previously said "send cache" */
+			_STARPU_MPI_DEBUG(2, "Clearing receive cache for data %p\n", data_handle);
+			HASH_DEL(_cache_received_data[i], avail);
+			_starpu_mpi_cache_stats_dec(i, data_handle);
+			free(avail);
+		}
+	}
+}
+
+/* Public entry point: flush the cache for data_handle and, when this node is
+ * not the owner, drop the now-useless local replicate. */
+void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle)
+{
+	int my_rank, mpi_rank;
+
+	/* Consistent with every other public cache function: when the cache
+	 * is disabled nothing was ever cached, so there is nothing to flush
+	 * nor to invalidate. */
+	if (_starpu_cache_enabled == 0) return;
+
+	_starpu_mpi_cache_flush(data_handle);
+
+	MPI_Comm_rank(comm, &my_rank);
+	mpi_rank = starpu_mpi_data_get_rank(data_handle);
+	if (mpi_rank != my_rank && mpi_rank != -1)
+		starpu_data_invalidate_submit(data_handle);
+}
+
+/* Record that data (owned by another node) is now available locally.
+ * Returns the existing cache entry when a copy was already recorded,
+ * or NULL when a fresh entry was just added (i.e. a receive is needed). */
+void *_starpu_mpi_cache_received_data_set(starpu_data_handle_t data)
+{
+	struct _starpu_data_entry *already_received;
+	int mpi_rank;
+
+	if (_starpu_cache_enabled == 0) return NULL;
+
+	mpi_rank = starpu_mpi_data_get_rank(data);
+	HASH_FIND_PTR(_cache_received_data[mpi_rank], &data, already_received);
+	if (already_received != NULL)
+	{
+		_STARPU_MPI_DEBUG(2, "Do not receive data %p from node %d as it is already available\n", data, mpi_rank);
+		return already_received;
+	}
+
+	{
+		struct _starpu_data_entry *new_entry = malloc(sizeof(*new_entry));
+		new_entry->data = data;
+		HASH_ADD_PTR(_cache_received_data[mpi_rank], data, new_entry);
+		_starpu_mpi_cache_stats_inc(mpi_rank, data);
+	}
+	return NULL;
+}
+
+/* Return the receive-cache entry for data if we already hold a copy,
+ * NULL otherwise (or when the cache is disabled). */
+void *_starpu_mpi_cache_received_data_get(starpu_data_handle_t data)
+{
+	struct _starpu_data_entry *entry;
+	int owner;
+
+	if (_starpu_cache_enabled == 0) return NULL;
+
+	owner = starpu_mpi_data_get_rank(data);
+	HASH_FIND_PTR(_cache_received_data[owner], &data, entry);
+	return entry;
+}
+
+/* Record that data has been sent to node dest.  Returns the existing cache
+ * entry when the send was already recorded (no need to send again), or NULL
+ * when a fresh entry was just added. */
+void *_starpu_mpi_cache_sent_data_set(starpu_data_handle_t data, int dest)
+{
+	struct _starpu_data_entry *already_sent;
+
+	if (_starpu_cache_enabled == 0) return NULL;
+
+	HASH_FIND_PTR(_cache_sent_data[dest], &data, already_sent);
+	if (already_sent != NULL)
+	{
+		_STARPU_MPI_DEBUG(2, "Do not send data %p to node %d as it has already been sent\n", data, dest);
+		return already_sent;
+	}
+
+	{
+		struct _starpu_data_entry *new_entry = malloc(sizeof(*new_entry));
+		new_entry->data = data;
+		HASH_ADD_PTR(_cache_sent_data[dest], data, new_entry);
+		_STARPU_MPI_DEBUG(2, "Noting that data %p has already been sent to %d\n", data, dest);
+	}
+	return NULL;
+}
+

+ 55 - 0
nmad/src/starpu_mpi_cache.h

@@ -0,0 +1,55 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012, 2013, 2014, 2015  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011-2014  Université de Bordeaux
+ * Copyright (C) 2014 INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_CACHE_H__
+#define __STARPU_MPI_CACHE_H__
+
+#include <starpu.h>
+#include <stdlib.h>
+#include <mpi.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Non-zero when the MPI communication cache is active. */
+extern int _starpu_cache_enabled;
+/* Allocate the per-node send/receive cache tables for the given communicator. */
+void _starpu_mpi_cache_init(MPI_Comm comm);
+/* Empty and release the cache tables allocated by _starpu_mpi_cache_init. */
+void _starpu_mpi_cache_free(int world_size);
+
+/*
+ * If the data is already available in the cache, return a pointer to the data
+ * If the data is NOT available in the cache, add it to the cache and return NULL
+ */
+void *_starpu_mpi_cache_received_data_set(starpu_data_handle_t data);
+void *_starpu_mpi_cache_received_data_get(starpu_data_handle_t data);
+void _starpu_mpi_cache_received_data_clear(starpu_data_handle_t data);
+
+/*
+ * If the data is already available in the cache, return a pointer to the data
+ * If the data is NOT available in the cache, add it to the cache and return NULL
+ */
+void *_starpu_mpi_cache_sent_data_set(starpu_data_handle_t data, int dest);
+void _starpu_mpi_cache_sent_data_clear(starpu_data_handle_t data);
+
+/* Remove all cache entries referring to data_handle (internal variant of
+ * the public starpu_mpi_cache_flush, without invalidation). */
+void _starpu_mpi_cache_flush(starpu_data_handle_t data_handle);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __STARPU_MPI_CACHE_H__

+ 69 - 0
nmad/src/starpu_mpi_cache_stats.c

@@ -0,0 +1,69 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi_cache_stats.h>
+#include <common/config.h>
+#include <stdio.h>
+#include <starpu_mpi_private.h>
+
+/* measure the amount of data transfers between each pair of MPI nodes */
+static size_t *comm_cache_amount;
+static int world_size;
+static int stats_enabled=0;
+
+/* Enable cache statistics when the STARPU_MPI_CACHE_STATS environment
+ * variable is set, and allocate one byte counter per node of comm. */
+void _starpu_mpi_cache_stats_init(MPI_Comm comm)
+{
+	stats_enabled = starpu_get_env_number("STARPU_MPI_CACHE_STATS");
+	if (stats_enabled == -1)
+	{
+		stats_enabled = 0;
+	}
+	if (stats_enabled == 0) return;
+
+	if (!_starpu_silent) fprintf(stderr,"Warning: StarPU is executed with STARPU_MPI_CACHE_STATS=1, which slows down a bit\n");
+
+	MPI_Comm_size(comm, &world_size);
+	_STARPU_MPI_DEBUG(1, "allocating for %d nodes\n", world_size);
+
+	comm_cache_amount = (size_t *) calloc(world_size, sizeof(size_t));
+	/* Fail loudly rather than crash later in _starpu_mpi_cache_stats_update */
+	STARPU_ASSERT_MSG(comm_cache_amount != NULL, "Cannot allocate cache statistics array");
+}
+
+/* Release the statistics counters; the pointer is reset to NULL so a stray
+ * later call cannot double-free it. */
+void _starpu_mpi_cache_stats_free(void)
+{
+	if (stats_enabled == 0) return;
+	free(comm_cache_amount);
+	comm_cache_amount = NULL;
+}
+
+/* Adjust the cached-byte accounting for node dst by the size of data_handle.
+ * count is +1 when an entry is added and -1 when it is removed (see the
+ * _starpu_mpi_cache_stats_inc/_dec wrapper macros). No-op unless
+ * STARPU_MPI_CACHE_STATS was set at init time. */
+void _starpu_mpi_cache_stats_update(unsigned dst, starpu_data_handle_t data_handle, int count)
+{
+	size_t size;
+
+	if (stats_enabled == 0) return;
+
+	size = starpu_data_get_size(data_handle);
+
+	if (count == 1)
+	{
+		_STARPU_MPI_MSG("[communication cache] + %10ld to   %d\n", (long)size, dst);
+	}
+	else // count == -1
+	{
+		_STARPU_MPI_MSG("[communication cache] - %10ld from %d\n", (long)size, dst);
+	}
+
+	comm_cache_amount[dst] += count * size;
+}
+

+ 40 - 0
nmad/src/starpu_mpi_cache_stats.h

@@ -0,0 +1,40 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_CACHE_STATS_H__
+#define __STARPU_MPI_CACHE_STATS_H__
+
+#include <starpu.h>
+#include <stdlib.h>
+#include <mpi.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Enable and allocate cache statistics (driven by STARPU_MPI_CACHE_STATS). */
+void _starpu_mpi_cache_stats_init(MPI_Comm comm);
+/* Release the statistics counters.  Declared (void): an empty parameter
+ * list in C means "unspecified arguments", not "no arguments". */
+void _starpu_mpi_cache_stats_free(void);
+
+/* Add (count=+1) or remove (count=-1) the size of data_handle from the
+ * per-node accounting for node dst. */
+void _starpu_mpi_cache_stats_update(unsigned dst, starpu_data_handle_t data_handle, int count);
+
+#define _starpu_mpi_cache_stats_inc(dst, data_handle) _starpu_mpi_cache_stats_update(dst, data_handle, +1)
+#define _starpu_mpi_cache_stats_dec(dst, data_handle) _starpu_mpi_cache_stats_update(dst, data_handle, -1)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __STARPU_MPI_CACHE_STATS_H__

+ 162 - 0
nmad/src/starpu_mpi_collective.c

@@ -0,0 +1,162 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012, 2013, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <mpi.h>
+#include <starpu.h>
+#include <starpu_mpi.h>
+#include <starpu_mpi_private.h>
+
+/* Shared completion state for a collective built out of detached
+ * point-to-point transfers: the user callback fires once nb reaches count. */
+struct _callback_arg
+{
+	void (*callback)(void *);	/* user callback fired on completion */
+	void *arg;			/* user argument passed to callback */
+	int nb;				/* transfers completed so far */
+	int count;			/* total transfers expected on this node */
+};
+
+/* Per-transfer completion callback: counts finished transfers and, when the
+ * last expected one completes, invokes the user callback and releases the
+ * shared state.
+ * NOTE(review): nb is incremented without synchronization — this assumes all
+ * detached-request callbacks run from a single progression thread; confirm. */
+void _callback_collective(void *arg)
+{
+	struct _callback_arg *callback_arg = arg;
+	callback_arg->nb ++;
+	if (callback_arg->nb == callback_arg->count)
+	{
+		callback_arg->callback(callback_arg->arg);
+		free(callback_arg);
+	}
+}
+
+/* Scatter the data_handles from the root node to their respective owners.
+ * On the root, scallback(sarg) is called once every send has completed; on
+ * the other nodes, rcallback(rarg) is called once every receive has
+ * completed.  Always returns 0. */
+int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg)
+{
+	int rank;
+	int x;
+	struct _callback_arg *callback_arg = NULL;
+	void (*callback_func)(void *) = NULL;
+	void (*callback)(void *);
+
+	MPI_Comm_rank(comm, &rank);
+
+	callback = (rank == root) ? scallback : rcallback;
+	if (callback)
+	{
+		callback_func = _callback_collective;
+		callback_arg = malloc(sizeof(struct _callback_arg));
+		STARPU_ASSERT_MSG(callback_arg != NULL, "Cannot allocate memory");
+		callback_arg->count = 0;
+		callback_arg->nb = 0;
+		callback_arg->callback = callback;
+		callback_arg->arg = (rank == root) ? sarg : rarg;
+
+		/* Count the transfers this node takes part in, so that
+		 * _callback_collective knows when the last one completes */
+		for(x = 0; x < count ; x++)
+		{
+			if (data_handles[x])
+			{
+				int owner = starpu_mpi_data_get_rank(data_handles[x]);
+				int mpi_tag = starpu_mpi_data_get_tag(data_handles[x]);
+				STARPU_ASSERT_MSG(mpi_tag >= 0, "Invalid tag for data handle");
+				if ((rank == root) && (owner != root))
+				{
+					callback_arg->count ++;
+				}
+				if ((rank != root) && (owner == rank))
+				{
+					callback_arg->count ++;
+				}
+			}
+		}
+
+		if (callback_arg->count == 0)
+		{
+			/* This node takes part in no transfer: the completion
+			 * callback would never run, so fire the user callback
+			 * now and avoid leaking callback_arg */
+			callback_arg->callback(callback_arg->arg);
+			free(callback_arg);
+			callback_arg = NULL;
+			callback_func = NULL;
+		}
+	}
+
+	for(x = 0; x < count ; x++)
+	{
+		if (data_handles[x])
+		{
+			int owner = starpu_mpi_data_get_rank(data_handles[x]);
+			int mpi_tag = starpu_mpi_data_get_tag(data_handles[x]);
+			STARPU_ASSERT_MSG(mpi_tag >= 0, "Invalid tag for data handle");
+			if ((rank == root) && (owner != root))
+			{
+				starpu_mpi_isend_detached(data_handles[x], owner, mpi_tag, comm, callback_func, callback_arg);
+			}
+			if ((rank != root) && (owner == rank))
+			{
+				starpu_mpi_irecv_detached(data_handles[x], root, mpi_tag, comm, callback_func, callback_arg);
+			}
+		}
+	}
+	return 0;
+}
+
+/* Gather the data_handles from their respective owners onto the root node.
+ * On the root, scallback(sarg) is called once every receive has completed;
+ * on the other nodes, rcallback(rarg) is called once every send has
+ * completed.  Always returns 0. */
+int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg)
+{
+	int rank;
+	int x;
+	struct _callback_arg *callback_arg = NULL;
+	void (*callback_func)(void *) = NULL;
+	void (*callback)(void *);
+
+	MPI_Comm_rank(comm, &rank);
+
+	callback = (rank == root) ? scallback : rcallback;
+	if (callback)
+	{
+		callback_func = _callback_collective;
+
+		callback_arg = malloc(sizeof(struct _callback_arg));
+		STARPU_ASSERT_MSG(callback_arg != NULL, "Cannot allocate memory");
+		callback_arg->count = 0;
+		callback_arg->nb = 0;
+		callback_arg->callback = callback;
+		callback_arg->arg = (rank == root) ? sarg : rarg;
+
+		/* Count the transfers this node takes part in, so that
+		 * _callback_collective knows when the last one completes */
+		for(x = 0; x < count ; x++)
+		{
+			if (data_handles[x])
+			{
+				int owner = starpu_mpi_data_get_rank(data_handles[x]);
+				int mpi_tag = starpu_mpi_data_get_tag(data_handles[x]);
+				STARPU_ASSERT_MSG(mpi_tag >= 0, "Invalid tag for data handle");
+				if ((rank == root) && (owner != root))
+				{
+					callback_arg->count ++;
+				}
+				if ((rank != root) && (owner == rank))
+				{
+					callback_arg->count ++;
+				}
+			}
+		}
+
+		if (callback_arg->count == 0)
+		{
+			/* This node takes part in no transfer: the completion
+			 * callback would never run, so fire the user callback
+			 * now and avoid leaking callback_arg */
+			callback_arg->callback(callback_arg->arg);
+			free(callback_arg);
+			callback_arg = NULL;
+			callback_func = NULL;
+		}
+	}
+
+	for(x = 0; x < count ; x++)
+	{
+		if (data_handles[x])
+		{
+			int owner = starpu_mpi_data_get_rank(data_handles[x]);
+			int mpi_tag = starpu_mpi_data_get_tag(data_handles[x]);
+			STARPU_ASSERT_MSG(mpi_tag >= 0, "Invalid tag for data handle");
+			if ((rank == root) && (owner != root))
+			{
+				starpu_mpi_irecv_detached(data_handles[x], owner, mpi_tag, comm, callback_func, callback_arg);
+			}
+			if ((rank != root) && (owner == rank))
+			{
+				starpu_mpi_isend_detached(data_handles[x], root, mpi_tag, comm, callback_func, callback_arg);
+			}
+		}
+	}
+	return 0;
+}

+ 245 - 0
nmad/src/starpu_mpi_datatype.c

@@ -0,0 +1,245 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2011  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi_datatype.h>
+
+/* Builder / destructor signatures for the MPI datatype matching a given
+ * StarPU data interface */
+typedef void (*handle_to_datatype_func)(starpu_data_handle_t, MPI_Datatype *);
+typedef void (*handle_free_datatype_func)(MPI_Datatype *);
+
+/*
+ * 	Matrix
+ */
+
+/* Build an MPI datatype describing a (possibly strided) matrix: ny rows of
+ * nx*elemsize bytes each, separated by a stride of ld*elemsize bytes. */
+static void handle_to_datatype_matrix(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+{
+	int ret;
+
+	unsigned nx = starpu_matrix_get_nx(data_handle);
+	unsigned ny = starpu_matrix_get_ny(data_handle);
+	unsigned ld = starpu_matrix_get_local_ld(data_handle);
+	size_t elemsize = starpu_matrix_get_elemsize(data_handle);
+
+	ret = MPI_Type_vector(ny, nx*elemsize, ld*elemsize, MPI_BYTE, datatype);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_vector failed");
+
+	ret = MPI_Type_commit(datatype);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_commit failed");
+}
+
+/*
+ * 	Block
+ */
+
+/* Build an MPI datatype describing a 3D block: nz 2D layers (each ny rows of
+ * nx*elemsize bytes with row stride ldy*elemsize), with a layer stride of
+ * ldz*elemsize bytes. */
+static void handle_to_datatype_block(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+{
+	int ret;
+
+	unsigned nx = starpu_block_get_nx(data_handle);
+	unsigned ny = starpu_block_get_ny(data_handle);
+	unsigned nz = starpu_block_get_nz(data_handle);
+	unsigned ldy = starpu_block_get_local_ldy(data_handle);
+	unsigned ldz = starpu_block_get_local_ldz(data_handle);
+	size_t elemsize = starpu_block_get_elemsize(data_handle);
+
+	MPI_Datatype datatype_2dlayer;
+	ret = MPI_Type_vector(ny, nx*elemsize, ldy*elemsize, MPI_BYTE, &datatype_2dlayer);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_vector failed");
+
+	ret = MPI_Type_commit(&datatype_2dlayer);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_commit failed");
+
+	ret = MPI_Type_hvector(nz, 1, ldz*elemsize, datatype_2dlayer, datatype);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_hvector failed");
+
+	ret = MPI_Type_commit(datatype);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_commit failed");
+
+	/* The intermediate layer type is no longer needed once the outer type
+	 * has been built; per the MPI standard, freeing it does not affect
+	 * datatypes derived from it.  Not freeing it leaked one MPI datatype
+	 * per call. */
+	ret = MPI_Type_free(&datatype_2dlayer);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_free failed");
+}
+
+/*
+ * 	Vector
+ */
+
+/* Build an MPI datatype for a vector: nx contiguous elements of elemsize
+ * bytes, expressed as raw bytes. */
+static void handle_to_datatype_vector(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+{
+	size_t elemsize = starpu_vector_get_elemsize(data_handle);
+	unsigned nx = starpu_vector_get_nx(data_handle);
+	int ret;
+
+	ret = MPI_Type_contiguous(nx*elemsize, MPI_BYTE, datatype);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_contiguous failed");
+	ret = MPI_Type_commit(datatype);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_commit failed");
+}
+
+/*
+ * 	Variable
+ */
+
+/* Build an MPI datatype for a single variable: elemsize raw bytes. */
+static void handle_to_datatype_variable(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+{
+	size_t elemsize = starpu_variable_get_elemsize(data_handle);
+	int ret;
+
+	ret = MPI_Type_contiguous(elemsize, MPI_BYTE, datatype);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_contiguous failed");
+	ret = MPI_Type_commit(datatype);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_commit failed");
+}
+
+/*
+ * 	Void
+ */
+
+/* Build an (empty) MPI datatype for the void interface: zero bytes are
+ * actually transferred, but a committed datatype is still needed so the
+ * generic send/receive path can treat all interfaces uniformly. */
+static void handle_to_datatype_void(starpu_data_handle_t data_handle STARPU_ATTRIBUTE_UNUSED, MPI_Datatype *datatype)
+{
+	int ret;
+
+	ret = MPI_Type_contiguous(0, MPI_BYTE, datatype);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_contiguous failed");
+
+	ret = MPI_Type_commit(datatype);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_commit failed");
+}
+
+/*
+ *	Generic
+ */
+
+/* Datatype builder per StarPU interface id.  NULL entries mean no
+ * StarPU-provided MPI datatype exists for that interface; hitting one
+ * triggers the assertion in _starpu_mpi_handle_allocate_datatype. */
+static handle_to_datatype_func handle_to_datatype_funcs[STARPU_MAX_INTERFACE_ID] =
+{
+	[STARPU_MATRIX_INTERFACE_ID]	= handle_to_datatype_matrix,
+	[STARPU_BLOCK_INTERFACE_ID]	= handle_to_datatype_block,
+	[STARPU_VECTOR_INTERFACE_ID]	= handle_to_datatype_vector,
+	[STARPU_CSR_INTERFACE_ID]	= NULL,
+	[STARPU_BCSR_INTERFACE_ID]	= NULL,
+	[STARPU_VARIABLE_INTERFACE_ID]	= handle_to_datatype_variable,
+	[STARPU_VOID_INTERFACE_ID]	= handle_to_datatype_void,
+	[STARPU_MULTIFORMAT_INTERFACE_ID] = NULL,
+};
+
+/* Build the MPI datatype matching data_handle's interface.  For interfaces
+ * StarPU knows about, a committed derived datatype is returned and
+ * *user_datatype is set to 0; for application-defined interfaces, MPI_BYTE
+ * is returned and *user_datatype is set to 1. */
+void _starpu_mpi_handle_allocate_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype, int *user_datatype)
+{
+	enum starpu_data_interface_id id = starpu_data_get_interface_id(data_handle);
+
+	if (id >= STARPU_MAX_INTERFACE_ID)
+	{
+		/* The datatype is not predefined by StarPU */
+		*datatype = MPI_BYTE;
+		*user_datatype = 1;
+		return;
+	}
+
+	handle_to_datatype_func func = handle_to_datatype_funcs[id];
+	STARPU_ASSERT_MSG(func, "Handle To Datatype Function not defined for StarPU data interface %d", id);
+	func(data_handle, datatype);
+	*user_datatype = 0;
+}
+
+/* Free a non-nested datatype (vector, matrix, variable, void). */
+static void _starpu_mpi_handle_free_simple_datatype(MPI_Datatype *datatype)
+{
+	MPI_Type_free(datatype);
+}
+
+/* Recursively free a derived MPI datatype together with the component
+ * datatypes it was built from.  MPI_Type_get_contents returns fresh handles
+ * on the component types, which must themselves be freed; predefined
+ * (MPI_COMBINER_NAMED) types are left alone, which also terminates the
+ * recursion.
+ * NOTE(review): the three malloc results are unchecked — on OOM this would
+ * crash in MPI_Type_get_contents; confirm whether an assertion is wanted. */
+static void _starpu_mpi_handle_free_complex_datatype(MPI_Datatype *datatype)
+{
+	int num_ints, num_adds, num_datatypes, combiner, i;
+	int *array_of_ints;
+	MPI_Aint *array_of_adds;
+	MPI_Datatype *array_of_datatypes;
+
+	MPI_Type_get_envelope(*datatype, &num_ints, &num_adds, &num_datatypes, &combiner);
+	if (combiner != MPI_COMBINER_NAMED)
+	{
+		array_of_ints = (int *) malloc(num_ints * sizeof(int));
+		array_of_adds = (MPI_Aint *) malloc(num_adds * sizeof(MPI_Aint));
+		array_of_datatypes = (MPI_Datatype *) malloc(num_datatypes * sizeof(MPI_Datatype));
+		MPI_Type_get_contents(*datatype, num_ints, num_adds, num_datatypes, array_of_ints, array_of_adds, array_of_datatypes);
+		for(i=0 ; i<num_datatypes ; i++)
+		{
+			_starpu_mpi_handle_free_complex_datatype(&array_of_datatypes[i]);
+		}
+		MPI_Type_free(datatype);
+		free(array_of_ints);
+		free(array_of_adds);
+		free(array_of_datatypes);
+	}
+}
+
+/* Datatype destructor per StarPU interface id; must stay in sync with
+ * handle_to_datatype_funcs above (block uses the recursive variant since
+ * its datatype nests a 2D-layer type). */
+static handle_free_datatype_func handle_free_datatype_funcs[STARPU_MAX_INTERFACE_ID] =
+{
+	[STARPU_MATRIX_INTERFACE_ID]	= _starpu_mpi_handle_free_simple_datatype,
+	[STARPU_BLOCK_INTERFACE_ID]	= _starpu_mpi_handle_free_complex_datatype,
+	[STARPU_VECTOR_INTERFACE_ID]	= _starpu_mpi_handle_free_simple_datatype,
+	[STARPU_CSR_INTERFACE_ID]	= NULL,
+	[STARPU_BCSR_INTERFACE_ID]	= NULL,
+	[STARPU_VARIABLE_INTERFACE_ID]	= _starpu_mpi_handle_free_simple_datatype,
+	[STARPU_VOID_INTERFACE_ID]      = _starpu_mpi_handle_free_simple_datatype,
+	[STARPU_MULTIFORMAT_INTERFACE_ID] = NULL,
+};
+
+/* Release a datatype previously built by _starpu_mpi_handle_allocate_datatype.
+ * Application-defined interfaces (id >= STARPU_MAX_INTERFACE_ID) were given
+ * plain MPI_BYTE and need no cleanup. */
+void _starpu_mpi_handle_free_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+{
+	enum starpu_data_interface_id id = starpu_data_get_interface_id(data_handle);
+
+	if (id >= STARPU_MAX_INTERFACE_ID)
+		return;
+
+	handle_free_datatype_func func = handle_free_datatype_funcs[id];
+	STARPU_ASSERT_MSG(func, "Handle free datatype function not defined for StarPU data interface %d", id);
+	func(datatype);
+}
+
+/* Return a printable name for a predefined MPI datatype (debug/trace
+ * helper); any datatype not in the list is reported as user-defined. */
+char *_starpu_mpi_datatype(MPI_Datatype datatype)
+{
+     if (datatype == MPI_DATATYPE_NULL) return "MPI_DATATYPE_NULL";
+     if (datatype == MPI_CHAR) return "MPI_CHAR";
+     if (datatype == MPI_UNSIGNED_CHAR) return "MPI_UNSIGNED_CHAR";
+     if (datatype == MPI_BYTE) return "MPI_BYTE";
+     if (datatype == MPI_SHORT) return "MPI_SHORT";
+     if (datatype == MPI_UNSIGNED_SHORT) return "MPI_UNSIGNED_SHORT";
+     if (datatype == MPI_INT) return "MPI_INT";
+     if (datatype == MPI_UNSIGNED) return "MPI_UNSIGNED";
+     if (datatype == MPI_LONG) return "MPI_LONG";
+     if (datatype == MPI_UNSIGNED_LONG) return "MPI_UNSIGNED_LONG";
+     if (datatype == MPI_FLOAT) return "MPI_FLOAT";
+     if (datatype == MPI_DOUBLE) return "MPI_DOUBLE";
+     if (datatype == MPI_LONG_DOUBLE) return "MPI_LONG_DOUBLE";
+     if (datatype == MPI_LONG_LONG) return "MPI_LONG_LONG";
+     if (datatype == MPI_LONG_INT) return "MPI_LONG_INT";
+     if (datatype == MPI_SHORT_INT) return "MPI_SHORT_INT";
+     if (datatype == MPI_FLOAT_INT) return "MPI_FLOAT_INT";
+     if (datatype == MPI_DOUBLE_INT) return "MPI_DOUBLE_INT";
+     if (datatype == MPI_2INT) return "MPI_2INT";
+     if (datatype == MPI_2DOUBLE_PRECISION) return "MPI_2DOUBLE_PRECISION";
+     if (datatype == MPI_COMPLEX) return "MPI_COMPLEX";
+     if (datatype == MPI_DOUBLE_COMPLEX) return "MPI_DOUBLE_COMPLEX";
+     if (datatype == MPI_LOGICAL) return "MPI_LOGICAL";
+     if (datatype == MPI_REAL) return "MPI_REAL";
+     if (datatype == MPI_REAL4) return "MPI_REAL4";
+     if (datatype == MPI_REAL8) return "MPI_REAL8";
+     if (datatype == MPI_DOUBLE_PRECISION) return "MPI_DOUBLE_PRECISION";
+     if (datatype == MPI_INTEGER) return "MPI_INTEGER";
+     if (datatype == MPI_INTEGER4) return "MPI_INTEGER4";
+     if (datatype == MPI_PACKED) return "MPI_PACKED";
+     return "User defined MPI Datatype";
+}

+ 35 - 0
nmad/src/starpu_mpi_datatype.h

@@ -0,0 +1,35 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2011  Université de Bordeaux
+ * Copyright (C) 2010, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_DATATYPE_H__
+#define __STARPU_MPI_DATATYPE_H__
+
+#include <starpu_mpi.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Build the MPI datatype matching data_handle's interface; *user_datatype is
+ * set to 0 for StarPU-provided datatypes, 1 when MPI_BYTE is used. */
+void _starpu_mpi_handle_allocate_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype, int *user_datatype);
+/* Release a datatype built by _starpu_mpi_handle_allocate_datatype. */
+void _starpu_mpi_handle_free_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype);
+/* Printable name of a predefined MPI datatype (debug helper). */
+char *_starpu_mpi_datatype(MPI_Datatype datatype);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __STARPU_MPI_DATATYPE_H__

+ 116 - 0
nmad/src/starpu_mpi_fxt.h

@@ -0,0 +1,116 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_FXT_H__
+#define __STARPU_MPI_FXT_H__
+
+#include <starpu.h>
+#include <common/config.h>
+#include <common/fxt.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* FxT event codes for StarPU-MPI communication tracing.  The TRACE_MPI_*
+ * macros below emit these probes when FxT support is compiled in, and
+ * expand to no-ops otherwise. */
+#define FUT_MPI_START				0x5201
+#define FUT_MPI_STOP				0x5202
+#define FUT_MPI_BARRIER				0x5203
+#define FUT_MPI_ISEND_SUBMIT_BEGIN		0x5204
+#define FUT_MPI_ISEND_SUBMIT_END		0x5205
+#define FUT_MPI_IRECV_SUBMIT_BEGIN		0x5206
+#define FUT_MPI_IRECV_SUBMIT_END		0x5207
+#define FUT_MPI_ISEND_COMPLETE_BEGIN		0x5208
+#define FUT_MPI_ISEND_COMPLETE_END		0x5209
+#define FUT_MPI_IRECV_COMPLETE_BEGIN		0x5210
+#define FUT_MPI_IRECV_COMPLETE_END		0x5211
+#define FUT_MPI_SLEEP_BEGIN			0x5212
+#define FUT_MPI_SLEEP_END			0x5213
+#define FUT_MPI_DTESTING_BEGIN			0x5214
+#define FUT_MPI_DTESTING_END			0x5215
+#define FUT_MPI_UTESTING_BEGIN			0x5216
+#define FUT_MPI_UTESTING_END			0x5217
+#define FUT_MPI_UWAIT_BEGIN			0x5218
+#define FUT_MPI_UWAIT_END			0x5219
+
+#ifdef STARPU_USE_FXT
+#define TRACE_MPI_START(rank, worldsize)	\
+	FUT_DO_PROBE3(FUT_MPI_START, (rank), (worldsize), _starpu_gettid());
+#define TRACE_MPI_STOP(rank, worldsize)	\
+	FUT_DO_PROBE3(FUT_MPI_STOP, (rank), (worldsize), _starpu_gettid());
+#define TRACE_MPI_BARRIER(rank, worldsize, key)	\
+	FUT_DO_PROBE4(FUT_MPI_BARRIER, (rank), (worldsize), (key), _starpu_gettid());
+#define TRACE_MPI_ISEND_SUBMIT_BEGIN(dest, mpi_tag, size)	\
+	FUT_DO_PROBE4(FUT_MPI_ISEND_SUBMIT_BEGIN, (dest), (mpi_tag), (size), _starpu_gettid());
+#define TRACE_MPI_ISEND_SUBMIT_END(dest, mpi_tag, size)	\
+	FUT_DO_PROBE4(FUT_MPI_ISEND_SUBMIT_END, (dest), (mpi_tag), (size), _starpu_gettid());
+#define TRACE_MPI_IRECV_SUBMIT_BEGIN(src, mpi_tag)	\
+	FUT_DO_PROBE3(FUT_MPI_IRECV_SUBMIT_BEGIN, (src), (mpi_tag), _starpu_gettid());
+#define TRACE_MPI_IRECV_SUBMIT_END(src, mpi_tag)	\
+	FUT_DO_PROBE3(FUT_MPI_IRECV_SUBMIT_END, (src), (mpi_tag), _starpu_gettid());
+#define TRACE_MPI_ISEND_COMPLETE_BEGIN(dest, mpi_tag, size)	\
+	FUT_DO_PROBE4(FUT_MPI_ISEND_COMPLETE_BEGIN, (dest), (mpi_tag), (size), _starpu_gettid());
+#define TRACE_MPI_ISEND_COMPLETE_END(dest, mpi_tag, size)	\
+	FUT_DO_PROBE4(FUT_MPI_ISEND_COMPLETE_END, (dest), (mpi_tag), (size), _starpu_gettid());
+#define TRACE_MPI_IRECV_COMPLETE_BEGIN(src, mpi_tag)	\
+	FUT_DO_PROBE3(FUT_MPI_IRECV_COMPLETE_BEGIN, (src), (mpi_tag), _starpu_gettid());
+#define TRACE_MPI_IRECV_COMPLETE_END(src, mpi_tag)	\
+	FUT_DO_PROBE3(FUT_MPI_IRECV_COMPLETE_END, (src), (mpi_tag), _starpu_gettid());
+#define TRACE_MPI_SLEEP_BEGIN()	\
+	FUT_DO_PROBE1(FUT_MPI_SLEEP_BEGIN, _starpu_gettid());
+#define TRACE_MPI_SLEEP_END()	\
+	FUT_DO_PROBE1(FUT_MPI_SLEEP_END, _starpu_gettid());
+#define TRACE_MPI_DTESTING_BEGIN()	\
+	FUT_DO_PROBE1(FUT_MPI_DTESTING_BEGIN,  _starpu_gettid());
+#define TRACE_MPI_DTESTING_END()	\
+	FUT_DO_PROBE1(FUT_MPI_DTESTING_END, _starpu_gettid());
+#define TRACE_MPI_UTESTING_BEGIN(src, mpi_tag)	\
+	FUT_DO_PROBE3(FUT_MPI_UTESTING_BEGIN, (src), (mpi_tag),  _starpu_gettid());
+#define TRACE_MPI_UTESTING_END(src, mpi_tag)	\
+	FUT_DO_PROBE3(FUT_MPI_UTESTING_END, (src), (mpi_tag), _starpu_gettid());
+#define TRACE_MPI_UWAIT_BEGIN(src, mpi_tag)	\
+	FUT_DO_PROBE3(FUT_MPI_UWAIT_BEGIN, (src), (mpi_tag),  _starpu_gettid());
+#define TRACE_MPI_UWAIT_END(src, mpi_tag)	\
+	FUT_DO_PROBE3(FUT_MPI_UWAIT_END, (src), (mpi_tag), _starpu_gettid());
+/* NOTE(review): bare "#define TRACE" with no body below looks like a
+ * leftover — confirm it is intentional before removing. */
+#define TRACE
+#else
+#define TRACE_MPI_START(a, b)				do {} while(0);
+#define TRACE_MPI_STOP(a, b)				do {} while(0);
+#define TRACE_MPI_BARRIER(a, b, c)			do {} while(0);
+#define TRACE_MPI_ISEND_SUBMIT_BEGIN(a, b, c)		do {} while(0);
+#define TRACE_MPI_ISEND_SUBMIT_END(a, b, c)		do {} while(0);
+#define TRACE_MPI_IRECV_SUBMIT_BEGIN(a, b)		do {} while(0);
+#define TRACE_MPI_IRECV_SUBMIT_END(a, b)		do {} while(0);
+#define TRACE_MPI_ISEND_COMPLETE_BEGIN(a, b, c)		do {} while(0);
+#define TRACE_MPI_ISEND_COMPLETE_END(a, b, c)		do {} while(0);
+#define TRACE_MPI_IRECV_COMPLETE_BEGIN(a, b)		do {} while(0);
+#define TRACE_MPI_IRECV_COMPLETE_END(a, b)		do {} while(0);
+#define TRACE_MPI_SLEEP_BEGIN()				do {} while(0);
+#define TRACE_MPI_SLEEP_END()				do {} while(0);
+#define TRACE_MPI_DTESTING_BEGIN()			do {} while(0);
+#define TRACE_MPI_DTESTING_END()			do {} while(0);
+#define TRACE_MPI_UTESTING_BEGIN(a, b)			do {} while(0);
+#define TRACE_MPI_UTESTING_END(a, b)			do {} while(0);
+#define TRACE_MPI_UWAIT_BEGIN(a, b)			do {} while(0);
+#define TRACE_MPI_UWAIT_END(a, b)			do {} while(0);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif // __STARPU_MPI_FXT_H__

+ 105 - 0
nmad/src/starpu_mpi_helper.c

@@ -0,0 +1,105 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+
+/* Detached-communication callback: read the application tag out of the
+ * heap-allocated starpu_tag_t pointed to by 'arg', release the storage,
+ * then notify the tag so dependent tasks can proceed. */
+static void starpu_mpi_unlock_tag_callback(void *arg)
+{
+	starpu_tag_t tag = *(starpu_tag_t *) arg;
+
+	free(arg);
+	starpu_tag_notify_from_apps(tag);
+}
+
+/* Post a detached send of 'data_handle' to node 'dest' with MPI tag
+ * 'mpi_tag'; when the send completes, the application tag 'tag' is
+ * notified.  Returns the status of the underlying detached send. */
+int starpu_mpi_isend_detached_unlock_tag(starpu_data_handle_t data_handle,
+				int dest, int mpi_tag, MPI_Comm comm, starpu_tag_t tag)
+{
+	starpu_tag_t *tag_copy = malloc(sizeof *tag_copy);
+
+	*tag_copy = tag;
+	return starpu_mpi_isend_detached(data_handle, dest, mpi_tag, comm,
+					 starpu_mpi_unlock_tag_callback, tag_copy);
+}
+
+
+/* Post a detached receive of 'data_handle' from node 'source' with MPI
+ * tag 'mpi_tag'; when the receive completes, the application tag 'tag'
+ * is notified.  Returns the status of the underlying detached receive. */
+int starpu_mpi_irecv_detached_unlock_tag(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, starpu_tag_t tag)
+{
+	starpu_tag_t *tag_copy = malloc(sizeof *tag_copy);
+
+	*tag_copy = tag;
+	return starpu_mpi_irecv_detached(data_handle, source, mpi_tag, comm,
+					 starpu_mpi_unlock_tag_callback, tag_copy);
+}
+
+/* Shared completion state for the *_array_detached_unlock_tag helpers:
+ * 'array_size' counts the requests still in flight; 'tag' is the
+ * application tag notified when the counter reaches zero. */
+struct arg_array
+{
+	int array_size;
+	starpu_tag_t tag;
+};
+
+/* Callback shared by all requests of one array operation: atomically
+ * decrement the in-flight counter; the last completing request notifies
+ * the tag and frees the shared state.  Assumes the counter was
+ * initialized to a value > 0 (one per posted request). */
+static void starpu_mpi_array_unlock_callback(void *_arg)
+{
+	struct arg_array *arg = _arg;
+
+	int remaining = STARPU_ATOMIC_ADD(&arg->array_size, -1);
+
+	if (remaining == 0)
+	{
+		starpu_tag_notify_from_apps(arg->tag);
+		free(arg);
+	}
+}
+
+/* Post 'array_size' detached sends (element i of each array describes
+ * send i); once ALL of them have completed, notify the application tag
+ * 'tag'.  Always returns 0.
+ * Fix: with an empty array the original allocated the shared state,
+ * never posted any request, and therefore leaked the state and never
+ * notified the tag — notify immediately instead. */
+int starpu_mpi_isend_array_detached_unlock_tag(unsigned array_size,
+		starpu_data_handle_t *data_handle, int *dest, int *mpi_tag,
+		MPI_Comm *comm, starpu_tag_t tag)
+{
+	struct arg_array *arg;
+	unsigned elem;
+
+	if (array_size == 0)
+	{
+		/* Nothing to send: the unlock condition already holds. */
+		starpu_tag_notify_from_apps(tag);
+		return 0;
+	}
+
+	arg = malloc(sizeof(struct arg_array));
+	arg->array_size = array_size;
+	arg->tag = tag;
+
+	for (elem = 0; elem < array_size; elem++)
+	{
+		starpu_mpi_isend_detached(data_handle[elem], dest[elem],
+				mpi_tag[elem], comm[elem],
+				starpu_mpi_array_unlock_callback, arg);
+	}
+
+	return 0;
+}
+
+
+/* Post 'array_size' detached receives (element i of each array describes
+ * receive i); once ALL of them have completed, notify the application
+ * tag 'tag'.  Always returns 0.
+ * Fix: with an empty array the original leaked the shared state and the
+ * tag was never notified — notify immediately instead. */
+int starpu_mpi_irecv_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *source, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag)
+{
+	struct arg_array *arg;
+	unsigned elem;
+
+	if (array_size == 0)
+	{
+		/* Nothing to receive: the unlock condition already holds. */
+		starpu_tag_notify_from_apps(tag);
+		return 0;
+	}
+
+	arg = malloc(sizeof(struct arg_array));
+	arg->array_size = array_size;
+	arg->tag = tag;
+
+	for (elem = 0; elem < array_size; elem++)
+	{
+		starpu_mpi_irecv_detached(data_handle[elem], source[elem],
+				mpi_tag[elem], comm[elem],
+				starpu_mpi_array_unlock_callback, arg);
+	}
+
+	return 0;
+}

+ 25 - 0
nmad/src/starpu_mpi_private.c

@@ -0,0 +1,25 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2012, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/* Rank of the local process, cached for debug messages; -1 until first use. */
+int _starpu_debug_rank=-1;
+/* Verbosity threshold consumed by _STARPU_MPI_DEBUG. */
+int _starpu_debug_level=0;
+
+/* Set the verbosity threshold for _STARPU_MPI_DEBUG messages.
+ * NOTE(review): starpu_mpi_private.h declares this function only when
+ * STARPU_VERBOSE is defined, while it is defined here unconditionally —
+ * confirm which guard is intended. */
+void _starpu_mpi_set_debug_level(int level)
+{
+	_starpu_debug_level = level;
+}
+

+ 173 - 0
nmad/src/starpu_mpi_private.h

@@ -0,0 +1,173 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2012-2015  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_PRIVATE_H__
+#define __STARPU_MPI_PRIVATE_H__
+
+#include <starpu.h>
+#include <common/config.h>
+#include "starpu_mpi.h"
+#include "starpu_mpi_fxt.h"
+#include <common/list.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int _starpu_debug_rank;
+
+#ifdef STARPU_VERBOSE
+extern int _starpu_debug_level;
+void _starpu_mpi_set_debug_level(int level);
+#endif
+
+/* STARPU_MPI_ASSERT_MSG(cond, msg, ...): abort with a rank-prefixed
+ * message when 'cond' is false; compiled out under STARPU_NO_ASSERT.
+ * The nvcc/Windows variant crashes via a deliberate NULL store because
+ * assert() is presumably unusable there — TODO confirm. */
+#ifdef STARPU_NO_ASSERT
+#  define STARPU_MPI_ASSERT_MSG(x, msg, ...)	do { if (0) { (void) (x); }} while(0)
+#else
+#  if defined(__CUDACC__) && defined(STARPU_HAVE_WINDOWS)
+/* NOTE(review): this DEFINES (not just declares) _starpu_debug_rank in a
+ * header; if several translation units include it under this guard, that
+ * is a duplicate-symbol error — verify the intent. */
+int _starpu_debug_rank;
+#    define STARPU_MPI_ASSERT_MSG(x, msg, ...)									\
+	do													\
+	{ 													\
+		if (STARPU_UNLIKELY(!(x))) 									\
+		{												\
+			if (_starpu_debug_rank == -1) starpu_mpi_comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
+			fprintf(stderr, "\n[%d][starpu_mpi][%s][assert failure] " msg "\n\n", _starpu_debug_rank, __starpu_func__, ## __VA_ARGS__); *(int*)NULL = 0; \
+		} \
+	} while(0)
+#  else
+#    define STARPU_MPI_ASSERT_MSG(x, msg, ...)	\
+	do \
+	{ \
+		if (STARPU_UNLIKELY(!(x))) \
+		{ \
+			if (_starpu_debug_rank == -1) starpu_mpi_comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
+			fprintf(stderr, "\n[%d][starpu_mpi][%s][assert failure] " msg "\n\n", _starpu_debug_rank, __starpu_func__, ## __VA_ARGS__); \
+		} \
+		assert(x); \
+	} while(0)
+
+#  endif
+#endif
+	
+#define _STARPU_MPI_MALLOC(ptr, size) do { ptr = malloc(size); STARPU_MPI_ASSERT_MSG(ptr != NULL, "Cannot allocate %ld bytes\n", (long) size); } while (0)
+#define _STARPU_MPI_CALLOC(ptr, nmemb, size) do { ptr = calloc(nmemb, size); STARPU_MPI_ASSERT_MSG(ptr != NULL, "Cannot allocate %ld bytes\n", (long) (nmemb*size)); } while (0)
+#define _STARPU_MPI_REALLOC(ptr, size) do { ptr = realloc(ptr, size); STARPU_MPI_ASSERT_MSG(ptr != NULL, "Cannot reallocate %ld bytes\n", (long) size); } while (0)
+
+/* Rank-prefixed logging helpers.
+ * Fix: removed the stray ';' after each `while(0)` — call sites already
+ * supply their own semicolon (e.g. `_STARPU_MPI_DEBUG(1, "...\n");`), so
+ * the old form produced an extra empty statement and broke when the
+ * macro was used inside an un-braced if/else.
+ *
+ * _STARPU_MPI_DEBUG(level, fmt, ...): emitted only in STARPU_VERBOSE
+ * builds when 'level' does not exceed the runtime debug level. */
+#ifdef STARPU_VERBOSE
+#  define _STARPU_MPI_DEBUG(level, fmt, ...) \
+	do \
+	{								\
+		if (!_starpu_silent && level <= _starpu_debug_level)	\
+		{							\
+			if (_starpu_debug_rank == -1) MPI_Comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
+			fprintf(stderr, "%*s[%d][starpu_mpi][%s] " fmt , (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__ ,## __VA_ARGS__); \
+			fflush(stderr); \
+		}			\
+	} while(0)
+#else
+#  define _STARPU_MPI_DEBUG(level, fmt, ...)
+#endif
+
+/* Unconditional rank-prefixed message (muted by STARPU_SILENT). */
+#define _STARPU_MPI_DISP(fmt, ...) \
+	do \
+	{ \
+		if (!_starpu_silent) \
+		{ \
+			if (_starpu_debug_rank == -1) MPI_Comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
+			fprintf(stderr, "%*s[%d][starpu_mpi][%s] " fmt , (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__ ,## __VA_ARGS__); \
+			fflush(stderr); \
+		} \
+	} while(0)
+/* Always-on message carrying the source location, for warnings/errors. */
+#define _STARPU_MPI_MSG(fmt, ...) \
+	do \
+	{ \
+		if (_starpu_debug_rank == -1) MPI_Comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
+		fprintf(stderr, "[%d][starpu_mpi][%s:%d] " fmt , _starpu_debug_rank, __starpu_func__ , __LINE__ ,## __VA_ARGS__); \
+		fflush(stderr); \
+	} while(0)
+
+#ifdef STARPU_VERBOSE0
+#  define _STARPU_MPI_LOG_IN()             do { if (!_starpu_silent) { \
+                                               if (_starpu_debug_rank == -1) MPI_Comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank);                        \
+                                               fprintf(stderr, "%*s[%d][starpu_mpi][%s] -->\n", (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__ ); \
+                                               fflush(stderr); }} while(0)
+#  define _STARPU_MPI_LOG_OUT()            do { if (!_starpu_silent) { \
+                                               if (_starpu_debug_rank == -1) MPI_Comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank);                        \
+                                               fprintf(stderr, "%*s[%d][starpu_mpi][%s] <--\n", (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__ ); \
+                                               fflush(stderr); }} while(0)
+#else
+#  define _STARPU_MPI_LOG_IN()
+#  define _STARPU_MPI_LOG_OUT()
+#endif
+
+/* Kind of operation carried by a _starpu_mpi_req. */
+enum _starpu_mpi_request_type
+{
+	SEND_REQ=0,
+	RECV_REQ=1,
+	WAIT_REQ=2,
+	TEST_REQ=3,
+	BARRIER_REQ=4,
+	PROBE_REQ=5
+};
+
+/* One MPI request tracked by the StarPU-MPI runtime (send, recv, wait,
+ * test, barrier or probe), generated as a list element type by LIST_TYPE. */
+LIST_TYPE(_starpu_mpi_req,
+	/* description of the data at StarPU level */
+	starpu_data_handle_t data_handle;
+
+	/* description of the data to be sent/received */
+	MPI_Datatype datatype;
+	void *ptr;
+	starpu_ssize_t count;
+	int user_datatype;
+
+	/* who are we talking to ? */
+	int srcdst;
+	int mpi_tag;
+	MPI_Comm comm;
+
+	/* function that actually performs this request */
+	void (*func)(struct _starpu_mpi_req *);
+
+	MPI_Status *status;
+	MPI_Request request;
+	int *flag;
+	unsigned sync;
+
+	/* result code, guarded by req_mutex/req_cond */
+	int ret;
+	starpu_pthread_mutex_t req_mutex;
+	starpu_pthread_cond_t req_cond;
+
+	enum _starpu_mpi_request_type request_type; /* see enum _starpu_mpi_request_type */
+
+	unsigned submitted;
+	unsigned completed;
+
+	/* In the case of a Wait/Test request, we are going to post a request
+	 * to test the completion of another request */
+	struct _starpu_mpi_req *other_request;
+
+	/* in the case of detached requests */
+	unsigned detached;
+	void *callback_arg;
+	void (*callback)(void *);
+
+        /* in the case of user-defined datatypes, we need to send the size of the data */
+	MPI_Request size_req;
+);
+
+/* Per-handle MPI metadata: the communication tag, the owner rank and the
+ * communicator the handle is registered on. */
+struct _starpu_mpi_data
+{
+	int tag;
+	int rank;
+	MPI_Comm comm;
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __STARPU_MPI_PRIVATE_H__

+ 117 - 0
nmad/src/starpu_mpi_select_node.c

@@ -0,0 +1,117 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014, 2015, 2016  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdarg.h>
+#include <mpi.h>
+
+#include <starpu.h>
+#include <starpu_mpi.h>
+#include <starpu_data.h>
+#include <starpu_mpi_private.h>
+#include <starpu_mpi_select_node.h>
+#include <datawizard/coherency.h>
+
+static int _current_policy = STARPU_MPI_NODE_SELECTION_MOST_R_DATA;
+static int _last_predefined_policy = STARPU_MPI_NODE_SELECTION_MOST_R_DATA;
+static starpu_mpi_select_node_policy_func_t _policies[_STARPU_MPI_NODE_SELECTION_MAX_POLICY];
+
+int _starpu_mpi_select_node_with_most_R_data(int me, int nb_nodes, struct starpu_data_descr *descr, int nb_data);
+
+/* Install the built-in node-selection policies and clear every
+ * user-registrable slot of the policy table. */
+void _starpu_mpi_select_node_init()
+{
+	int slot;
+
+	_policies[STARPU_MPI_NODE_SELECTION_MOST_R_DATA] = _starpu_mpi_select_node_with_most_R_data;
+	for (slot = _last_predefined_policy + 1; slot < _STARPU_MPI_NODE_SELECTION_MAX_POLICY; slot++)
+	{
+		_policies[slot] = NULL;
+	}
+}
+
+/* Return the identifier of the node-selection policy currently in use. */
+int starpu_mpi_node_selection_get_current_policy()
+{
+	int policy = _current_policy;
+	return policy;
+}
+
+/* Make 'policy' the default node-selection policy.  Returns 0.
+ * Fix: validate the index before dereferencing _policies[] — an
+ * out-of-range value used to read out of bounds before being caught. */
+int starpu_mpi_node_selection_set_current_policy(int policy)
+{
+	STARPU_ASSERT_MSG(policy >= 0 && policy < _STARPU_MPI_NODE_SELECTION_MAX_POLICY, "Policy %d invalid.\n", policy);
+	STARPU_ASSERT_MSG(_policies[policy] != NULL, "Policy %d invalid.\n", policy);
+	_current_policy = policy;
+	return 0;
+}
+
+/* Register a user-provided node-selection policy in the first free slot
+ * after the predefined ones; return the slot identifier. */
+int starpu_mpi_node_selection_register_policy(starpu_mpi_select_node_policy_func_t policy_func)
+{
+	int slot;
+
+	/* Scan the user-registrable part of the table for a free entry. */
+	for (slot = _last_predefined_policy + 1; slot < _STARPU_MPI_NODE_SELECTION_MAX_POLICY; slot++)
+	{
+		if (_policies[slot] == NULL)
+			break;
+	}
+	STARPU_ASSERT_MSG(slot < _STARPU_MPI_NODE_SELECTION_MAX_POLICY, "No unused policy available. Unregister existing policies before registering a new one.");
+	_policies[slot] = policy_func;
+	return slot;
+}
+
+/* Remove a previously registered user policy.  Returns 0.
+ * Fix: also check the upper bound — an identifier >= the table size used
+ * to write out of bounds. */
+int starpu_mpi_node_selection_unregister_policy(int policy)
+{
+	STARPU_ASSERT_MSG(policy > _last_predefined_policy && policy < _STARPU_MPI_NODE_SELECTION_MAX_POLICY, "Policy %d invalid. Only user-registered policies can be unregistered\n", policy);
+	_policies[policy] = NULL;
+	return 0;
+}
+
+/* Built-in policy: pick the node owning the largest total amount of data
+ * accessed in STARPU_R mode; with no R data (or all sizes 0) node 0 wins.
+ * Fix: skip handles whose rank is unset (-1) and assert ranks fit the
+ * accumulation table — both used to index size_on_nodes out of bounds. */
+int _starpu_mpi_select_node_with_most_R_data(int me, int nb_nodes, struct starpu_data_descr *descr, int nb_data)
+{
+	size_t *size_on_nodes;
+	size_t max_size;
+	int i;
+	int xrank = 0;
+
+	(void)me;
+	_STARPU_MPI_CALLOC(size_on_nodes, nb_nodes, sizeof(size_t));
+
+	for(i= 0 ; i<nb_data ; i++)
+	{
+		starpu_data_handle_t data = descr[i].handle;
+		enum starpu_data_access_mode mode = descr[i].mode;
+		if (mode & STARPU_R)
+		{
+			int rank = starpu_data_get_rank(data);
+			if (rank == -1)
+				/* Data without a registered owner cannot vote. */
+				continue;
+			STARPU_ASSERT_MSG(rank < nb_nodes, "Invalid rank %d (only %d nodes)", rank, nb_nodes);
+			size_on_nodes[rank] += data->ops->get_size(data);
+		}
+	}
+
+	max_size = 0;
+	for(i=0 ; i<nb_nodes ; i++)
+	{
+		if (size_on_nodes[i] > max_size)
+		{
+			max_size = size_on_nodes[i];
+			xrank = i;
+		}
+	}
+
+	free(size_on_nodes);
+	return xrank;
+}
+
+/* Resolve 'policy' (STARPU_MPI_NODE_SELECTION_CURRENT_POLICY maps to the
+ * current default) and run the corresponding selection function.
+ * Fix: also reject negative identifiers, which used to index the policy
+ * table out of bounds. */
+int _starpu_mpi_select_node(int me, int nb_nodes, struct starpu_data_descr *descr, int nb_data, int policy)
+{
+	int ppolicy = policy == STARPU_MPI_NODE_SELECTION_CURRENT_POLICY ? _current_policy : policy;
+	STARPU_ASSERT_MSG(ppolicy >= 0 && ppolicy < _STARPU_MPI_NODE_SELECTION_MAX_POLICY, "Invalid policy %d\n", ppolicy);
+	STARPU_ASSERT_MSG(_policies[ppolicy], "Unregistered policy %d\n", ppolicy);
+	starpu_mpi_select_node_policy_func_t func = _policies[ppolicy];
+	return func(me, nb_nodes, descr, nb_data);
+}

+ 36 - 0
nmad/src/starpu_mpi_select_node.h

@@ -0,0 +1,36 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014, 2015, 2017  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_SELECT_NODE_H__
+#define __STARPU_MPI_SELECT_NODE_H__
+
+#include <mpi.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#define _STARPU_MPI_NODE_SELECTION_MAX_POLICY 24
+
+void _starpu_mpi_select_node_init();
+int _starpu_mpi_select_node(int me, int nb_nodes, struct starpu_data_descr *descr, int nb_data, int policy);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __STARPU_MPI_SELECT_NODE_H__

+ 94 - 0
nmad/src/starpu_mpi_stats.c

@@ -0,0 +1,94 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi_stats.h>
+#include <common/config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <starpu_mpi_private.h>
+
+/* measure the amount of data transfers between each pair of MPI nodes */
+static size_t *comm_amount;
+static int world_size;
+static int stats_enabled=0;
+
+/* Enable communication statistics when the STARPU_COMM_STATS environment
+ * variable is set, and allocate one byte counter per node of 'comm'. */
+void _starpu_mpi_comm_amounts_init(MPI_Comm comm)
+{
+	stats_enabled = starpu_get_env_number("STARPU_COMM_STATS");
+	if (stats_enabled == -1)
+		stats_enabled = 0;
+	if (!stats_enabled)
+		return;
+
+	if (!_starpu_silent)
+		fprintf(stderr,"Warning: StarPU is executed with STARPU_COMM_STATS=1, which slows down a bit\n");
+
+	MPI_Comm_size(comm, &world_size);
+	_STARPU_MPI_DEBUG(1, "allocating for %d nodes\n", world_size);
+
+	comm_amount = (size_t *) calloc(world_size, sizeof(size_t));
+}
+
+/* Release the statistics table; a no-op when stats are disabled.
+ * Fix: reset the pointer so a second call does not double-free. */
+void _starpu_mpi_comm_amounts_free()
+{
+	if (stats_enabled == 0) return;
+	free(comm_amount);
+	comm_amount = NULL;
+}
+
+/* Account 'count' items of 'datatype' sent from the local rank to 'dst'. */
+void _starpu_mpi_comm_amounts_inc(MPI_Comm comm, unsigned dst, MPI_Datatype datatype, int count)
+{
+	int me, type_size;
+
+	if (!stats_enabled)
+		return;
+
+	MPI_Comm_rank(comm, &me);
+	MPI_Type_size(datatype, &type_size);
+
+	_STARPU_MPI_DEBUG(1, "[%d] adding %d to %d\n", me, count*type_size, dst);
+
+	comm_amount[dst] += count*type_size;
+}
+
+/* Copy the per-node byte counters into the caller-provided array, which
+ * must hold at least world_size entries.
+ * Note: when stats are disabled the output buffer is left untouched. */
+void starpu_mpi_comm_amounts_retrieve(size_t *comm_amounts)
+{
+	if (stats_enabled == 0) return;
+	memcpy(comm_amounts, comm_amount, world_size * sizeof(size_t));
+}
+
+/* Print the total and per-destination communication volume of 'node' to
+ * stderr; a no-op when stats are disabled. */
+void _starpu_mpi_comm_amounts_display(int node)
+{
+	int peer;
+	size_t total = 0;
+
+	if (!stats_enabled)
+		return;
+
+	for (peer = 0; peer < world_size; peer++)
+		total += comm_amount[peer];
+
+	fprintf(stderr, "\n[starpu_comm_stats][%d] TOTAL:\t%f B\t%f MB\n", node, (float)total, (float)total/1024/1024);
+
+	for (peer = 0; peer < world_size; peer++)
+	{
+		if (comm_amount[peer])
+			fprintf(stderr, "[starpu_comm_stats][%d->%d]\t%f B\t%f MB\n",
+				node, peer, (float)comm_amount[peer], ((float)comm_amount[peer])/(1024*1024));
+	}
+}
+

+ 36 - 0
nmad/src/starpu_mpi_stats.h

@@ -0,0 +1,36 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_STATS_H__
+#define __STARPU_MPI_STATS_H__
+
+#include <stdlib.h>
+#include <mpi.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void _starpu_mpi_comm_amounts_init(MPI_Comm comm);
+void _starpu_mpi_comm_amounts_free();
+void _starpu_mpi_comm_amounts_inc(MPI_Comm comm, unsigned dst, MPI_Datatype datatype, int count);
+void _starpu_mpi_comm_amounts_display(int node);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __STARPU_MPI_STATS_H__

+ 775 - 0
nmad/src/starpu_mpi_task_insert.c

@@ -0,0 +1,775 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
+ * Copyright (C) 2011-2017  Université de Bordeaux
+ * Copyright (C) 2014, 2016 Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdarg.h>
+#include <mpi.h>
+
+#include <starpu.h>
+#include <starpu_data.h>
+#include <common/utils.h>
+#include <util/starpu_task_insert_utils.h>
+#include <datawizard/coherency.h>
+#include <core/task.h>
+
+#include <starpu_mpi_private.h>
+#include <starpu_mpi_cache.h>
+#include <starpu_mpi_select_node.h>
+
+#define _SEND_DATA(data, mode, dest, data_tag, comm, callback, arg)     \
+	if (mode & STARPU_SSEND)					\
+		starpu_mpi_issend_detached(data, dest, data_tag, comm, callback, arg); \
+	else								\
+		starpu_mpi_isend_detached(data, dest, data_tag, comm, callback, arg);
+
+static void (*pre_submit_hook)(struct starpu_task *task) = NULL;
+
+/* Install 'f' as the hook invoked on each task before submission; warns
+ * when it silently replaces a previously installed hook.  Returns 0. */
+int starpu_mpi_pre_submit_hook_register(void (*f)(struct starpu_task *))
+{
+	if (pre_submit_hook != NULL)
+		_STARPU_MSG("Warning: a pre_submit_hook has already been registered. Please check if you really want to erase the previously registered hook.\n");
+	pre_submit_hook = f;
+	return 0;
+}
+
+/* Drop the currently installed pre-submit hook, if any.  Returns 0. */
+int starpu_mpi_pre_submit_hook_unregister()
+{
+	pre_submit_hook = NULL;
+	return 0;
+}
+
+/* Inspect one (data, mode) pair to help choose the node that will run the
+ * task: a STARPU_W handle votes for its owner rank.  Updates *xrank (the
+ * candidate, -1 while unset), *do_execute (whether 'me' is the candidate)
+ * and *inconsistent_execute (set when two W handles disagree).  Returns
+ * 0, or -EINVAL when a W handle is NULL. */
+int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int *do_execute, int *inconsistent_execute, int *xrank)
+{
+	if (mode & STARPU_W)
+	{
+		if (!data)
+		{
+			/* We don't have anything allocated for this.
+			 * The application knows we won't do anything
+			 * about this task */
+			/* Yes, the app could actually not call
+			 * task_insert at all itself, this is just a
+			 * safeguard. */
+			_STARPU_MPI_DEBUG(3, "oh oh\n");
+			_STARPU_MPI_LOG_OUT();
+			return -EINVAL;
+		}
+
+		int mpi_rank = starpu_mpi_data_get_rank(data);
+		if (mpi_rank == -1)
+		{
+			_STARPU_ERROR("Data %p with mode STARPU_W needs to have a valid rank", data);
+		}
+
+		if (*xrank == -1)
+		{
+			// No node has been selected yet
+			*xrank = mpi_rank;
+			_STARPU_MPI_DEBUG(100, "Codelet is going to be executed by node %d\n", *xrank);
+			*do_execute = (mpi_rank == me);
+		}
+		else if (mpi_rank != *xrank)
+		{
+			_STARPU_MPI_DEBUG(100, "Another node %d had already been selected to execute the codelet\n", *xrank);
+			*inconsistent_execute = 1;
+		}
+	}
+	_STARPU_MPI_DEBUG(100, "Executing: inconsistent=%d, do_execute=%d, xrank=%d\n", *inconsistent_execute, *do_execute, *xrank);
+	return 0;
+}
+
+/* Before the task runs: make sure the executing node (xrank) holds every
+ * STARPU_R input.  The executee posts a detached receive for data it does
+ * not own; the owner posts a detached send toward the executee.  The
+ * communication cache suppresses transfers already performed. */
+void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int xrank, int do_execute, MPI_Comm comm)
+{
+	if (data && mode & STARPU_R)
+	{
+		int mpi_rank = starpu_mpi_data_get_rank(data);
+		int data_tag = starpu_mpi_data_get_tag(data);
+		if (mpi_rank == -1)
+		{
+			_STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register\n");
+		}
+
+		if (do_execute && mpi_rank != me)
+		{
+			/* The node is going to execute the codelet, but it does not own the data, it needs to receive the data from the owner node */
+			int already_received = _starpu_mpi_cache_received_data_set(data);
+			if (already_received == 0)
+			{
+				if (data_tag == -1)
+					_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
+				_STARPU_MPI_DEBUG(1, "Receiving data %p from %d\n", data, mpi_rank);
+				starpu_mpi_irecv_detached(data, mpi_rank, data_tag, comm, NULL, NULL);
+			}
+			// else the node has already received the data
+		}
+
+		if (!do_execute && mpi_rank == me)
+		{
+			/* The node owns the data, but another node is going to execute the codelet, the node needs to send the data to the executee node. */
+			int already_sent = _starpu_mpi_cache_sent_data_set(data, xrank);
+			if (already_sent == 0)
+			{
+				if (data_tag == -1)
+					_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
+				_STARPU_MPI_DEBUG(1, "Sending data %p to %d\n", data, xrank);
+				_SEND_DATA(data, mode, xrank, data_tag, comm, NULL, NULL);
+			}
+			// Else the data has already been sent
+		}
+	}
+}
+
+/* After the task ran: return every STARPU_W output to its owner.  The
+ * owner posts a detached receive from the executee; the executee posts a
+ * detached send back to the owner.  No-op when the owner executed. */
+static
+void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int xrank, int do_execute, MPI_Comm comm)
+{
+	if (mode & STARPU_W)
+	{
+		int mpi_rank = starpu_mpi_data_get_rank(data);
+		int data_tag = starpu_mpi_data_get_tag(data);
+		if(mpi_rank == -1)
+		{
+			_STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register\n");
+		}
+		if (mpi_rank == me)
+		{
+			if (xrank != -1 && me != xrank)
+			{
+				_STARPU_MPI_DEBUG(1, "Receive data %p back from the task %d which executed the codelet ...\n", data, xrank);
+				if(data_tag == -1)
+					_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
+				starpu_mpi_irecv_detached(data, xrank, data_tag, comm, NULL, NULL);
+			}
+		}
+		else if (do_execute)
+		{
+			if(data_tag == -1)
+				_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
+			_STARPU_MPI_DEBUG(1, "Send data %p back to its owner %d...\n", data, mpi_rank);
+			_SEND_DATA(data, mode, mpi_rank, data_tag, comm, NULL, NULL);
+		}
+	}
+}
+
+/* Post-task cleanup: with the cache enabled, written/reduced data must be
+ * evicted from the send/receive caches; with the cache disabled, a
+ * temporary replica received for reading is invalidated locally. */
+static
+void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int do_execute)
+{
+	if (_starpu_cache_enabled)
+	{
+		if (mode & STARPU_W || mode & STARPU_REDUX)
+		{
+			/* The data has been modified, it MUST be removed from the cache */
+			_starpu_mpi_cache_sent_data_clear(data);
+			_starpu_mpi_cache_received_data_clear(data);
+		}
+	}
+	else
+	{
+		/* We allocated a temporary buffer for the received data, now drop it */
+		if ((mode & STARPU_R) && do_execute)
+		{
+			int mpi_rank = starpu_mpi_data_get_rank(data);
+			if (mpi_rank != me && mpi_rank != -1)
+			{
+				starpu_data_invalidate_submit(data);
+			}
+		}
+	}
+}
+
+/* Walk the starpu_mpi_task_insert() varargs once to (a) collect every data
+ * handle with its access mode into *descrs_p/*nb_data_p and (b) decide the
+ * executing node (*xrank) and whether 'me' runs the task (*do_execute),
+ * honouring STARPU_EXECUTE_ON_NODE/DATA and falling back to the selection
+ * policy when W owners disagree.  Returns 0, or -EINVAL for a NULL W
+ * handle.  The caller owns (frees) *descrs_p on success.
+ * Fix: the STARPU_CL_ARGS branch consumed its two arguments from
+ * 'varg_list' instead of 'varg_list_copy', desynchronizing the two va_list
+ * cursors and corrupting the decoding of every subsequent argument. */
+static
+int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nodes, int *xrank, int *do_execute, struct starpu_data_descr **descrs_p, int *nb_data_p, va_list varg_list)
+{
+	va_list varg_list_copy;
+	int inconsistent_execute = 0;
+	int arg_type;
+	int node_selected = 0;
+	int nb_allocated_data = 16;
+	struct starpu_data_descr *descrs;
+	int nb_data;
+	int select_node_policy = STARPU_MPI_NODE_SELECTION_CURRENT_POLICY;
+
+	_STARPU_TRACE_TASK_MPI_DECODE_START();
+
+	_STARPU_MPI_MALLOC(descrs, nb_allocated_data * sizeof(struct starpu_data_descr));
+	nb_data = 0;
+	*do_execute = -1;
+	*xrank = -1;
+
+	va_copy(varg_list_copy, varg_list);
+	while ((arg_type = va_arg(varg_list_copy, int)) != 0)
+	{
+		int arg_type_nocommute = arg_type & ~STARPU_COMMUTE;
+		if (arg_type==STARPU_EXECUTE_ON_NODE)
+		{
+			*xrank = va_arg(varg_list_copy, int);
+			if (node_selected == 0)
+			{
+				_STARPU_MPI_DEBUG(100, "Executing on node %d\n", *xrank);
+				*do_execute = 1;
+				node_selected = 1;
+				inconsistent_execute = 0;
+			}
+		}
+		else if (arg_type==STARPU_EXECUTE_ON_DATA)
+		{
+			starpu_data_handle_t data = va_arg(varg_list_copy, starpu_data_handle_t);
+			if (node_selected == 0)
+			{
+				*xrank = starpu_mpi_data_get_rank(data);
+				STARPU_ASSERT_MSG(*xrank != -1, "Rank of the data must be set using starpu_mpi_data_register() or starpu_data_set_rank()");
+				_STARPU_MPI_DEBUG(100, "Executing on data node %d\n", *xrank);
+				STARPU_ASSERT_MSG(*xrank <= nb_nodes, "Node %d to execute codelet is not a valid node (%d)", *xrank, nb_nodes);
+				*do_execute = 1;
+				node_selected = 1;
+				inconsistent_execute = 0;
+			}
+		}
+		else if (arg_type_nocommute & STARPU_R || arg_type_nocommute & STARPU_W || arg_type_nocommute & STARPU_RW || arg_type & STARPU_SCRATCH || arg_type & STARPU_REDUX)
+		{
+			starpu_data_handle_t data = va_arg(varg_list_copy, starpu_data_handle_t);
+			enum starpu_data_access_mode mode = (enum starpu_data_access_mode) arg_type;
+			if (node_selected == 0)
+			{
+				int ret = _starpu_mpi_find_executee_node(data, mode, me, do_execute, &inconsistent_execute, xrank);
+				if (ret == -EINVAL)
+				{
+					free(descrs);
+					va_end(varg_list_copy);
+					_STARPU_TRACE_TASK_MPI_DECODE_END();
+					return ret;
+				}
+			}
+			if (nb_data >= nb_allocated_data)
+			{
+				nb_allocated_data *= 2;
+				_STARPU_MPI_REALLOC(descrs, nb_allocated_data * sizeof(struct starpu_data_descr));
+			}
+			descrs[nb_data].handle = data;
+			descrs[nb_data].mode = mode;
+			nb_data ++;
+		}
+		else if (arg_type == STARPU_DATA_ARRAY)
+		{
+			starpu_data_handle_t *datas = va_arg(varg_list_copy, starpu_data_handle_t *);
+			int nb_handles = va_arg(varg_list_copy, int);
+			int i;
+
+			for(i=0 ; i<nb_handles ; i++)
+			{
+				STARPU_ASSERT_MSG(codelet->nbuffers == STARPU_VARIABLE_NBUFFERS || nb_data < codelet->nbuffers, "Too many data passed to starpu_mpi_task_insert");
+				enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(codelet, nb_data);
+				if (node_selected == 0)
+				{
+					int ret = _starpu_mpi_find_executee_node(datas[i], mode, me, do_execute, &inconsistent_execute, xrank);
+					if (ret == -EINVAL)
+					{
+						free(descrs);
+						va_end(varg_list_copy);
+						_STARPU_TRACE_TASK_MPI_DECODE_END();
+						return ret;
+					}
+				}
+				if (nb_data >= nb_allocated_data)
+				{
+					nb_allocated_data *= 2;
+					_STARPU_MPI_REALLOC(descrs, nb_allocated_data * sizeof(struct starpu_data_descr));
+				}
+				descrs[nb_data].handle = datas[i];
+				descrs[nb_data].mode = mode;
+				nb_data ++;
+			}
+		}
+		else if (arg_type == STARPU_DATA_MODE_ARRAY)
+		{
+			struct starpu_data_descr *_descrs = va_arg(varg_list_copy, struct starpu_data_descr*);
+			int nb_handles = va_arg(varg_list_copy, int);
+			int i;
+
+			for(i=0 ; i<nb_handles ; i++)
+			{
+				enum starpu_data_access_mode mode = _descrs[i].mode;
+				if (node_selected == 0)
+				{
+					int ret = _starpu_mpi_find_executee_node(_descrs[i].handle, mode, me, do_execute, &inconsistent_execute, xrank);
+					if (ret == -EINVAL)
+					{
+						free(descrs);
+						va_end(varg_list_copy);
+						_STARPU_TRACE_TASK_MPI_DECODE_END();
+						return ret;
+					}
+				}
+				if (nb_data >= nb_allocated_data)
+				{
+					nb_allocated_data *= 2;
+					_STARPU_MPI_REALLOC(descrs, nb_allocated_data * sizeof(struct starpu_data_descr));
+				}
+				descrs[nb_data].handle = _descrs[i].handle;
+				descrs[nb_data].mode = mode;
+				nb_data ++;
+			}
+		}
+		else if (arg_type==STARPU_VALUE)
+		{
+			(void)va_arg(varg_list_copy, void *);
+			(void)va_arg(varg_list_copy, size_t);
+		}
+		else if (arg_type==STARPU_CL_ARGS)
+		{
+			/* Fixed: consume from varg_list_copy like every other
+			 * branch; reading varg_list here desynchronized the
+			 * two cursors. */
+			(void)va_arg(varg_list_copy, void *);
+			(void)va_arg(varg_list_copy, size_t);
+		}
+		else if (arg_type==STARPU_CALLBACK)
+		{
+			(void)va_arg(varg_list_copy, _starpu_callback_func_t);
+		}
+		else if (arg_type==STARPU_CALLBACK_WITH_ARG)
+		{
+			(void)va_arg(varg_list_copy, _starpu_callback_func_t);
+			(void)va_arg(varg_list_copy, void *);
+		}
+		else if (arg_type==STARPU_CALLBACK_ARG)
+		{
+			(void)va_arg(varg_list_copy, void *);
+		}
+		else if (arg_type==STARPU_PRIORITY)
+		{
+			(void)va_arg(varg_list_copy, int);
+		}
+		/* STARPU_EXECUTE_ON_NODE handled above */
+		/* STARPU_EXECUTE_ON_DATA handled above */
+		/* STARPU_DATA_ARRAY handled above */
+		/* STARPU_DATA_MODE_ARRAY handled above */
+		else if (arg_type==STARPU_TAG)
+		{
+			(void)va_arg(varg_list_copy, starpu_tag_t);
+		}
+		else if (arg_type==STARPU_HYPERVISOR_TAG)
+		{
+			(void)va_arg(varg_list_copy, int);
+		}
+		else if (arg_type==STARPU_FLOPS)
+		{
+			(void)va_arg(varg_list_copy, double);
+		}
+		else if (arg_type==STARPU_SCHED_CTX)
+		{
+			(void)va_arg(varg_list_copy, unsigned);
+		}
+		else if (arg_type==STARPU_PROLOGUE_CALLBACK)
+		{
+			(void)va_arg(varg_list_copy, _starpu_callback_func_t);
+		}
+		else if (arg_type==STARPU_PROLOGUE_CALLBACK_ARG)
+		{
+			(void)va_arg(varg_list_copy, void *);
+		}
+		else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP)
+		{
+			(void)va_arg(varg_list_copy, _starpu_callback_func_t);
+		}
+		else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP_ARG)
+		{
+			(void)va_arg(varg_list_copy, void *);
+		}
+		else if (arg_type==STARPU_EXECUTE_ON_WORKER)
+		{
+			// the flag is decoded and set later when
+			// calling function _starpu_task_insert_create()
+			(void)va_arg(varg_list_copy, int);
+		}
+		else if (arg_type==STARPU_TAG_ONLY)
+		{
+			(void)va_arg(varg_list_copy, starpu_tag_t);
+		}
+		else if (arg_type==STARPU_NAME)
+		{
+			(void)va_arg(varg_list_copy, const char *);
+		}
+		else if (arg_type==STARPU_POSSIBLY_PARALLEL)
+		{
+			(void)va_arg(varg_list_copy, unsigned);
+		}
+		else if (arg_type==STARPU_WORKER_ORDER)
+		{
+			// the flag is decoded and set later when
+			// calling function _starpu_task_insert_create()
+			(void)va_arg(varg_list_copy, unsigned);
+		}
+		else if (arg_type==STARPU_NODE_SELECTION_POLICY)
+		{
+			select_node_policy = va_arg(varg_list_copy, int);
+		}
+		else
+		{
+			STARPU_ABORT_MSG("Unrecognized argument %d, did you perhaps forget to end arguments with 0?\n", arg_type);
+		}
+
+	}
+	va_end(varg_list_copy);
+
+	if (inconsistent_execute == 1 || *xrank == -1)
+	{
+		// We need to find out which node is going to execute the codelet.
+		_STARPU_MPI_DISP("Different nodes are owning W data. The node to execute the codelet is going to be selected with the current selection node policy. See starpu_mpi_node_selection_set_current_policy() to change the policy, or use STARPU_EXECUTE_ON_NODE or STARPU_EXECUTE_ON_DATA to specify the node\n");
+		*xrank = _starpu_mpi_select_node(me, nb_nodes, descrs, nb_data, select_node_policy);
+		*do_execute = (me == *xrank);
+	}
+	else
+	{
+		_STARPU_MPI_DEBUG(100, "Inconsistent=%d - xrank=%d\n", inconsistent_execute, *xrank);
+		*do_execute = (me == *xrank);
+	}
+	_STARPU_MPI_DEBUG(100, "do_execute=%d\n", *do_execute);
+
+	*descrs_p = descrs;
+	*nb_data_p = nb_data;
+
+	_STARPU_TRACE_TASK_MPI_DECODE_END();
+	return 0;
+}
+
+/* Decode the varargs describing the task, perform the MPI transfers
+ * needed before execution, and, when the local node is the selected
+ * executing node, create (without submitting) the StarPU task.
+ *
+ * Returns <0 on decoding error, 1 when this node does not execute the
+ * task (*task is left untouched), 0 when *task has been created and is
+ * ready for submission. */
+static
+int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, struct starpu_task **task, int *xrank_p, struct starpu_data_descr **descrs_p, int *nb_data_p, va_list varg_list)
+{
+	int me, do_execute, xrank, nb_nodes;
+	int ret;
+	int i;
+	struct starpu_data_descr *descrs;
+	int nb_data;
+
+	_STARPU_MPI_LOG_IN();
+
+	starpu_mpi_comm_rank(comm, &me);
+	starpu_mpi_comm_size(comm, &nb_nodes);
+
+	/* Find out whether we are to execute the data because we own the data to be written to. */
+	ret = _starpu_mpi_task_decode_v(codelet, me, nb_nodes, &xrank, &do_execute, &descrs, &nb_data, varg_list);
+	if (ret < 0) return ret;
+
+	_STARPU_TRACE_TASK_MPI_PRE_START();
+	/* Send and receive data as requested */
+	for(i=0 ; i<nb_data ; i++)
+	{
+		_starpu_mpi_exchange_data_before_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, comm);
+	}
+
+	/* The descriptor array was allocated by the decode step: hand it to
+	 * the caller when requested, otherwise release it here. */
+	if (xrank_p) *xrank_p = xrank;
+	if (nb_data_p) *nb_data_p = nb_data;
+	if (descrs_p)
+		*descrs_p = descrs;
+	else
+		free(descrs);
+	_STARPU_TRACE_TASK_MPI_PRE_END();
+
+	if (do_execute == 0) return 1;
+	else
+	{
+		va_list varg_list_copy;
+		_STARPU_MPI_DEBUG(100, "Execution of the codelet %p (%s)\n", codelet, codelet?codelet->name:NULL);
+
+		*task = starpu_task_create();
+		/* cl_arg is built by _starpu_task_insert_create and must be
+		 * freed by StarPU together with the task */
+		(*task)->cl_arg_free = 1;
+
+		/* varg_list has already been consumed by the decode step, so
+		 * work on a fresh copy to fill in the task */
+		va_copy(varg_list_copy, varg_list);
+		_starpu_task_insert_create(codelet, task, varg_list_copy);
+		va_end(varg_list_copy);
+
+		return 0;
+	}
+}
+
+/* Post-execution step shared by task_insert and task_post_build: send
+ * written data back to their owners, invalidate/flush local copies as
+ * required, and release the descriptor array allocated while decoding
+ * the argument list. Always returns 0. */
+int _starpu_mpi_task_postbuild_v(MPI_Comm comm, int xrank, int do_execute, struct starpu_data_descr *descrs, int nb_data)
+{
+	int me, i;
+
+	_STARPU_TRACE_TASK_MPI_POST_START();
+	starpu_mpi_comm_rank(comm, &me);
+
+	for(i=0 ; i<nb_data ; i++)
+	{
+		_starpu_mpi_exchange_data_after_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, comm);
+		_starpu_mpi_clear_data_after_execution(descrs[i].handle, descrs[i].mode, me, do_execute);
+	}
+
+	/* descrs was allocated by _starpu_mpi_task_decode_v; ownership ends here */
+	free(descrs);
+
+	_STARPU_TRACE_TASK_MPI_POST_END();
+	_STARPU_MPI_LOG_OUT();
+	return 0;
+}
+
+/* Common implementation behind starpu_mpi_task_insert() and the
+ * deprecated starpu_mpi_insert_task(): build the task, submit it when
+ * this node is the executing one, then run the post-build data
+ * exchanges. Returns the post-build status, or a negative value when
+ * decoding the arguments failed. */
+static
+int _starpu_mpi_task_insert_v(MPI_Comm comm, struct starpu_codelet *codelet, va_list varg_list)
+{
+	struct starpu_task *task;
+	int ret;
+	int xrank;
+	int do_execute = 0;
+	struct starpu_data_descr *descrs;
+	int nb_data;
+
+	ret = _starpu_mpi_task_build_v(comm, codelet, &task, &xrank, &descrs, &nb_data, varg_list);
+	if (ret < 0) return ret;
+
+	/* ret == 0 means the local node executes the task */
+	if (ret == 0)
+	{
+		do_execute = 1;
+		ret = starpu_task_submit(task);
+
+		if (STARPU_UNLIKELY(ret == -ENODEV))
+		{
+			/* No worker can execute this codelet: report it and
+			 * reclaim the task ourselves */
+			_STARPU_MSG("submission of task %p with codelet %p failed (symbol `%s') (err: ENODEV)\n",
+				    task, task->cl,
+				    (codelet == NULL) ? "none" :
+				    task->cl->name ? task->cl->name :
+				    (task->cl->model && task->cl->model->symbol)?task->cl->model->symbol:"none");
+
+			task->destroy = 0;
+			starpu_task_destroy(task);
+		}
+	}
+
+	int val = _starpu_mpi_task_postbuild_v(comm, xrank, do_execute, descrs, nb_data);
+
+	if (ret == 0 && pre_submit_hook)
+		pre_submit_hook(task);
+
+	return val;
+}
+
+/* Public entry point: insert a task whose data may live on remote MPI
+ * nodes; the actual work is done in _starpu_mpi_task_insert_v(). */
+int starpu_mpi_task_insert(MPI_Comm comm, struct starpu_codelet *codelet, ...)
+{
+	int ret;
+	va_list args;
+
+	va_start(args, codelet);
+	ret = _starpu_mpi_task_insert_v(comm, codelet, args);
+	va_end(args);
+
+	return ret;
+}
+
+/* Old-style name for starpu_mpi_task_insert(); behaves identically.
+ * NOTE(review): presumably kept for backward compatibility with
+ * earlier StarPU-MPI releases -- confirm against the public headers. */
+int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
+{
+	int ret;
+	va_list args;
+
+	va_start(args, codelet);
+	ret = _starpu_mpi_task_insert_v(comm, codelet, args);
+	va_end(args);
+
+	return ret;
+}
+
+/* Build (without submitting) the task described by the argument list.
+ * Returns NULL when the calling node is not the executing node. */
+struct starpu_task *starpu_mpi_task_build(MPI_Comm comm, struct starpu_codelet *codelet, ...)
+{
+	struct starpu_task *task;
+	int status;
+	va_list args;
+
+	va_start(args, codelet);
+	status = _starpu_mpi_task_build_v(comm, codelet, &task, NULL, NULL, NULL, args);
+	va_end(args);
+
+	STARPU_ASSERT(status >= 0);
+	return (status > 0) ? NULL : task;
+}
+
+/* Companion of starpu_mpi_task_build(): re-decode the same argument
+ * list to recover the data descriptors and the executing node, then run
+ * the post-execution data exchanges. Must be called with the same
+ * arguments as the matching starpu_mpi_task_build() call. */
+int starpu_mpi_task_post_build(MPI_Comm comm, struct starpu_codelet *codelet, ...)
+{
+	int xrank, do_execute;
+	int ret, me, nb_nodes;
+	va_list varg_list;
+	struct starpu_data_descr *descrs;
+	int nb_data;
+
+	starpu_mpi_comm_rank(comm, &me);
+	starpu_mpi_comm_size(comm, &nb_nodes);
+
+	va_start(varg_list, codelet);
+	/* Find out whether we are to execute the data because we own the data to be written to. */
+	ret = _starpu_mpi_task_decode_v(codelet, me, nb_nodes, &xrank, &do_execute, &descrs, &nb_data, varg_list);
+	va_end(varg_list);
+	if (ret < 0) return ret;
+
+	return _starpu_mpi_task_postbuild_v(comm, xrank, do_execute, descrs, nb_data);
+}
+
+/* Per-contributor state threaded through the reduction callbacks of
+ * starpu_mpi_redux_data(). */
+struct _starpu_mpi_redux_data_args
+{
+	starpu_data_handle_t data_handle;	/* data being reduced */
+	starpu_data_handle_t new_handle;	/* temporary handle receiving the remote contribution */
+	int tag;				/* MPI tag of data_handle */
+	int node;				/* rank of the contributing node */
+	MPI_Comm comm;
+	struct starpu_task *taskB;		/* reduction task, submitted once new_handle is received */
+};
+
+/* Intentionally empty kernel: used by the codelets below, which only
+ * exist to create dependencies on a data handle. */
+void _starpu_mpi_redux_data_dummy_func(STARPU_ATTRIBUTE_UNUSED void *buffers[], STARPU_ATTRIBUTE_UNUSED void *cl_arg)
+{
+}
+
+/* Dummy cost function for simgrid */
+static double cost_function(struct starpu_task *task STARPU_ATTRIBUTE_UNUSED, unsigned nimpl STARPU_ATTRIBUTE_UNUSED)
+{
+	/* small non-zero constant so simulated execution takes some time */
+	return 0.000001;
+}
+/* Performance model shared by the dummy codelets below */
+static struct starpu_perfmodel dumb_model =
+{
+	.type		= STARPU_COMMON,
+	.cost_function	= cost_function
+};
+
+/* Codelet reading a handle: submitted as "task A" in
+ * starpu_mpi_redux_data() so its callback runs only after the handle's
+ * last read access. */
+static
+struct starpu_codelet _starpu_mpi_redux_data_read_cl =
+{
+	.cpu_funcs = {_starpu_mpi_redux_data_dummy_func},
+	.cuda_funcs = {_starpu_mpi_redux_data_dummy_func},
+	.opencl_funcs = {_starpu_mpi_redux_data_dummy_func},
+	.nbuffers = 1,
+	.modes = {STARPU_R},
+	.model = &dumb_model,
+	.name = "_starpu_mpi_redux_data_read_cl"
+};
+
+/* Codelet reading and writing a handle: submitted as "task C" in
+ * starpu_mpi_redux_data() to replug the reduction into the implicit
+ * dependency chain of the handle. */
+struct starpu_codelet _starpu_mpi_redux_data_readwrite_cl =
+{
+	.cpu_funcs = {_starpu_mpi_redux_data_dummy_func},
+	.cuda_funcs = {_starpu_mpi_redux_data_dummy_func},
+	.opencl_funcs = {_starpu_mpi_redux_data_dummy_func},
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
+	.model = &dumb_model,
+	/* fixed: debug name previously said "write_cl", inconsistent with
+	 * the variable name */
+	.name = "_starpu_mpi_redux_data_readwrite_cl"
+};
+
+/* Called once the remote contribution has landed in new_handle: plug
+ * it into the reduction task B, submit B, and schedule the temporary
+ * handle for unregistration. Frees the args structure. */
+static
+void _starpu_mpi_redux_data_detached_callback(void *arg)
+{
+	struct _starpu_mpi_redux_data_args *args = (struct _starpu_mpi_redux_data_args *) arg;
+
+	/* buffer 1 of taskB is the freshly received contribution */
+	STARPU_TASK_SET_HANDLE(args->taskB, args->new_handle, 1);
+	int ret = starpu_task_submit(args->taskB);
+	STARPU_ASSERT(ret == 0);
+
+	starpu_data_unregister_submit(args->new_handle);
+	free(args);
+}
+
+/* Callback of "task A": allocate a temporary handle shaped like the
+ * reduced data and post a detached receive (with sequential consistency
+ * disabled) for the remote contribution. */
+static
+void _starpu_mpi_redux_data_recv_callback(void *callback_arg)
+{
+	struct _starpu_mpi_redux_data_args *args = (struct _starpu_mpi_redux_data_args *) callback_arg;
+	starpu_data_register_same(&args->new_handle, args->data_handle);
+
+	starpu_mpi_irecv_detached_sequential_consistency(args->new_handle, args->node, args->tag, args->comm, _starpu_mpi_redux_data_detached_callback, args, 0);
+}
+
+/* TODO: this should rather be implicitly called by starpu_mpi_task_insert when
+ * a data previously accessed in REDUX mode gets accessed in R mode. */
+/* Reduce the per-node contributions of data_handle onto its owner:
+ * the owner receives one contribution per other node and combines them
+ * with the handle's redux codelet; every other node sends its
+ * contribution and reinitializes its local copy with the init codelet.
+ * Requires the handle's rank and tag to have been set with
+ * starpu_mpi_data_register(). Blocks until all submitted work is done
+ * (see FIXME at the end). */
+void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
+{
+	int me, rank, tag, nb_nodes;
+
+	rank = starpu_mpi_data_get_rank(data_handle);
+	tag = starpu_mpi_data_get_tag(data_handle);
+	if (rank == -1)
+	{
+		_STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register\n");
+	}
+	if (tag == -1)
+	{
+		_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
+	}
+
+	starpu_mpi_comm_rank(comm, &me);
+	starpu_mpi_comm_size(comm, &nb_nodes);
+
+	_STARPU_MPI_DEBUG(1, "Doing reduction for data %p on node %d with %d nodes ...\n", data_handle, rank, nb_nodes);
+
+	// need to count how many nodes have the data in redux mode
+	if (me == rank)
+	{
+		/* Owner side: collect one contribution from every other node */
+		int i, j=0;
+		struct starpu_task *taskBs[nb_nodes];
+
+		for(i=0 ; i<nb_nodes ; i++)
+		{
+			if (i != rank)
+			{
+				/* We need to make sure all is
+				 * executed after data_handle finished
+				 * its last read access, we hence do
+				 * the following:
+				 * - submit an empty task A reading
+				 * data_handle whose callback submits
+				 * the mpi comm with sequential
+				 * consistency set to 0, whose
+				 * callback submits the redux_cl task
+				 * B with sequential consistency set
+				 * to 0,
+				 * - submit an empty task C reading
+				 * and writing data_handle and
+				 * depending on task B, just to replug
+				 * with implicit data dependencies
+				 * with tasks inserted after this
+				 * reduction.
+				 */
+
+				struct _starpu_mpi_redux_data_args *args;
+				_STARPU_MPI_MALLOC(args, sizeof(struct _starpu_mpi_redux_data_args));
+				args->data_handle = data_handle;
+				args->tag = tag;
+				args->node = i;
+				args->comm = comm;
+
+				// We need to create taskB early as
+				// taskC declares a dependancy on it
+				args->taskB = starpu_task_create();
+				args->taskB->cl = args->data_handle->redux_cl;
+				args->taskB->sequential_consistency = 0;
+				STARPU_TASK_SET_HANDLE(args->taskB, args->data_handle, 0);
+				taskBs[j] = args->taskB; j++;
+
+				// Submit taskA
+				starpu_task_insert(&_starpu_mpi_redux_data_read_cl,
+						   STARPU_R, data_handle,
+						   STARPU_CALLBACK_WITH_ARG, _starpu_mpi_redux_data_recv_callback, args,
+						   0);
+			}
+		}
+
+		// Submit taskC which depends on all taskBs created
+		struct starpu_task *taskC = starpu_task_create();
+		taskC->cl = &_starpu_mpi_redux_data_readwrite_cl;
+		STARPU_TASK_SET_HANDLE(taskC, data_handle, 0);
+		starpu_task_declare_deps_array(taskC, j, taskBs);
+		int ret = starpu_task_submit(taskC);
+		STARPU_ASSERT(ret == 0);
+	}
+	else
+	{
+		/* Contributor side: ship the local contribution to the owner,
+		 * then reset the local copy to the reduction's neutral element */
+		_STARPU_MPI_DEBUG(1, "Sending redux handle to %d ...\n", rank);
+		starpu_mpi_isend_detached(data_handle, rank, tag, comm, NULL, NULL);
+		starpu_task_insert(data_handle->init_cl, STARPU_W, data_handle, 0);
+	}
+	/* FIXME: In order to prevent simultaneous receive submissions
+	 * on the same handle, we need to wait that all the starpu_mpi
+	 * tasks are done before submitting next tasks. The current
+	 * version of the implementation does not support multiple
+	 * simultaneous receive requests on the same handle.*/
+	starpu_task_wait_for_all();
+
+}

+ 32 - 0
nmad/src/starpu_mpi_task_insert.h

@@ -0,0 +1,32 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2016, 2017  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_TASK_INSERT_H__
+#define __STARPU_MPI_TASK_INSERT_H__
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/* Decide which node executes a task touching 'data' in 'mode'; updates
+ * do_execute/inconsistent_execute/xrank accordingly */
+int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int *do_execute, int *inconsistent_execute, int *xrank);
+/* Transfer 'data' to/from the executing node before the task runs */
+void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int xrank, int do_execute, MPI_Comm comm);
+/* Post-execution exchanges and cleanup; frees the 'descrs' array */
+int _starpu_mpi_task_postbuild_v(MPI_Comm comm, int xrank, int do_execute, struct starpu_data_descr *descrs, int nb_data);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* __STARPU_MPI_TASK_INSERT_H__ */

+ 29 - 0
nmad/starpumpi-1.0.pc.in

@@ -0,0 +1,29 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2011, 2016  Université de Bordeaux
+# Copyright (C) 2010, 2011, 2012  CNRS
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: starpumpi
+Description: offers MPI support for heterogeneous multicore architecture
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@
+Libs: -L${libdir} -lstarpumpi-@STARPU_EFFECTIVE_VERSION@
+Libs.private: @LDFLAGS@ @LIBS@ @STARPU_EXPORTED_LIBS@
+Requires: starpu-1.0
+Requires.private:

+ 29 - 0
nmad/starpumpi-1.1.pc.in

@@ -0,0 +1,29 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2011, 2013, 2016  Université de Bordeaux
+# Copyright (C) 2010, 2011, 2012  CNRS
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: starpumpi
+Description: offers MPI support for heterogeneous multicore architecture
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@
+Libs: -L${libdir} -lstarpumpi-@STARPU_EFFECTIVE_VERSION@
+Libs.private: @LDFLAGS@ @LIBS@ @STARPU_EXPORTED_LIBS@
+Requires: starpu-1.1
+Requires.private:

+ 29 - 0
nmad/starpumpi-1.2.pc.in

@@ -0,0 +1,29 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2011, 2013, 2016  Université de Bordeaux
+# Copyright (C) 2010, 2011, 2012  CNRS
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: starpumpi
+Description: offers MPI support for heterogeneous multicore architecture
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@
+Libs: -L${libdir} -lstarpumpi-@STARPU_EFFECTIVE_VERSION@
+Libs.private: @LDFLAGS@ @LIBS@ @STARPU_EXPORTED_LIBS@
+Requires: starpu-1.2
+Requires.private:

+ 29 - 0
nmad/starpumpi-1.3.pc.in

@@ -0,0 +1,29 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2011, 2013, 2015-2016  Université de Bordeaux
+# Copyright (C) 2010, 2011, 2012  CNRS
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: starpumpi
+Description: offers MPI support for heterogeneous multicore architecture
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@
+Libs: -L${libdir} -lstarpumpi-@STARPU_EFFECTIVE_VERSION@
+Libs.private: @LDFLAGS@ @LIBS@ @STARPU_EXPORTED_LIBS@
+Requires: starpu-1.3
+Requires.private:

+ 1 - 0
nmad/tests/.gitignore

@@ -0,0 +1 @@
+/.deps

+ 246 - 0
nmad/tests/Makefile.am

@@ -0,0 +1,246 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2012, 2016  Université de Bordeaux
+# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+include $(top_srcdir)/starpu.mk
+
+CC=$(MPICC)
+CCLD=$(MPICC)
+
+if STARPU_HAVE_WINDOWS
+LOADER_BIN		=
+else
+loader_CPPFLAGS 	= 	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
+LOADER			=	loader
+# fixed: the loader is built in nmad/tests, not mpi/tests (leftover of
+# the port of the mpi/ test Makefile into nmad/)
+LOADER_BIN		=	$(abs_top_builddir)/nmad/tests/$(LOADER)
+loader_SOURCES		=	../../tests/loader.c
+endif
+
+if STARPU_QUICK_CHECK
+MPI			=	$(MPIEXEC) -np 2
+else
+MPI			=	$(MPIEXEC) -np 4
+endif
+
+if STARPU_HAVE_AM111
+TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
+LOG_COMPILER	 	=	$(MPI) $(LOADER_BIN)
+else
+TESTS_ENVIRONMENT 	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
+endif
+
+if !STARPU_SIMGRID
+if STARPU_MPI_CHECK
+TESTS			=	$(starpu_mpi_TESTS)
+endif
+endif
+
+check_PROGRAMS = $(LOADER) $(starpu_mpi_TESTS)
+
+BUILT_SOURCES =
+
+CLEANFILES = *.gcno *.gcda *.linkinfo
+
+EXTRA_DIST = 					\
+	user_defined_datatype_value.h
+
+examplebindir = $(libdir)/starpu/examples/mpi
+
+examplebin_PROGRAMS =
+
+if STARPU_USE_CUDA
+NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -I$(top_builddir)/include/ $(HWLOC_CFLAGS)
+
+.cu.cubin:
+	$(MKDIR_P) `dirname $@`
+	$(NVCC) -cubin $< -o $@ $(NVCCFLAGS)
+
+.cu.o:
+	$(NVCC) $< -c -o $@ $(NVCCFLAGS)
+endif
+
+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
+LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
+# fixed: point at the nmad/ copy of the headers/sources rather than the
+# original mpi/ tree, so the tests build against the library they link
+# (../src/libstarpumpi from nmad/src)
+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include -I$(top_srcdir)/nmad/include -I$(top_srcdir)/nmad/src -I$(top_srcdir)/src -I$(top_builddir)/src -I$(top_srcdir)/examples/
+AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS)
+
+########################
+# Unit testcases       #
+########################
+
+starpu_mpi_TESTS =				\
+	datatypes				\
+	pingpong				\
+	mpi_test				\
+	mpi_isend				\
+	mpi_irecv				\
+	mpi_isend_detached			\
+	mpi_irecv_detached			\
+	mpi_detached_tag			\
+	mpi_redux				\
+	ring					\
+	ring_sync				\
+	ring_sync_detached			\
+	ring_async				\
+	ring_async_implicit			\
+	block_interface				\
+	block_interface_pinned			\
+	cache					\
+	cache_disable				\
+	matrix					\
+	matrix2					\
+	insert_task				\
+	insert_task_cache			\
+	insert_task_compute			\
+	insert_task_sent_cache			\
+	insert_task_recv_cache			\
+	insert_task_block			\
+	insert_task_owner			\
+	insert_task_owner2			\
+	insert_task_owner_data			\
+	insert_task_count			\
+	multiple_send				\
+	mpi_scatter_gather			\
+	mpi_reduction				\
+	user_defined_datatype			\
+	comm
+
+noinst_PROGRAMS =				\
+	datatypes				\
+	pingpong				\
+	mpi_test				\
+	mpi_isend				\
+	mpi_irecv				\
+	mpi_isend_detached			\
+	mpi_irecv_detached			\
+	mpi_detached_tag			\
+	mpi_redux				\
+	ring					\
+	ring_sync				\
+	ring_sync_detached			\
+	ring_async				\
+	ring_async_implicit			\
+	block_interface				\
+	block_interface_pinned			\
+	cache					\
+	cache_disable				\
+	matrix					\
+	matrix2					\
+	insert_task				\
+	insert_task_cache			\
+	insert_task_compute			\
+	insert_task_sent_cache			\
+	insert_task_recv_cache			\
+	insert_task_block			\
+	insert_task_owner			\
+	insert_task_owner2			\
+	insert_task_owner_data			\
+	insert_task_count			\
+	multiple_send				\
+	mpi_scatter_gather			\
+	mpi_reduction				\
+	user_defined_datatype			\
+	comm
+
+mpi_isend_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+mpi_irecv_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+mpi_isend_detached_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+mpi_irecv_detached_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+mpi_detached_tag_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+mpi_redux_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+datatypes_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+pingpong_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+mpi_test_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+ring_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+ring_sync_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+ring_sync_detached_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+ring_async_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+ring_async_implicit_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+block_interface_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+block_interface_pinned_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+cache_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+cache_disable_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+matrix_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+matrix2_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+insert_task_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+insert_task_cache_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+insert_task_compute_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+insert_task_sent_cache_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+insert_task_recv_cache_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+insert_task_block_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+insert_task_owner_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+insert_task_owner2_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+insert_task_owner_data_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+insert_task_count_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+multiple_send_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+mpi_scatter_gather_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+mpi_reduction_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+user_defined_datatype_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+comm_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+
+ring_SOURCES = ring.c
+ring_sync_SOURCES = ring_sync.c
+ring_sync_detached_SOURCES = ring_sync_detached.c
+ring_async_SOURCES = ring_async.c
+ring_async_implicit_SOURCES = ring_async_implicit.c
+insert_task_count_SOURCES = insert_task_count.c
+if STARPU_USE_CUDA
+ring_SOURCES += ring_kernel.cu
+ring_sync_SOURCES += ring_kernel.cu
+ring_sync_detached_SOURCES += ring_kernel.cu
+ring_async_SOURCES += ring_kernel.cu
+ring_async_implicit_SOURCES += ring_kernel.cu
+insert_task_count_SOURCES += ring_kernel.cu
+endif
+mpi_reduction_SOURCES = mpi_reduction.c
+mpi_reduction_SOURCES += mpi_reduction_kernels.c
+user_defined_datatype_SOURCES = user_defined_datatype.c
+user_defined_datatype_SOURCES += $(top_srcdir)/examples/interface/complex_interface.c
+

+ 146 - 0
nmad/tests/block_interface.c

@@ -0,0 +1,146 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2014  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <stdlib.h>
+#include "helper.h"
+
+#define NITER	2048
+
+#define BIGSIZE	128
+#define SIZE	64
+
+/* Test that StarPU-MPI correctly transfers a block interface between a
+ * padded registration (inner SIZE^3 cube of a BIGSIZE^3 allocation on
+ * node 0) and a contiguous one (node 1): node 0 sends its block, node 1
+ * checks/modifies it and sends it back, node 0 verifies the result. */
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size < 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need at least 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	/* Node 0 will allocate a big block and only register an inner part of
+	 * it as the block data, Node 1 will allocate a block of small size and
+	 * register it directly. Node 0 and 1 will then exchange the content of
+	 * their blocks. */
+
+	float *block = NULL;
+	starpu_data_handle_t block_handle;
+
+	if (rank == 0)
+	{
+		block = calloc(BIGSIZE*BIGSIZE*BIGSIZE, sizeof(float));
+		assert(block);
+
+		/* fill the inner block */
+		unsigned i, j, k;
+		for (k = 0; k < SIZE; k++)
+		for (j = 0; j < SIZE; j++)
+		for (i = 0; i < SIZE; i++)
+		{
+			block[i + j*BIGSIZE + k*BIGSIZE*BIGSIZE] = 1.0f;
+		}
+
+		/* register with ld=BIGSIZE strides: only the SIZE^3 corner is the data */
+		starpu_block_data_register(&block_handle, 0,
+			(uintptr_t)block, BIGSIZE, BIGSIZE*BIGSIZE,
+			SIZE, SIZE, SIZE, sizeof(float));
+	}
+	else if (rank == 1)
+	{
+		block = calloc(SIZE*SIZE*SIZE, sizeof(float));
+		assert(block);
+
+		/* contiguous registration: leading dimensions match the block size */
+		starpu_block_data_register(&block_handle, 0,
+			(uintptr_t)block, SIZE, SIZE*SIZE,
+			SIZE, SIZE, SIZE, sizeof(float));
+	}
+
+	/* ranks >= 2 take no part in the exchange below */
+	if (rank == 0)
+	{
+		ret = starpu_mpi_send(block_handle, 1, 0x42, MPI_COMM_WORLD);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_send");
+
+		MPI_Status status;
+		ret = starpu_mpi_recv(block_handle, 1, 0x1337, MPI_COMM_WORLD, &status);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_recv");
+
+		/* check the content of the block */
+		ret = starpu_data_acquire(block_handle, STARPU_R);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+
+		unsigned i, j, k;
+		for (k = 0; k < SIZE; k++)
+		for (j = 0; j < SIZE; j++)
+		for (i = 0; i < SIZE; i++)
+		{
+			/* node 1 overwrote every element with 33.0f */
+			assert(block[i + j*BIGSIZE + k*BIGSIZE*BIGSIZE] == 33.0f);
+		}
+		starpu_data_release(block_handle);
+
+	}
+	else if (rank == 1)
+	{
+		MPI_Status status;
+		ret = starpu_mpi_recv(block_handle, 0, 0x42, MPI_COMM_WORLD, &status);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_recv");
+
+		/* check the content of the block and modify it */
+		ret = starpu_data_acquire(block_handle, STARPU_RW);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+
+		unsigned i, j, k;
+		for (k = 0; k < SIZE; k++)
+		for (j = 0; j < SIZE; j++)
+		for (i = 0; i < SIZE; i++)
+		{
+			assert(block[i + j*SIZE + k*SIZE*SIZE] == 1.0f);
+			block[i + j*SIZE + k*SIZE*SIZE] = 33.0f;
+		}
+		starpu_data_release(block_handle);
+
+		ret = starpu_mpi_send(block_handle, 0, 0x1337, MPI_COMM_WORLD);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_send");
+	}
+
+	FPRINTF(stdout, "Rank %d is done\n", rank);
+	fflush(stdout);
+
+	if (rank == 0 || rank == 1)
+	{
+		starpu_data_unregister(block_handle);
+		free(block);
+	}
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 150 - 0
nmad/tests/block_interface_pinned.c

@@ -0,0 +1,150 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <stdlib.h>
+#include "helper.h"
+
+#define NITER	2048
+
+#define BIGSIZE	64
+#define SIZE	64
+
+/* Exchange a StarPU block between ranks 0 and 1 using pinned (starpu_malloc)
+ * buffers.  Rank 0 registers an inner SIZE^3 sub-block of a BIGSIZE^3 buffer
+ * (non-contiguous data), rank 1 registers a contiguous SIZE^3 buffer; the
+ * block is sent 0 -> 1, modified, and sent back 1 -> 0. */
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size < 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need at least 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	/* Node 0 will allocate a big block and only register an inner part of
+	 * it as the block data, Node 1 will allocate a block of small size and
+	 * register it directly. Node 0 and 1 will then exchange the content of
+	 * their blocks. */
+
+	float *block;
+	starpu_data_handle_t block_handle;
+
+	if (rank == 0)
+	{
+		starpu_malloc((void **)&block,
+				BIGSIZE*BIGSIZE*BIGSIZE*sizeof(float));
+		memset(block, 0, BIGSIZE*BIGSIZE*BIGSIZE*sizeof(float));
+
+		/* fill the inner block */
+		unsigned i, j, k;
+		for (k = 0; k < SIZE; k++)
+		for (j = 0; j < SIZE; j++)
+		for (i = 0; i < SIZE; i++)
+		{
+			block[i + j*BIGSIZE + k*BIGSIZE*BIGSIZE] = 1.0f;
+		}
+
+		/* ld = BIGSIZE: the registered SIZE^3 block is strided inside
+		 * the BIGSIZE^3 allocation */
+		starpu_block_data_register(&block_handle, 0,
+			(uintptr_t)block, BIGSIZE, BIGSIZE*BIGSIZE,
+			SIZE, SIZE, SIZE, sizeof(float));
+	}
+	else if (rank == 1)
+	{
+		starpu_malloc((void **)&block,
+			SIZE*SIZE*SIZE*sizeof(float));
+		memset(block, 0, SIZE*SIZE*SIZE*sizeof(float));
+
+		starpu_block_data_register(&block_handle, 0,
+			(uintptr_t)block, SIZE, SIZE*SIZE,
+			SIZE, SIZE, SIZE, sizeof(float));
+	}
+
+	if (rank == 0)
+	{
+		MPI_Status status;
+
+		/* round-trip: send the inner block to rank 1 (tag 0x42), then
+		 * receive the modified copy back (tag 0x1337) */
+		ret = starpu_mpi_send(block_handle, 1, 0x42, MPI_COMM_WORLD);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_send");
+
+		ret = starpu_mpi_recv(block_handle, 1, 0x1337, MPI_COMM_WORLD, &status);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_recv");
+
+		/* check the content of the block */
+		ret = starpu_data_acquire(block_handle, STARPU_R);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+
+		unsigned i, j, k;
+		for (k = 0; k < SIZE; k++)
+		for (j = 0; j < SIZE; j++)
+		for (i = 0; i < SIZE; i++)
+		{
+			assert(block[i + j*BIGSIZE + k*BIGSIZE*BIGSIZE] == 33.0f);
+		}
+		starpu_data_release(block_handle);
+
+	}
+	else if (rank == 1)
+	{
+		MPI_Status status;
+
+		ret = starpu_mpi_recv(block_handle, 0, 0x42, MPI_COMM_WORLD, &status);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_recv");
+
+		/* check the content of the block and modify it */
+		ret = starpu_data_acquire(block_handle, STARPU_RW);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+
+		unsigned i, j, k;
+		for (k = 0; k < SIZE; k++)
+		for (j = 0; j < SIZE; j++)
+		for (i = 0; i < SIZE; i++)
+		{
+			assert(block[i + j*SIZE + k*SIZE*SIZE] == 1.0f);
+			block[i + j*SIZE + k*SIZE*SIZE] = 33.0f;
+		}
+		starpu_data_release(block_handle);
+
+		ret = starpu_mpi_send(block_handle, 0, 0x1337, MPI_COMM_WORLD);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_send");
+
+	}
+
+	/* ranks >= 2 never registered anything, so only 0 and 1 clean up */
+	if (rank == 0 || rank == 1)
+	{
+		starpu_data_unregister(block_handle);
+		starpu_free(block);
+	}
+
+	FPRINTF(stdout, "Rank %d is done\n", rank);
+	fflush(stdout);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 110 - 0
nmad/tests/cache.c

@@ -0,0 +1,110 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+#include <starpu_mpi_cache.h>
+
+/* Empty CPU kernel: this test only exercises data transfers and the
+ * communication cache, not computation. */
+void func_cpu(STARPU_ATTRIBUTE_UNUSED void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+}
+
+/* One single-buffer codelet per access mode, so the cache can be driven
+ * with R (cacheable), W and RW (both invalidate remote copies) accesses. */
+struct starpu_codelet mycodelet_r =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 1,
+	.modes = {STARPU_R}
+};
+
+struct starpu_codelet mycodelet_w =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 1,
+	.modes = {STARPU_W}
+};
+
+struct starpu_codelet mycodelet_rw =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 1,
+	.modes = {STARPU_RW}
+};
+
+/* Run one task on node 1 that accesses 'data' with 'mode', then check whether
+ * a received copy of 'data' sits in the communication cache.  'in_cache' is
+ * the expected state; the assertion is only meaningful on the receiving
+ * rank (1), other ranks ignore the lookup result. */
+void test(struct starpu_codelet *codelet, enum starpu_data_access_mode mode, starpu_data_handle_t data, int rank, int in_cache)
+{
+	void *ptr;
+	int ret;
+
+	ret = starpu_mpi_insert_task(MPI_COMM_WORLD, codelet, mode, data, STARPU_EXECUTE_ON_NODE, 1, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+
+	/* internal API: returns NULL when no received copy is cached locally */
+	ptr = _starpu_mpi_cache_received_data_get(data);
+
+	if (rank == 1)
+	{
+	     if (in_cache)
+	     {
+		     STARPU_ASSERT_MSG(ptr != NULL, "Data should be in cache\n");
+	     }
+	     else
+	     {
+		     STARPU_ASSERT_MSG(ptr == NULL, "Data should NOT be in cache\n");
+	     }
+	}
+}
+
+/* Drive the StarPU-MPI transfer cache: access the same remote variable with
+ * different modes and verify whether the receiving node still holds a cached
+ * copy afterwards.  Skipped when the cache is disabled. */
+int main(int argc, char **argv)
+{
+	int rank;
+	int ret;
+	unsigned val;
+	starpu_data_handle_t data;
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+	if (starpu_mpi_cache_is_enabled() == 0) goto skip;
+
+	/* rank 0 owns the variable; other ranks register a placeholder */
+	if (rank == 0)
+		starpu_variable_data_register(&data, 0, (uintptr_t)&val, sizeof(unsigned));
+	else
+		starpu_variable_data_register(&data, -1, (uintptr_t)NULL, sizeof(unsigned));
+	starpu_mpi_data_register(data, 42, 0);
+	FPRINTF_MPI(stderr, "Registering data %p with tag %d and node %d\n", data, 42, 0);
+
+	// We use the same data with different access modes and we check if it is
+	// available or not in the cache
+	test(&mycodelet_r, STARPU_R, data, rank, 1);
+	test(&mycodelet_rw, STARPU_RW, data, rank, 0); /* write invalidates cached copies */
+	test(&mycodelet_r, STARPU_R, data, rank, 1);
+	test(&mycodelet_r, STARPU_R, data, rank, 1);
+	test(&mycodelet_w, STARPU_W, data, rank, 0);
+
+	FPRINTF(stderr, "Waiting ...\n");
+	starpu_task_wait_for_all();
+
+	starpu_data_unregister(data);
+
+skip:
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	return starpu_mpi_cache_is_enabled() == 0 ? STARPU_TEST_SKIPPED : 0;
+}

+ 93 - 0
nmad/tests/cache_disable.c

@@ -0,0 +1,93 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+#include <starpu_mpi_cache.h>
+
+/* Empty CPU kernel: the test only observes cache state, not computation. */
+void func_cpu(STARPU_ATTRIBUTE_UNUSED void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+}
+
+/* Read-only single-buffer codelet: STARPU_R accesses populate the cache. */
+struct starpu_codelet mycodelet_r =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 1,
+	.modes = {STARPU_R}
+};
+
+/* Check that disabling the StarPU-MPI cache at runtime flushes it and that
+ * subsequent receptions are no longer cached.  Skipped when the cache is
+ * disabled from the start. */
+int main(int argc, char **argv)
+{
+	int rank;
+	int ret;
+	unsigned val;
+	starpu_data_handle_t data;
+	void *ptr;
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+	if (starpu_mpi_cache_is_enabled() == 0) goto skip;
+
+	/* rank 0 owns the variable; other ranks register a placeholder */
+	if (rank == 0)
+		starpu_variable_data_register(&data, 0, (uintptr_t)&val, sizeof(unsigned));
+	else
+		starpu_variable_data_register(&data, -1, (uintptr_t)NULL, sizeof(unsigned));
+	starpu_mpi_data_register(data, 42, 0);
+	FPRINTF_MPI(stderr, "Registering data %p with tag %d and node %d\n", data, 42, 0);
+
+	ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r, STARPU_R, data, STARPU_EXECUTE_ON_NODE, 1, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+
+	/* the read on node 1 must have populated its cache */
+	ptr = _starpu_mpi_cache_received_data_get(data);
+	if (rank == 1)
+	{
+	     STARPU_ASSERT_MSG(ptr != NULL, "Data should be in cache\n");
+	}
+
+	// We clean the cache
+	starpu_mpi_cache_set(0);
+
+	// We check the data is no longer in the cache
+	ptr = _starpu_mpi_cache_received_data_get(data);
+	if (rank == 1)
+	{
+	     STARPU_ASSERT_MSG(ptr == NULL, "Data should NOT be in cache\n");
+	}
+
+	/* with the cache disabled, a new reception must not be cached either */
+	ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r, STARPU_R, data, STARPU_EXECUTE_ON_NODE, 1, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+	ptr = _starpu_mpi_cache_received_data_get(data);
+	if (rank == 1)
+	{
+	     STARPU_ASSERT_MSG(ptr == NULL, "Data should NOT be in cache\n");
+	}
+
+	FPRINTF(stderr, "Waiting ...\n");
+	starpu_task_wait_for_all();
+
+	starpu_data_unregister(data);
+
+skip:
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	return starpu_mpi_cache_is_enabled() == 0 ? STARPU_TEST_SKIPPED : 0;
+}

+ 112 - 0
nmad/tests/comm.c

@@ -0,0 +1,112 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+#include <starpu_mpi_cache.h>
+
+/* CPU kernel: doubles the integer variable passed as the (RW) buffer. */
+void func_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	int *value = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	FPRINTF_MPI(stderr, "Executing codelet with value %d\n", *value);
+	*value = *value * 2;
+}
+
+struct starpu_codelet mycodelet =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 1,
+	.modes = {STARPU_RW}
+};
+
+/* Split MPI_COMM_WORLD into two sub-communicators (even/odd world ranks) and
+ * run a send/recv plus an insert_task on the sub-communicator, checking that
+ * StarPU-MPI works on communicators other than MPI_COMM_WORLD. */
+int main(int argc, char **argv)
+{
+	int size;
+	int color;
+	MPI_Comm newcomm;
+	int rank, newrank;
+	int ret;
+	unsigned val = 42;
+	starpu_data_handle_t data;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size < 4)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need at least 4 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	color = rank%2;
+	MPI_Comm_split(MPI_COMM_WORLD, color, rank, &newcomm);
+	MPI_Comm_rank(newcomm, &newrank);
+	FPRINTF_MPI(stderr, "[%d] color %d\n", newrank, color);
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	/* the root of each sub-communicator owns the variable */
+	if (newrank == 0)
+	{
+		val = rank+1;
+		starpu_variable_data_register(&data, 0, (uintptr_t)&val, sizeof(val));
+	}
+	else
+		starpu_variable_data_register(&data, -1, (uintptr_t)NULL, sizeof(unsigned));
+	starpu_mpi_data_register_comm(data, 42, 0, newcomm);
+	FPRINTF_MPI(stderr, "[%d] Registering data %p with tag %d and node %d\n", newrank, data, 42, 0);
+
+	/* NOTE(review): every newrank != 0 enters the receive branch, but the
+	 * root only sends to newrank 1 — with more than 2 ranks per
+	 * sub-communicator the extra ranks would block; the test assumes
+	 * exactly 4 processes. */
+	if (newrank == 0)
+	{
+		FPRINTF_MPI(stderr, "[%d] sending %d\n", newrank, rank);
+		MPI_Send(&rank, 1, MPI_INT, 1, 10, newcomm);
+		starpu_mpi_send(data, 1, 42, newcomm);
+	}
+	else
+	{
+		int x;
+		/* MPI mandates MPI_STATUS_IGNORE (not NULL) when the status is unwanted */
+		MPI_Recv(&x, 1, MPI_INT, 0, 10, newcomm, MPI_STATUS_IGNORE);
+		FPRINTF_MPI(stderr, "[%d] received %d\n", newrank, x);
+		starpu_mpi_recv(data, 0, 42, newcomm, MPI_STATUS_IGNORE);
+	}
+
+	starpu_mpi_insert_task(newcomm, &mycodelet,
+			       STARPU_RW, data,
+			       STARPU_EXECUTE_ON_NODE, 1,
+			       0);
+
+	FPRINTF_MPI(stderr, "Waiting ...\n");
+	starpu_task_wait_for_all();
+
+	starpu_data_unregister(data);
+	if (newrank == 0)
+	{
+		FPRINTF_MPI(stderr, "[%d] new value %u\n", newrank, val);
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+	MPI_Finalize();
+	return 0;
+}

+ 333 - 0
nmad/tests/datatypes.c

@@ -0,0 +1,333 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2013, 2014, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <stdlib.h>
+#include "helper.h"
+
+typedef void (*check_func)(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error);
+
+/* The void interface carries no payload: completing the exchange at all is
+ * the success criterion, so nothing is compared and *error is never set. */
+void check_void(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
+{
+	FPRINTF_MPI(stderr, "Success with void value\n");
+}
+
+/* Compare the float stored behind the sent and received variable handles;
+ * set *error to 1 on mismatch. */
+void check_variable(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
+{
+	float *v_s, *v_r;
+
+	STARPU_ASSERT(starpu_variable_get_elemsize(handle_s) == starpu_variable_get_elemsize(handle_r));
+
+	v_s = (float *)starpu_variable_get_local_ptr(handle_s);
+	v_r = (float *)starpu_variable_get_local_ptr(handle_r);
+
+	if (*v_s == *v_r)
+	{
+		FPRINTF_MPI(stderr, "Success with variable value: %f == %f\n", *v_s, *v_r);
+	}
+	else
+	{
+		*error = 1;
+		FPRINTF_MPI(stderr, "Error with variable value: %f != %f\n", *v_s, *v_r);
+	}
+}
+
+void check_vector(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
+{
+	int ret, i;
+	int nx;
+	int *v_r, *v_s;
+
+	STARPU_ASSERT(starpu_vector_get_elemsize(handle_s) == starpu_vector_get_elemsize(handle_r));
+	STARPU_ASSERT(starpu_vector_get_nx(handle_s) == starpu_vector_get_nx(handle_r));
+
+	nx = starpu_vector_get_nx(handle_r);
+	v_r = (int *)starpu_vector_get_local_ptr(handle_r);
+	v_s = (int *)starpu_vector_get_local_ptr(handle_s);
+
+	for(i=0 ; i<nx ; i++)
+	{
+		if (v_s[i] == v_r[i])
+		{
+			FPRINTF_MPI(stderr, "Success with vector[%d] value: %d == %d\n", i, v_s[i], v_r[i]);
+		}
+		else
+		{
+			*error = 1;
+			FPRINTF_MPI(stderr, "Error with vector[%d] value: %d != %d\n", i, v_s[i], v_r[i]);
+		}
+	}
+}
+
+/* Element-wise compare the sent and received char matrices (honouring the
+ * local leading dimension); set *error to 1 on any mismatch. */
+void check_matrix(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
+{
+	STARPU_ASSERT(starpu_matrix_get_elemsize(handle_s) == starpu_matrix_get_elemsize(handle_r));
+	STARPU_ASSERT(starpu_matrix_get_nx(handle_s) == starpu_matrix_get_nx(handle_r));
+	STARPU_ASSERT(starpu_matrix_get_ny(handle_s) == starpu_matrix_get_ny(handle_r));
+	STARPU_ASSERT(starpu_matrix_get_local_ld(handle_s) == starpu_matrix_get_local_ld(handle_r));
+
+	char *matrix_s = (char *)starpu_matrix_get_local_ptr(handle_s);
+	char *matrix_r = (char *)starpu_matrix_get_local_ptr(handle_r);
+
+	int nx = starpu_matrix_get_nx(handle_s);
+	int ny = starpu_matrix_get_ny(handle_s);
+	int ldy = starpu_matrix_get_local_ld(handle_s);
+
+	int x, y;
+
+	for(y=0 ; y<ny ; y++)
+		for(x=0 ; x<nx ; x++)
+		{
+			int index=(y*ldy)+x;
+			if (matrix_s[index] == matrix_r[index])
+			{
+				FPRINTF_MPI(stderr, "Success with matrix[%d,%d --> %d] value: %c == %c\n", x, y, index, matrix_s[index], matrix_r[index]);
+			}
+			else
+			{
+				*error = 1;
+				FPRINTF_MPI(stderr, "Error with matrix[%d,%d --> %d] value: %c != %c\n", x, y, index, matrix_s[index], matrix_r[index]);
+			}
+		}
+}
+
+/* Element-wise compare the sent and received float blocks (honouring both
+ * local leading dimensions ldy/ldz); set *error to 1 on any mismatch. */
+void check_block(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
+{
+	STARPU_ASSERT(starpu_block_get_elemsize(handle_s) == starpu_block_get_elemsize(handle_r));
+	STARPU_ASSERT(starpu_block_get_nx(handle_s) == starpu_block_get_nx(handle_r));
+	STARPU_ASSERT(starpu_block_get_ny(handle_s) == starpu_block_get_ny(handle_r));
+	STARPU_ASSERT(starpu_block_get_nz(handle_s) == starpu_block_get_nz(handle_r));
+	STARPU_ASSERT(starpu_block_get_local_ldy(handle_s) == starpu_block_get_local_ldy(handle_r));
+	STARPU_ASSERT(starpu_block_get_local_ldz(handle_s) == starpu_block_get_local_ldz(handle_r));
+
+	float *block_s = (float *)starpu_block_get_local_ptr(handle_s);
+	float *block_r = (float *)starpu_block_get_local_ptr(handle_r);
+
+	int nx = starpu_block_get_nx(handle_s);
+	int ny = starpu_block_get_ny(handle_s);
+	int nz = starpu_block_get_nz(handle_s);
+
+	int ldy = starpu_block_get_local_ldy(handle_s);
+	int ldz = starpu_block_get_local_ldz(handle_s);
+
+	int x, y, z;
+
+	for(z=0 ; z<nz ; z++)
+		for(y=0 ; y<ny ; y++)
+			for(x=0 ; x<nx ; x++)
+			{
+				int index=(z*ldz)+(y*ldy)+x;
+				if (block_s[index] == block_r[index])
+				{
+					FPRINTF_MPI(stderr, "Success with block[%d,%d,%d --> %d] value: %f == %f\n", x, y, z, index, block_s[index], block_r[index]);
+				}
+				else
+				{
+					*error = 1;
+					FPRINTF_MPI(stderr, "Error with block[%d,%d,%d --> %d] value: %f != %f\n", x, y, z, index, block_s[index], block_r[index]);
+				}
+			}
+}
+
+/* Ping-pong protocol for one data handle.
+ * Rank 0: send handle_s to 'node' (tag_s), receive the echo into handle_r
+ * (tag_r) and compare both with 'func'.
+ * Peer rank: receive into handle_s and send the SAME handle straight back;
+ * 'handle_r', 'error' and 'func' are unused on that side and may be NULL. */
+void send_recv_and_check(int rank, int node, starpu_data_handle_t handle_s, int tag_s, starpu_data_handle_t handle_r, int tag_r, int *error, check_func func)
+{
+	int ret;
+	MPI_Status status;
+
+	if (rank == 0)
+	{
+		ret = starpu_mpi_send(handle_s, node, tag_s, MPI_COMM_WORLD);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_send");
+		ret = starpu_mpi_recv(handle_r, node, tag_r, MPI_COMM_WORLD, &status);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_recv");
+
+		func(handle_s, handle_r, error);
+	}
+	else
+	{
+		/* echo back through the handle we received into */
+		ret = starpu_mpi_recv(handle_s, node, tag_s, MPI_COMM_WORLD, &status);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_recv");
+		ret = starpu_mpi_send(handle_s, node, tag_r, MPI_COMM_WORLD);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_send");
+	}
+}
+
+/* Round-trip every StarPU data interface (void, variable, vector, matrix,
+ * block) between ranks 0 and 1 and verify the received content matches what
+ * was sent.  Returns non-zero on rank 0 if any comparison failed. */
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+	int error=0;
+
+	int nx=3;
+	int ny=2;
+	int nz=4;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size < 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need at least 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	if (rank == 0)
+	{
+		/* handle [0] is the data sent, handle [1] receives the echo */
+		{
+			starpu_data_handle_t void_handle[2];
+			starpu_void_data_register(&void_handle[0]);
+			starpu_void_data_register(&void_handle[1]);
+
+			send_recv_and_check(rank, 1, void_handle[0], 0x42, void_handle[1], 0x1337, &error, check_void);
+
+			starpu_data_unregister(void_handle[0]);
+			starpu_data_unregister(void_handle[1]);
+		}
+		{
+			float v = 42.12;
+			starpu_data_handle_t variable_handle[2];
+			starpu_variable_data_register(&variable_handle[0], 0, (uintptr_t)&v, sizeof(v));
+			starpu_variable_data_register(&variable_handle[1], -1, (uintptr_t)NULL, sizeof(v));
+
+			send_recv_and_check(rank, 1, variable_handle[0], 0x42, variable_handle[1], 0x1337, &error, check_variable);
+
+			starpu_data_unregister(variable_handle[0]);
+			starpu_data_unregister(variable_handle[1]);
+		}
+
+		{
+			int vector[4] = {1, 2, 3, 4};
+			starpu_data_handle_t vector_handle[2];
+
+			starpu_vector_data_register(&vector_handle[0], 0, (uintptr_t)vector, 4, sizeof(vector[0]));
+			starpu_vector_data_register(&vector_handle[1], -1, (uintptr_t)NULL, 4, sizeof(vector[0]));
+
+			send_recv_and_check(rank, 1, vector_handle[0], 0x43, vector_handle[1], 0x2337, &error, check_vector);
+
+			starpu_data_unregister(vector_handle[0]);
+			starpu_data_unregister(vector_handle[1]);
+		}
+
+		{
+			char *matrix, n='a';
+			int x, y;
+			starpu_data_handle_t matrix_handle[2];
+
+			/* 2D matrix: nx*ny elements (nz plays no role here) */
+			matrix = (char*)malloc(nx*ny*sizeof(char));
+			assert(matrix);
+			for(y=0 ; y<ny ; y++)
+			{
+				for(x=0 ; x<nx ; x++)
+				{
+					matrix[(y*nx)+x] = n++;
+				}
+			}
+
+			starpu_matrix_data_register(&matrix_handle[0], 0, (uintptr_t)matrix, nx, nx, ny, sizeof(char));
+			starpu_matrix_data_register(&matrix_handle[1], -1, (uintptr_t)NULL, nx, nx, ny, sizeof(char));
+
+			send_recv_and_check(rank, 1, matrix_handle[0], 0x75, matrix_handle[1], 0x8555, &error, check_matrix);
+
+			starpu_data_unregister(matrix_handle[0]);
+			starpu_data_unregister(matrix_handle[1]);
+			free(matrix);
+		}
+
+		{
+			float *block, n=1.0;
+			int x, y, z;
+			starpu_data_handle_t block_handle[2];
+
+			block = (float*)malloc(nx*ny*nz*sizeof(float));
+			assert(block);
+			for(z=0 ; z<nz ; z++)
+			{
+				for(y=0 ; y<ny ; y++)
+				{
+					for(x=0 ; x<nx ; x++)
+					{
+						block[(z*nx*ny)+(y*nx)+x] = n++;
+					}
+				}
+			}
+
+			starpu_block_data_register(&block_handle[0], 0, (uintptr_t)block, nx, nx*ny, nx, ny, nz, sizeof(float));
+			starpu_block_data_register(&block_handle[1], -1, (uintptr_t)NULL, nx, nx*ny, nx, ny, nz, sizeof(float));
+
+			send_recv_and_check(rank, 1, block_handle[0], 0x73, block_handle[1], 0x8337, &error, check_block);
+
+			starpu_data_unregister(block_handle[0]);
+			starpu_data_unregister(block_handle[1]);
+			free(block);
+		}
+	}
+	else if (rank == 1)
+	{
+		/* mirror side: one placeholder handle per interface, echoing back */
+		{
+			starpu_data_handle_t void_handle;
+			starpu_void_data_register(&void_handle);
+			send_recv_and_check(rank, 0, void_handle, 0x42, NULL, 0x1337, NULL, NULL);
+			starpu_data_unregister(void_handle);
+		}
+		{
+			starpu_data_handle_t variable_handle;
+			starpu_variable_data_register(&variable_handle, -1, (uintptr_t)NULL, sizeof(float));
+			send_recv_and_check(rank, 0, variable_handle, 0x42, NULL, 0x1337, NULL, NULL);
+			starpu_data_unregister(variable_handle);
+		}
+
+		{
+			starpu_data_handle_t vector_handle;
+			starpu_vector_data_register(&vector_handle, -1, (uintptr_t)NULL, 4, sizeof(int));
+			send_recv_and_check(rank, 0, vector_handle, 0x43, NULL, 0x2337, NULL, NULL);
+			starpu_data_unregister(vector_handle);
+		}
+
+		{
+			starpu_data_handle_t matrix_handle;
+			starpu_matrix_data_register(&matrix_handle, -1, (uintptr_t)NULL, nx, nx, ny, sizeof(char));
+			send_recv_and_check(rank, 0, matrix_handle, 0x75, NULL, 0x8555, NULL, NULL);
+			starpu_data_unregister(matrix_handle);
+		}
+
+		{
+			starpu_data_handle_t block_handle;
+			starpu_block_data_register(&block_handle, -1, (uintptr_t)NULL, nx, nx*ny, nx, ny, nz, sizeof(float));
+			send_recv_and_check(rank, 0, block_handle, 0x73, NULL, 0x8337, NULL, NULL);
+			starpu_data_unregister(block_handle);
+		}
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return rank == 0 ? error : 0;
+}

+ 26 - 0
nmad/tests/helper.h

@@ -0,0 +1,26 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012, 2013, 2014, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <errno.h>
+
+/* Exit code recognised by the test harness as "skipped". */
+#define STARPU_TEST_SKIPPED 77
+
+/* Print unless STARPU_SSILENT is set. */
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+/* Like FPRINTF, but prefixes the message with the MPI rank and the calling
+ * function.  Note: no semicolon after while(0) — the caller supplies it, so
+ * the macro stays safe inside if/else. */
+#define FPRINTF_MPI(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) { \
+						int _disp_rank; MPI_Comm_rank(MPI_COMM_WORLD, &_disp_rank);       \
+						fprintf(ofile, "[%d][starpu_mpi][%s] " fmt , _disp_rank, __starpu_func__ ,## __VA_ARGS__); \
+						fflush(ofile); }} while(0)
+

+ 140 - 0
nmad/tests/insert_task.c

@@ -0,0 +1,140 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+/* CPU kernel: replaces the first (RW) variable with the average of both
+ * buffers; the second buffer is read-only. */
+void func_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	unsigned *x = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	unsigned *y = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
+
+	FPRINTF(stdout, "VALUES: %u %u\n", *x, *y);
+	*x = (*x + *y) / 2;
+}
+
+struct starpu_codelet mycodelet =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_R}
+};
+
+#define X     4
+#define Y     5
+
+/* Returns the MPI node number where data indexes index is */
+/* Ownership is determined by the row only; 'y' is deliberately unused. */
+int my_distrib(int x, int y, int nb_nodes)
+{
+	return x % nb_nodes;
+}
+
+
+/* Distribute an X x Y matrix of variables over the MPI nodes (row-cyclic via
+ * my_distrib) and run a few cross-node averaging tasks through
+ * starpu_mpi_insert_task, which moves data between owners automatically. */
+int main(int argc, char **argv)
+{
+	int rank, size, x, y;
+	int value=0, ret;
+	unsigned matrix[X][Y];
+	starpu_data_handle_t data_handles[X][Y];
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	/* each rank fills its local copy with distinct values */
+	for(x = 0; x < X; x++)
+	{
+		for (y = 0; y < Y; y++)
+		{
+			matrix[x][y] = (rank+1)*10 + value;
+			value++;
+		}
+	}
+#if 0
+	for(x = 0; x < X; x++)
+	{
+		FPRINTF(stdout, "[%d] ", rank);
+		for (y = 0; y < Y; y++)
+		{
+			FPRINTF(stdout, "%3d ", matrix[x][y]);
+		}
+		FPRINTF(stdout, "\n");
+	}
+#endif
+
+	for(x = 0; x < X; x++)
+	{
+		for (y = 0; y < Y; y++)
+		{
+			int mpi_rank = my_distrib(x, y, size);
+			if (mpi_rank == rank)
+			{
+				//FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
+				starpu_variable_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
+			}
+			else
+			{
+				/* I don't own that index, but will need it for my computations */
+				//FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
+				starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(unsigned));
+			}
+			if (data_handles[x][y])
+			{
+				/* tag (y*X)+x is unique because x < X */
+				starpu_mpi_data_register(data_handles[x][y], (y*X)+x, mpi_rank);
+			}
+		}
+	}
+
+	/* a few tasks mixing data owned by different nodes */
+	ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1][1], STARPU_R, data_handles[0][1], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+	ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+	ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0][1], STARPU_R, data_handles[0][0], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+	ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+
+	FPRINTF(stderr, "Waiting ...\n");
+	starpu_task_wait_for_all();
+
+	for(x = 0; x < X; x++)
+	{
+		for (y = 0; y < Y; y++)
+		{
+			if (data_handles[x][y])
+				starpu_data_unregister(data_handles[x][y]);
+		}
+	}
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+#if 0
+	for(x = 0; x < X; x++)
+	{
+		FPRINTF(stdout, "[%d] ", rank);
+		for (y = 0; y < Y; y++)
+		{
+			FPRINTF(stdout, "%3d ", matrix[x][y]);
+		}
+		FPRINTF(stdout, "\n");
+	}
+#endif
+
+	return 0;
+}

+ 162 - 0
nmad/tests/insert_task_block.c

@@ -0,0 +1,162 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+/* CPU kernel: sums every element of the matrix tile, then overwrites each
+ * element with that sum (the commented division would make it an average). */
+void func_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	unsigned *matrix = (unsigned *)STARPU_MATRIX_GET_PTR(descr[0]);
+	int nx = (int)STARPU_MATRIX_GET_NX(descr[0]);
+	int ny = (int)STARPU_MATRIX_GET_NY(descr[0]);
+	int ld = (int)STARPU_MATRIX_GET_LD(descr[0]);
+
+	int i, j;
+	unsigned sum=0;
+
+	for (i = 0; i < nx; i++)
+	{
+		for (j = 0; j < ny; j++)
+		{
+			sum += matrix[i+j*ld];
+		}
+	}
+	for (i = 0; i < nx; i++)
+	{
+		for (j = 0; j < ny; j++)
+		{
+			matrix[i+j*ld] = sum;///(nx*ny);
+		}
+	}
+}
+
+struct starpu_codelet mycodelet =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 1,
+	.modes = {STARPU_RW}
+};
+
+#define SIZE 6
+#define BLOCKS 3
+
+/* Returns the MPI node number where data indexes index is */
+/* Block ownership by block-row only; 'y' is deliberately unused. */
+int my_distrib(int x, int y, int nb_nodes)
+{
+	return x % nb_nodes;
+}
+
+
+/* Split a SIZE x SIZE matrix into BLOCKS x BLOCKS tiles distributed over the
+ * MPI nodes and run one summing task per tile through
+ * starpu_mpi_insert_task. */
+int main(int argc, char **argv)
+{
+	int rank, size, x, y;
+	int ret, value=0;
+	unsigned matrix[SIZE*SIZE];
+	starpu_data_handle_t data_handles[SIZE][SIZE];
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	/* each rank fills its local copy with distinct values */
+	for(x = 0; x < SIZE; x++)
+	{
+		for (y = 0; y < SIZE; y++)
+		{
+			matrix[x+y*SIZE] = rank*100 + value;
+			value++;
+		}
+	}
+#if 1
+	for(x = 0; x < SIZE; x++)
+	{
+		FPRINTF(stdout, "[%d] ", rank);
+		for (y = 0; y < SIZE; y++)
+		{
+			FPRINTF(stdout, "%3u ", matrix[x+y*SIZE]);
+		}
+		FPRINTF(stdout, "\n");
+	}
+#endif
+
+	/* register each (SIZE/BLOCKS)^2 tile, with ld = SIZE into the full matrix */
+	for(x = 0; x < BLOCKS ; x++)
+	{
+		for (y = 0; y < BLOCKS; y++)
+		{
+			int mpi_rank = my_distrib(x, y, size);
+			if (mpi_rank == rank)
+			{
+				//FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
+				starpu_matrix_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[((SIZE/BLOCKS)*x) + ((SIZE/BLOCKS)*y) * SIZE]),
+							    SIZE, SIZE/BLOCKS, SIZE/BLOCKS, sizeof(unsigned));
+			}
+			else
+			{
+				/* I don't own that index, but will need it for my computations */
+				//FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
+				starpu_matrix_data_register(&data_handles[x][y], -1, (uintptr_t)&(matrix[((SIZE/BLOCKS)*x) + ((SIZE/BLOCKS)*y) * SIZE]),
+							    SIZE, SIZE/BLOCKS, SIZE/BLOCKS, sizeof(unsigned));
+			}
+			if (data_handles[x][y])
+			{
+				starpu_mpi_data_register(data_handles[x][y], (y*BLOCKS)+x, mpi_rank);
+			}
+		}
+	}
+
+	for(x = 0; x < BLOCKS; x++)
+	{
+		for (y = 0; y < BLOCKS; y++)
+		{
+			ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
+						     STARPU_RW, data_handles[x][y],
+						     0);
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+		}
+	}
+
+	FPRINTF(stderr, "Waiting ...\n");
+	starpu_task_wait_for_all();
+
+	for(x = 0; x < BLOCKS; x++)
+	{
+		for (y = 0; y < BLOCKS; y++)
+		{
+			if (data_handles[x][y])
+				starpu_data_unregister(data_handles[x][y]);
+		}
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+#if 1
+	for(x = 0; x < SIZE; x++)
+	{
+		FPRINTF(stdout, "[%d] ", rank);
+		for (y = 0; y < SIZE; y++)
+		{
+			FPRINTF(stdout, "%3u ", matrix[x+y*SIZE]);
+		}
+		FPRINTF(stdout, "\n");
+	}
+#endif
+
+	return 0;
+}

+ 150 - 0
nmad/tests/insert_task_cache.c

@@ -0,0 +1,150 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012, 2013, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <common/config.h>
+#include <starpu.h>
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+#if !defined(STARPU_HAVE_SETENV)
+#warning setenv is not defined. Skipping test
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+
+/* No-op CPU kernel: this test only measures communication, not computation */
+void func_cpu(STARPU_ATTRIBUTE_UNUSED void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+}
+
+/* Two-buffer codelet: first handle read-write, second read-only */
+struct starpu_codelet mycodelet =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_R}
+};
+
+#define N     1000
+
+/* Returns the MPI node number where data indexes index is */
+int my_distrib(int x)
+{
+	/* Data i is owned by rank i (test is meant for >= 2 ranks) */
+	return x;
+}
+
+/* Runs one full StarPU session with STARPU_MPI_CACHE set to `enabled`
+ * ("0" or "1"): inserts 3 batches of 5 identical tasks (with a cache
+ * flush of data_handles[0] between the last two batches) and stores the
+ * per-destination communication volumes into `comm_amount` (one entry
+ * per rank, filled by starpu_mpi_comm_amounts_retrieve).
+ * `size` is currently unused inside the function. */
+void test_cache(int rank, int size, char *enabled, size_t *comm_amount)
+{
+	int i;
+	int ret;
+	unsigned v[2][N];
+	starpu_data_handle_t data_handles[2];
+
+	/* Must be set before starpu_mpi_init() so the runtime picks it up */
+	setenv("STARPU_MPI_CACHE", enabled, 1);
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	/* Register the two vectors; v's contents are never read, only moved */
+	for(i = 0; i < 2; i++)
+	{
+		int mpi_rank = my_distrib(i);
+		if (mpi_rank == rank)
+		{
+			//FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
+			starpu_vector_data_register(&data_handles[i], 0, (uintptr_t)&(v[i]), N, sizeof(unsigned));
+		}
+		else
+		{
+			/* I don't own that index, but will need it for my computations */
+			//FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
+			starpu_vector_data_register(&data_handles[i], -1, (uintptr_t)NULL, N, sizeof(unsigned));
+		}
+		starpu_mpi_data_register(data_handles[i], i, mpi_rank);
+	}
+
+	/* Batch 1: handle[1] must travel to handle[0]'s owner (cached after 1st) */
+	for(i = 0; i < 5; i++)
+	{
+		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0], STARPU_R, data_handles[1], 0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+	}
+
+	/* Batch 2: roles reversed, handle[0] travels the other way */
+	for(i = 0; i < 5; i++)
+	{
+		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1], STARPU_R, data_handles[0], 0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+	}
+
+	/* Flushing repeatedly is harmless; forces a re-send in the next batch */
+	for(i = 0; i < 5; i++)
+	{
+		starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[0]);
+	}
+
+	/* Batch 3: same as batch 2, after the flush */
+	for(i = 0; i < 5; i++)
+	{
+		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1], STARPU_R, data_handles[0], 0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+	}
+
+	starpu_task_wait_for_all();
+
+	for(i = 0; i < 2; i++)
+	{
+		starpu_data_unregister(data_handles[i]);
+	}
+
+	/* Snapshot the volumes before shutdown resets the counters */
+	starpu_mpi_comm_amounts_retrieve(comm_amount);
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+}
+
+/* Runs the cache scenario with the cache disabled then enabled, and
+ * checks on ranks 0 and 1 that disabling the cache multiplies the
+ * communication volume towards the peer by 5 (the batch repetition
+ * count in test_cache). Returns 0 on success. */
+int main(int argc, char **argv)
+{
+	int dst, rank, size;
+	int result=0;
+	size_t *comm_amount_with_cache;
+	size_t *comm_amount_without_cache;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	/* Needed for starpu_mpi_comm_amounts_retrieve() to report real figures */
+	setenv("STARPU_COMM_STATS", "1", 1);
+
+	comm_amount_with_cache = malloc(size * sizeof(size_t));
+	comm_amount_without_cache = malloc(size * sizeof(size_t));
+
+	/* STARPU_MPI_CACHE="0" disables the cache, so its figures belong in
+	 * the "without cache" buffer (the previous code had the two calls
+	 * storing into the opposite buffers). */
+	test_cache(rank, size, "0", comm_amount_without_cache);
+	test_cache(rank, size, "1", comm_amount_with_cache);
+
+	if (rank == 0 || rank == 1)
+	{
+		dst = (rank == 0) ? 1 : 0;
+		/* Without the cache every one of the 5 repetitions re-sends */
+		result = (comm_amount_without_cache[dst] == comm_amount_with_cache[dst] * 5);
+		fprintf(stderr, "Communication cache mechanism is %sworking\n", result?"":"NOT ");
+	}
+	else
+		result = 1;
+
+	free(comm_amount_without_cache);
+	free(comm_amount_with_cache);
+
+	MPI_Finalize();
+	return !result;
+}
+#endif

+ 142 - 0
nmad/tests/insert_task_compute.c

@@ -0,0 +1,142 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2013, 2014, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+/* CPU kernel: multiplies the first variable by the second, in place.
+ * _args packs the caller's MPI rank, used here only for tracing. */
+void func_cpu(void *descr[], void *_args)
+{
+	int rank;
+	int *x = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	int *y = (int *)STARPU_VARIABLE_GET_PTR(descr[1]);
+
+	starpu_codelet_unpack_args(_args, &rank);
+
+	/* NOTE(review): "%u" is used for int arguments; "%d" would match — confirm */
+	FPRINTF(stdout, "[%d] VALUES: %u %u\n", rank, *x, *y);
+	*x = *x * *y;
+}
+
+/* Two-buffer codelet: descr[0] read-write, descr[1] read-only */
+struct starpu_codelet mycodelet =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_R}
+};
+
+/* Runs mycodelet once through starpu_mpi_insert_task, either with explicit
+ * per-handle arguments (data_array == 0) or via STARPU_DATA_ARRAY
+ * (data_array == 1), forcing execution on `node`. Ranks 0 and 1 seed their
+ * variable from before[] and verify it against after[]; other ranks just
+ * participate. Returns 0 on success, non-zero on mismatch, -ENODEV when no
+ * CPU worker is available. */
+int test(int rank, int node, int *before, int *after, int data_array)
+{
+	int ok, ret, i, x[2];
+	starpu_data_handle_t data_handles[2];
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	if (starpu_cpu_worker_get_count() <= 0)
+	{
+		// If there is no cpu to execute the codelet, mpi will block trying to do the post-execution communication
+		ret = -ENODEV;
+		goto nodata;
+	}
+
+	FPRINTF_MPI(stderr, "Testing with data_array=%d and node=%d\n", data_array, node);
+
+	/* Variable i is owned by rank i; ranks > 1 register placeholder data */
+	for(i=0 ; i<2 ; i++)
+	{
+		if (rank <= 1)
+		{
+			x[i] = before[rank*2+i];
+			FPRINTF_MPI(stderr, "before computation x[%d] = %d\n", i, x[i]);
+		}
+		else
+			x[i] = rank*2+i;
+		if (rank == i)
+			starpu_variable_data_register(&data_handles[i], 0, (uintptr_t)&x[i], sizeof(int));
+		else
+			starpu_variable_data_register(&data_handles[i], -1, (uintptr_t)NULL, sizeof(int));
+		starpu_mpi_data_register(data_handles[i], i, i);
+	}
+
+	/* Both branches submit the same task; only the argument style differs */
+	switch(data_array)
+	{
+		case 0:
+		{
+			ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
+						     STARPU_RW, data_handles[0], STARPU_R, data_handles[1],
+						     STARPU_VALUE, &rank, sizeof(rank),
+						     STARPU_EXECUTE_ON_NODE, node, 0);
+			break;
+		}
+		case 1:
+		{
+			ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
+						     STARPU_DATA_ARRAY, data_handles, 2,
+						     STARPU_VALUE, &rank, sizeof(rank),
+						     STARPU_EXECUTE_ON_NODE, node, 0);
+			break;
+		}
+	}
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_task_insert");
+	starpu_task_wait_for_all();
+
+	/* Unregistering flushes the computed value back into x[] */
+	for(i=0; i<2; i++)
+	{
+		starpu_data_unregister(data_handles[i]);
+	}
+
+	ok = 1;
+	if (rank <= 1)
+	{
+		for(i=0; i<2; i++)
+		{
+			ok = ok && (x[i] == after[rank*2+i]);
+			FPRINTF_MPI(stderr, "after computation x[%d] = %d, should be %d\n", i, x[i], after[rank*2+i]);
+		}
+		FPRINTF_MPI(stderr, "result is %s\n", ok?"CORRECT":"NOT CORRECT");
+	}
+
+nodata:
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	/* ok is never read on the -ENODEV path, so it is safe uninitialized there */
+	return ret == -ENODEV ? ret : !ok;
+}
+
+/* Exercises test() for every combination of executing node (0/1) and
+ * argument style (explicit handles / STARPU_DATA_ARRAY). Stops at the
+ * first failure; -ENODEV (no CPU worker) maps to a skipped test. */
+int main(int argc, char **argv)
+{
+	int rank;
+	int ret;
+	int before[4] = {10, 20, 11, 22};
+	/* Expected values per executing node; identical since the kernel is
+	 * deterministic regardless of where it runs */
+	int after_node[2][4] = {{220, 20, 11, 22}, {220, 20, 11, 22}};
+	int node, data_array;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+	for(node=0 ; node<=1 ; node++)
+	{
+		for(data_array=0 ; data_array<=1 ; data_array++)
+		{
+			ret = test(rank, node, before, after_node[node], data_array);
+			/* Any non-zero result (including -ENODEV) aborts the sweep */
+			if (ret != 0) goto end;
+		}
+	}
+
+end:
+	MPI_Finalize();
+	return ret==-ENODEV?STARPU_TEST_SKIPPED:ret;
+}

+ 116 - 0
nmad/tests/insert_task_count.c

@@ -0,0 +1,116 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	32
+#else
+#  define NITER	2048
+#endif
+
+#ifdef STARPU_USE_CUDA
+extern void increment_cuda(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args);
+#endif
+
+/* CPU kernel: increments the integer token held in the 1-element vector */
+void increment_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	int *tokenptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	(*tokenptr)++;
+}
+
+/* Single read-write buffer; CUDA variant is used when built with CUDA */
+static struct starpu_codelet increment_cl =
+{
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {increment_cuda},
+#endif
+	.cpu_funcs = {increment_cpu},
+	.nbuffers = 1,
+	.modes = {STARPU_RW}
+};
+
+/* Increments a token owned by rank 1 NITER times, always executing on
+ * node 0: each task therefore transfers the token there and back.
+ * Odd iterations use a synchronous send (STARPU_SSEND) to exercise that
+ * path. After shutdown, rank 1 must see token == NITER and every other
+ * rank must still see its local 0. */
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+	int token = 0;
+	starpu_data_handle_t token_handle;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size < 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need at least 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	/* Rank 1 owns the token; everyone else registers a placeholder */
+	if (rank == 1)
+		starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(token));
+	else
+		starpu_vector_data_register(&token_handle, -1, (uintptr_t)NULL, 1, sizeof(token));
+	starpu_mpi_data_register(token_handle, 12, 1);
+
+	int nloops = NITER;
+	int loop;
+
+	FPRINTF_MPI(stderr, "Start with token value %d\n", token);
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		if (loop % 2)
+			starpu_mpi_insert_task(MPI_COMM_WORLD, &increment_cl,
+					       STARPU_RW|STARPU_SSEND, token_handle,
+					       STARPU_EXECUTE_ON_NODE, 0,
+					       0);
+		else
+			starpu_mpi_insert_task(MPI_COMM_WORLD, &increment_cl,
+					       STARPU_RW, token_handle,
+					       STARPU_EXECUTE_ON_NODE, 0,
+					       0);
+	}
+
+	starpu_task_wait_for_all();
+	/* Unregistering writes the final value back into token on rank 1 */
+	starpu_data_unregister(token_handle);
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	FPRINTF_MPI(stderr, "Final value for token %d\n", token);
+
+	MPI_Finalize();
+
+	if (rank == 1)
+	{
+		STARPU_ASSERT_MSG(token == nloops, "token==%d != expected_value==%d\n", token, nloops);
+	}
+	else
+	{
+		STARPU_ASSERT_MSG(token == 0, "token==%d != expected_value==0\n", token);
+
+	}
+
+	return 0;
+}

+ 169 - 0
nmad/tests/insert_task_owner.c

@@ -0,0 +1,169 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+/* CPU kernel: asserts it is running on the node the caller expected.
+ * _args packs the expected executing rank. */
+void func_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	int node;
+	int rank;
+
+	starpu_codelet_unpack_args(_args, &node);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	FPRINTF(stderr, "Expected node: %d - Actual node: %d\n", node, rank);
+
+	assert(node == rank);
+}
+
+/* One codelet per access-mode combination being tested below */
+struct starpu_codelet mycodelet_r_w =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_W}
+};
+
+struct starpu_codelet mycodelet_rw_r =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_R}
+};
+
+struct starpu_codelet mycodelet_rw_rw =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_RW}
+};
+
+struct starpu_codelet mycodelet_w_r =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 2,
+	.modes = {STARPU_W, STARPU_R}
+};
+
+struct starpu_codelet mycodelet_r_r =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_R}
+};
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size, err, node;
+	int x0=32, x1=23;
+	starpu_data_handle_t data_handlesx0;
+	starpu_data_handle_t data_handlesx1;
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (rank != 0 && rank != 1) goto end;
+
+	if (rank == 0)
+	{
+		starpu_variable_data_register(&data_handlesx0, 0, (uintptr_t)&x0, sizeof(x0));
+		starpu_variable_data_register(&data_handlesx1, -1, (uintptr_t)NULL, sizeof(int));
+	}
+	else if (rank == 1)
+	{
+		starpu_variable_data_register(&data_handlesx1, 0, (uintptr_t)&x1, sizeof(x1));
+		starpu_variable_data_register(&data_handlesx0, -1, (uintptr_t)NULL, sizeof(int));
+	}
+	starpu_mpi_data_register(data_handlesx0, 0, 0);
+	starpu_mpi_data_register(data_handlesx1, 1, 1);
+
+	node = starpu_mpi_data_get_rank(data_handlesx1);
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_w,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_R, data_handlesx0, STARPU_W, data_handlesx1,
+				     0);
+	assert(err == 0);
+
+	node = starpu_mpi_data_get_rank(data_handlesx0);
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_r,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_RW, data_handlesx0, STARPU_R, data_handlesx1,
+				     0);
+	assert(err == 0);
+
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_rw,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1,
+				     0);
+	assert(err == -EINVAL);
+
+	node = 1;
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_rw,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
+				     0);
+	assert(err == 0);
+
+	node = 0;
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_rw,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
+				     0);
+	assert(err == 0);
+
+	node = 0;
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_r,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_R, data_handlesx0, STARPU_R, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
+				     0);
+	assert(err == 0);
+
+	/* Here the value specified by the property STARPU_EXECUTE_ON_NODE is
+	   going to overwrite the node even though the data model clearly specifies
+	   which node is going to execute the codelet */
+	node = 0;
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_w,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_R, data_handlesx0, STARPU_W, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
+				     0);
+	assert(err == 0);
+
+	/* Here the value specified by the property STARPU_EXECUTE_ON_NODE is
+	   going to overwrite the node even though the data model clearly specifies
+	   which node is going to execute the codelet */
+	node = 0;
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_w_r,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_W, data_handlesx0, STARPU_R, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
+				     0);
+	assert(err == 0);
+
+	fprintf(stderr, "Waiting ...\n");
+	starpu_task_wait_for_all();
+	starpu_data_unregister(data_handlesx0);
+	starpu_data_unregister(data_handlesx1);
+
+end:
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	return 0;
+}
+

+ 126 - 0
nmad/tests/insert_task_owner2.c

@@ -0,0 +1,126 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012, 2013, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+/* CPU kernel over 4 variables: reads x0/x1, writes y = (x0+x1)*100,
+ * then overwrites x1, x2 and even the R-mode x0 (the x0 write is local
+ * to the executing node since descr[0] is only declared STARPU_R). */
+void func_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	int *x0 = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	int *x1 = (int *)STARPU_VARIABLE_GET_PTR(descr[1]);
+	int *x2 = (int *)STARPU_VARIABLE_GET_PTR(descr[2]);
+	int *y = (int *)STARPU_VARIABLE_GET_PTR(descr[3]);
+
+	//FPRINTF(stderr, "-------> CODELET VALUES: %d %d %d %d\n", *x0, *x1, *x2, *y);
+	//*x2 = 45;
+	//*y = 144;
+
+	/* x2 is write-only, so its incoming value is unspecified and not printed */
+	FPRINTF(stderr, "-------> CODELET VALUES: %d %d (x2) %d\n", *x0, *x1, *y);
+	*y = (*x0 + *x1) * 100;
+	*x1 = 12;
+	*x2 = 24;
+	*x0 = 36;
+	FPRINTF(stderr, "-------> CODELET VALUES: %d %d %d %d\n", *x0, *x1, *x2, *y);
+}
+
+/* Four buffers: x0 read-only, x1 read-write, x2 and y write-only */
+struct starpu_codelet mycodelet =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 4,
+	.modes = {STARPU_R, STARPU_RW, STARPU_W, STARPU_W}
+};
+
+/* Distributes x[0..2] on rank 0 and y on rank 1, forces the codelet to
+ * run on node 1, then gathers all four values back to node 0 and prints
+ * them there. No automated check: the output is inspected manually. */
+int main(int argc, char **argv)
+{
+	int rank, size, err;
+	int x[3], y=0;
+	int i, ret;
+	starpu_data_handle_t data_handles[4];
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (rank == 0)
+	{
+		/* Owner of x[0..2]; y lives elsewhere */
+		for(i=0 ; i<3 ; i++)
+		{
+			x[i] = 10*(i+1);
+			starpu_variable_data_register(&data_handles[i], 0, (uintptr_t)&x[i], sizeof(x[i]));
+		}
+		y = -1;
+		starpu_variable_data_register(&data_handles[3], -1, (uintptr_t)NULL, sizeof(int));
+	}
+	else if (rank == 1)
+	{
+		/* Owner of y; the x's live elsewhere */
+		for(i=0 ; i<3 ; i++)
+		{
+			x[i] = -1;
+			starpu_variable_data_register(&data_handles[i], -1, (uintptr_t)NULL, sizeof(int));
+		}
+		y=200;
+		starpu_variable_data_register(&data_handles[3], 0, (uintptr_t)&y, sizeof(int));
+	}
+	else
+	{
+		/* Other ranks own nothing but still need all four handles */
+		for(i=0 ; i<4 ; i++)
+			starpu_variable_data_register(&data_handles[i], -1, (uintptr_t)NULL, sizeof(int));
+	}
+	FPRINTF(stderr, "[%d][init] VALUES: %d %d %d %d\n", rank, x[0], x[1], x[2], y);
+
+	for(i=0 ; i<3 ; i++)
+	{
+		starpu_mpi_data_register(data_handles[i], i, 0);
+	}
+	starpu_mpi_data_register(data_handles[3], 3, 1);
+
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
+				     STARPU_R, data_handles[0], STARPU_RW, data_handles[1],
+				     STARPU_W, data_handles[2],
+				     STARPU_W, data_handles[3],
+				     STARPU_EXECUTE_ON_NODE, 1, 0);
+	STARPU_CHECK_RETURN_VALUE(err, "starpu_mpi_insert_task");
+	starpu_task_wait_for_all();
+
+	/* Pull every value onto node 0 and read it there via a local acquire */
+	int *values = malloc(4 * sizeof(int));
+	for(i=0 ; i<4 ; i++)
+	{
+		starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, data_handles[i], 0, NULL, NULL);
+		if (rank == 0)
+		{
+			starpu_data_acquire(data_handles[i], STARPU_R);
+			values[i] = *((int *)starpu_data_get_local_ptr(data_handles[i]));
+			starpu_data_release(data_handles[i]);
+		}
+		starpu_data_unregister(data_handles[i]);
+	}
+	if (rank == 0)
+	{
+		FPRINTF(stderr, "[%d][local ptr] VALUES: %d %d %d %d\n", rank, values[0], values[1], values[2], values[3]);
+	}
+        FPRINTF(stderr, "[%d][end] VALUES: %d %d %d %d\n", rank, x[0], x[1], x[2], y);
+
+	free(values);
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	return 0;
+}
+

+ 107 - 0
nmad/tests/insert_task_owner_data.c

@@ -0,0 +1,107 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+/* CPU kernel: increments the first variable and squares the second */
+void func_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	int *x0 = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	int *x1 = (int *)STARPU_VARIABLE_GET_PTR(descr[1]);
+
+	*x0 += 1;
+	*x1 *= *x1;
+}
+
+/* Both buffers read-write */
+struct starpu_codelet mycodelet =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_RW}
+};
+
+/* Checks STARPU_EXECUTE_ON_DATA: the task must execute on the owner of
+ * data_handles[1] (rank 1). x[0]=11 on rank 0 and x[1]=12 on rank 1;
+ * after the kernel, rank 0 gathers both values and expects 12 and 144.
+ * Returns EXIT_FAILURE on mismatch (checked on rank 0 only). */
+int main(int argc, char **argv)
+{
+	int rank, size, err;
+	int x[2];
+	int ret, i;
+	starpu_data_handle_t data_handles[2];
+	int values[2];
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	/* x[i] is owned by rank i; other ranks register placeholders */
+	if (rank == 0)
+	{
+		x[0] = 11;
+		starpu_variable_data_register(&data_handles[0], 0, (uintptr_t)&x[0], sizeof(x[0]));
+		starpu_variable_data_register(&data_handles[1], -1, (uintptr_t)NULL, sizeof(x[1]));
+	}
+	else if (rank == 1)
+	{
+		x[1] = 12;
+		starpu_variable_data_register(&data_handles[0], -1, (uintptr_t)NULL, sizeof(x[0]));
+		starpu_variable_data_register(&data_handles[1], 0, (uintptr_t)&x[1], sizeof(x[1]));
+	}
+	else
+	{
+		starpu_variable_data_register(&data_handles[0], -1, (uintptr_t)NULL, sizeof(x[0]));
+		starpu_variable_data_register(&data_handles[1], -1, (uintptr_t)NULL, sizeof(x[1]));
+	}
+
+	starpu_mpi_data_register(data_handles[0], 0, 0);
+	starpu_mpi_data_register(data_handles[1], 1, 1);
+
+	/* STARPU_EXECUTE_ON_DATA forces execution on data_handles[1]'s owner */
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
+				     STARPU_RW, data_handles[0], STARPU_RW, data_handles[1],
+				     STARPU_EXECUTE_ON_DATA, data_handles[1],
+				     0);
+	assert(err == 0);
+	starpu_task_wait_for_all();
+
+	/* Bring both values to node 0 and read them through a local acquire */
+	for(i=0 ; i<2 ; i++)
+	{
+		starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, data_handles[i], 0, NULL, NULL);
+		if (rank == 0)
+		{
+			starpu_data_acquire(data_handles[i], STARPU_R);
+			values[i] = *((int *)starpu_data_get_local_ptr(data_handles[i]));
+			starpu_data_release(data_handles[i]);
+		}
+	}
+	ret = 0;
+	if (rank == 0)
+	{
+		FPRINTF(stderr, "[%d][local ptr] VALUES: %d %d\n", rank, values[0], values[1]);
+		if (values[0] != 12 || values[1] != 144)
+		{
+			ret = EXIT_FAILURE;
+		}
+	}
+
+	starpu_data_unregister(data_handles[0]);
+	starpu_data_unregister(data_handles[1]);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	return ret;
+}
+

+ 144 - 0
nmad/tests/insert_task_recv_cache.c

@@ -0,0 +1,144 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012, 2013, 2014, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <common/config.h>
+#include <starpu.h>
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+#if !defined(STARPU_HAVE_SETENV)
+#warning setenv is not defined. Skipping test
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+
+/* No-op CPU kernel: only the data transfers matter for this cache test */
+void func_cpu(STARPU_ATTRIBUTE_UNUSED void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+}
+
+/* Two-buffer codelet: first handle read-write, second read-only */
+struct starpu_codelet mycodelet =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_R}
+};
+
+#define N     1000
+
+/* Returns the MPI node number where data indexes index is */
+int my_distrib(int x)
+{
+	/* Data i is owned by rank i */
+	return x;
+}
+
+/* Runs one StarPU session with STARPU_MPI_CACHE set to `enabled`
+ * ("0"/"1"): two identical task insertions (second hits the cache when
+ * enabled), a flush of data_handles[1], then two more insertions.
+ * Communication volumes per destination are stored into `comm_amount`.
+ * `size` is currently unused inside the function. */
+void test_cache(int rank, int size, char *enabled, size_t *comm_amount)
+{
+	int i;
+	int ret;
+	unsigned v[2][N];
+	starpu_data_handle_t data_handles[2];
+
+	FPRINTF_MPI(stderr, "Testing with STARPU_MPI_CACHE=%s\n", enabled);
+	/* Must be set before starpu_mpi_init() to take effect */
+	setenv("STARPU_MPI_CACHE", enabled, 1);
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	for(i = 0; i < 2; i++)
+	{
+		int mpi_rank = my_distrib(i);
+		if (mpi_rank == rank)
+		{
+			starpu_vector_data_register(&data_handles[i], 0, (uintptr_t)&(v[i]), N, sizeof(unsigned));
+		}
+		else
+		{
+			/* I don't own that index, but will need it for my computations */
+			starpu_vector_data_register(&data_handles[i], -1, (uintptr_t)NULL, N, sizeof(unsigned));
+		}
+		starpu_mpi_data_register(data_handles[i], i, mpi_rank);
+	}
+
+	// We call starpu_mpi_insert_task twice, when the cache is enabled, the 1st time puts the
+	// data in the cache, the 2nd time allows to check the data is not sent again
+	for(i = 0; i < 2; i++)
+	{
+		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0], STARPU_R, data_handles[1], 0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+	}
+
+	// Flush the cache for data_handles[1] which has been sent from node1 to node0
+	starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[1]);
+
+	// Check again
+	for(i = 0; i < 2; i++)
+	{
+		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0], STARPU_R, data_handles[1], 0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+	}
+
+	starpu_task_wait_for_all();
+
+	for(i = 0; i < 2; i++)
+	{
+		starpu_data_unregister(data_handles[i]);
+	}
+
+	/* Snapshot the volumes before shutdown resets the counters */
+	starpu_mpi_comm_amounts_retrieve(comm_amount);
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+}
+
+/* Runs the receive-side cache scenario with the cache disabled then
+ * enabled, and checks on rank 1 that disabling the cache doubles the
+ * volume received from rank 0. Returns 0 on success. */
+int main(int argc, char **argv)
+{
+	int rank, size;
+	int result=0;
+	size_t *comm_amount_with_cache;
+	size_t *comm_amount_without_cache;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	/* Enable the statistics starpu_mpi_comm_amounts_retrieve relies on */
+	setenv("STARPU_COMM_STATS", "1", 1);
+	setenv("STARPU_MPI_CACHE_STATS", "1", 1);
+
+	comm_amount_with_cache = malloc(size * sizeof(size_t));
+	comm_amount_without_cache = malloc(size * sizeof(size_t));
+
+	/* STARPU_MPI_CACHE="0" disables the cache, so its figures belong in
+	 * the "without cache" buffer (the previous code stored them into the
+	 * opposite, misleadingly-named, buffers). */
+	test_cache(rank, size, "0", comm_amount_without_cache);
+	test_cache(rank, size, "1", comm_amount_with_cache);
+
+	if (rank == 1)
+	{
+		/* 4 insertions with one flush: 4 sends uncached vs 2 cached */
+		result = (comm_amount_without_cache[0] == comm_amount_with_cache[0] * 2);
+		/* %zu: the amounts are size_t (the previous %ld was mismatched) */
+		FPRINTF_MPI(stderr, "Communication cache mechanism is %sworking (with cache: %zu) (without cache: %zu)\n", result?"":"NOT ", comm_amount_with_cache[0], comm_amount_without_cache[0]);
+	}
+	else
+		result = 1;
+
+	free(comm_amount_without_cache);
+	free(comm_amount_with_cache);
+
+	MPI_Finalize();
+	return !result;
+}
+#endif

+ 150 - 0
nmad/tests/insert_task_sent_cache.c

@@ -0,0 +1,150 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012, 2013, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <common/config.h>
+#include <starpu.h>
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+#if !defined(STARPU_HAVE_SETENV)
+#warning setenv is not defined. Skipping test
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+
+/* No-op CPU kernel: only the data transfers matter for this cache test */
+void func_cpu(STARPU_ATTRIBUTE_UNUSED void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+}
+
+/* Two-buffer codelet: first handle read-write, second read-only */
+struct starpu_codelet mycodelet =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_R}
+};
+
+#define N     1000
+
+/* Returns the MPI node number where data indexes index is */
+int my_distrib(int x)
+{
+	/* Data i is owned by rank i */
+	return x;
+}
+
+/* Runs one StarPU session with STARPU_MPI_CACHE set to `enabled`
+ * ("0"/"1"): 3 batches of 5 identical task insertions with a flush of
+ * data_handles[0] between the last two, then stores the per-destination
+ * communication volumes into `comm_amount`.
+ * `size` is currently unused inside the function. */
+void test_cache(int rank, int size, char *enabled, size_t *comm_amount)
+{
+	int i;
+	int ret;
+	unsigned v[2][N];
+	starpu_data_handle_t data_handles[2];
+
+	/* Must be set before starpu_mpi_init() to take effect */
+	setenv("STARPU_MPI_CACHE", enabled, 1);
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	for(i = 0; i < 2; i++)
+	{
+		int mpi_rank = my_distrib(i);
+		if (mpi_rank == rank)
+		{
+			//FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
+			starpu_vector_data_register(&data_handles[i], 0, (uintptr_t)&(v[i]), N, sizeof(unsigned));
+		}
+		else
+		{
+			/* I don't own that index, but will need it for my computations */
+			//FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
+			starpu_vector_data_register(&data_handles[i], -1, (uintptr_t)NULL, N, sizeof(unsigned));
+		}
+		starpu_mpi_data_register(data_handles[i], i, mpi_rank);
+	}
+
+	/* Batch 1: handle[1] travels to handle[0]'s owner (cached after 1st) */
+	for(i = 0; i < 5; i++)
+	{
+		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0], STARPU_R, data_handles[1], 0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+	}
+
+	/* Batch 2: roles reversed */
+	for(i = 0; i < 5; i++)
+	{
+		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1], STARPU_R, data_handles[0], 0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+	}
+
+	/* Repeated flush is harmless; forces a re-send in batch 3 */
+	for(i = 0; i < 5; i++)
+	{
+		starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[0]);
+	}
+
+	/* Batch 3: same as batch 2, after the flush */
+	for(i = 0; i < 5; i++)
+	{
+		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1], STARPU_R, data_handles[0], 0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+	}
+
+	starpu_task_wait_for_all();
+
+	for(i = 0; i < 2; i++)
+	{
+		starpu_data_unregister(data_handles[i]);
+	}
+
+	/* Snapshot the volumes before shutdown resets the counters */
+	starpu_mpi_comm_amounts_retrieve(comm_amount);
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+}
+
+/* Runs the send-side cache scenario with the cache disabled then
+ * enabled, and checks on ranks 0 and 1 that disabling the cache
+ * multiplies the volume sent to the peer by 5 (the batch repetition
+ * count in test_cache). Returns 0 on success. */
+int main(int argc, char **argv)
+{
+	int dst, rank, size;
+	int result=0;
+	size_t *comm_amount_with_cache;
+	size_t *comm_amount_without_cache;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	/* Enable the statistics starpu_mpi_comm_amounts_retrieve relies on */
+	setenv("STARPU_COMM_STATS", "1", 1);
+
+	comm_amount_with_cache = malloc(size * sizeof(size_t));
+	comm_amount_without_cache = malloc(size * sizeof(size_t));
+
+	/* STARPU_MPI_CACHE="0" disables the cache, so its figures belong in
+	 * the "without cache" buffer (the previous code stored them into the
+	 * opposite, misleadingly-named, buffers). */
+	test_cache(rank, size, "0", comm_amount_without_cache);
+	test_cache(rank, size, "1", comm_amount_with_cache);
+
+	if (rank == 0 || rank == 1)
+	{
+		dst = (rank == 0) ? 1 : 0;
+		/* Without the cache every one of the 5 repetitions re-sends */
+		result = (comm_amount_without_cache[dst] == comm_amount_with_cache[dst] * 5);
+		FPRINTF_MPI(stderr, "Communication cache mechanism is %sworking\n", result?"":"NOT ");
+	}
+	else
+		result = 1;
+
+	free(comm_amount_without_cache);
+	free(comm_amount_with_cache);
+
+	MPI_Finalize();
+	return !result;
+}
+#endif

+ 132 - 0
nmad/tests/matrix.c

@@ -0,0 +1,132 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+/* Scalar multiply-accumulate: Y += A * X. _args is genuinely unused. */
+void func_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	unsigned *A = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	unsigned *X = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
+	unsigned *Y = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[2]);
+
+	FPRINTF_MPI(stderr, "VALUES: Y=%3u A=%3u X=%3u\n", *Y, *A, *X);
+	*Y = *Y + *A * *X;
+}
+
+/* Three buffers: A (read), X (read), Y (accumulated in place). */
+struct starpu_codelet mycodelet =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_R, STARPU_RW}
+};
+
+#define N 4
+
+/* Distributed dot-product test: Y += sum(A[n]*X[n]) with A[n]/X[n] owned
+ * alternately by ranks 0 and 1 and Y owned by rank 0. Tasks are placed on
+ * the owner of A[n] via STARPU_EXECUTE_ON_DATA.
+ * Expected final value on rank 0: 10*1 + 20*2 + 30*3 + 40*4 = 300. */
+int main(int argc, char **argv)
+{
+	int rank, n;
+	int ret;
+	unsigned A[N];
+	unsigned X[N];
+	unsigned Y;
+	starpu_data_handle_t data_A[N];
+	starpu_data_handle_t data_X[N];
+	starpu_data_handle_t data_Y;
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	/* Third argument 1: let StarPU call MPI_Init/MPI_Finalize itself. */
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+	for(n = 0; n < N; n++)
+	{
+		A[n] = (n+1)*10;
+		X[n] = n+1;
+	}
+	Y = 0;
+
+	FPRINTF_MPI(stderr, "A = ");
+	for(n = 0; n < N; n++)
+	{
+		FPRINTF(stderr, "%u ", A[n]);
+	}
+	FPRINTF(stderr, "\n");
+	FPRINTF_MPI(stderr, "X = ");
+	for(n = 0; n < N; n++)
+	{
+		FPRINTF(stderr, "%u ", X[n]);
+	}
+	FPRINTF(stderr, "\n");
+
+	/* Register each A[n]/X[n] with a real buffer only on its owner
+	 * (rank n%2); other ranks register a placeholder (home node -1). */
+	for(n = 0; n < N; n++)
+	{
+		if (rank == n%2)
+			starpu_variable_data_register(&data_A[n], 0, (uintptr_t)&A[n], sizeof(unsigned));
+		else
+			starpu_variable_data_register(&data_A[n], -1, (uintptr_t)NULL, sizeof(unsigned));
+		starpu_mpi_data_register_comm(data_A[n], n+100, n%2, MPI_COMM_WORLD);
+		FPRINTF_MPI(stderr, "Registering A[%d] to %p with tag %d and node %d\n", n, data_A[n], n+100, n%2);
+
+		if (rank == n%2)
+			starpu_variable_data_register(&data_X[n], 0, (uintptr_t)&X[n], sizeof(unsigned));
+		else
+			starpu_variable_data_register(&data_X[n], -1, (uintptr_t)NULL, sizeof(unsigned));
+		starpu_mpi_data_register_comm(data_X[n], n+200, n%2, MPI_COMM_WORLD);
+		FPRINTF_MPI(stderr, "Registering X[%d] to %p with tag %d and node %d\n", n, data_X[n], n+200, n%2);
+	}
+	/* The accumulator Y lives on rank 0. */
+	if (rank == 0)
+		starpu_variable_data_register(&data_Y, 0, (uintptr_t)&Y, sizeof(unsigned));
+	else
+		starpu_variable_data_register(&data_Y, -1, (uintptr_t)NULL, sizeof(unsigned));
+	starpu_mpi_data_register_comm(data_Y, 10, 0, MPI_COMM_WORLD);
+	FPRINTF_MPI(stderr, "Registering Y to %p with tag %d and node %d\n", data_Y, 10, 0);
+
+	for(n = 0; n < N; n++)
+	{
+		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
+					     STARPU_R, data_A[n],
+					     STARPU_R, data_X[n],
+					     STARPU_RW, data_Y,
+					     STARPU_EXECUTE_ON_DATA, data_A[n],
+					     0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+	}
+
+	FPRINTF(stderr, "Waiting ...\n");
+	starpu_task_wait_for_all();
+
+	/* Unregistering writes the final values back to the owner's buffer. */
+	for(n = 0; n < N; n++)
+	{
+		starpu_data_unregister(data_A[n]);
+		starpu_data_unregister(data_X[n]);
+	}
+	starpu_data_unregister(data_Y);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	FPRINTF(stdout, "[%d] Y=%u\n", rank, Y);
+
+	if (rank == 0)
+	{
+		STARPU_ASSERT_MSG(Y==300, "Error when calculating Y=%u\n", Y);
+	}
+
+	return 0;
+}

+ 140 - 0
nmad/tests/matrix2.c

@@ -0,0 +1,140 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+/* Scalar multiply-accumulate: Y += A * X. _args is genuinely unused. */
+void func_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	unsigned *A = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	unsigned *X = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
+	unsigned *Y = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[2]);
+
+	FPRINTF_MPI(stderr, "VALUES: Y=%3u A=%3u X=%3u\n", *Y, *A, *X);
+	*Y = *Y + *A * *X;
+}
+
+/* Three buffers: A (read), X (read), Y (accumulated in place). */
+struct starpu_codelet mycodelet =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_R, STARPU_RW}
+};
+
+#define N 4
+
+/* Three-rank variant: A[n] is owned alternately by ranks 0/1, all X[n]
+ * by rank 2; tasks accumulate into X[N-1] on the owner of A[n].
+ * Expected on rank 2: X[3] = 4 + 10*1 + 20*2 + 30*3 = 144. */
+int main(int argc, char **argv)
+{
+	int rank, size;
+	int n;
+	int ret;
+	unsigned A[N];
+	unsigned X[N];
+	starpu_data_handle_t data_A[N];
+	starpu_data_handle_t data_X[N];
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size < 3)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need at least 3 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	for(n = 0; n < N; n++)
+	{
+		A[n] = (n+1)*10;
+		X[n] = n+1;
+	}
+
+	FPRINTF_MPI(stderr, "A = ");
+	for(n = 0; n < N; n++)
+	{
+		FPRINTF(stderr, "%u ", A[n]);
+	}
+	FPRINTF(stderr, "\n");
+	FPRINTF_MPI(stderr, "X = ");
+	for(n = 0; n < N; n++)
+	{
+		FPRINTF(stderr, "%u ", X[n]);
+	}
+	FPRINTF(stderr, "\n");
+
+	/* A[n] owned by rank n%2; non-owners register a placeholder. */
+	for(n = 0; n < N; n++)
+	{
+		if (rank == n%2)
+			starpu_variable_data_register(&data_A[n], 0, (uintptr_t)&A[n], sizeof(unsigned));
+		else
+			starpu_variable_data_register(&data_A[n], -1, (uintptr_t)NULL, sizeof(unsigned));
+		starpu_mpi_data_register(data_A[n], n+100, n%2);
+		FPRINTF_MPI(stderr, "Registering A[%d] to %p with tag %d and node %d\n", n,data_A[n], n+100, n%2);
+	}
+
+	/* All X[n] owned by rank 2. */
+	for(n = 0; n < N; n++)
+	{
+		if (rank == 2)
+			starpu_variable_data_register(&data_X[n], 0, (uintptr_t)&X[n], sizeof(unsigned));
+		else
+			starpu_variable_data_register(&data_X[n], -1, (uintptr_t)NULL, sizeof(unsigned));
+		starpu_mpi_data_register(data_X[n], n+200, 2);
+		FPRINTF_MPI(stderr, "Registering X[%d] to %p with tag %d and node %d\n", n, data_X[n], n+200, 2);
+	}
+
+	for(n = 0; n < N-1; n++)
+	{
+		fprintf(stderr, "loop %d\n", n);
+		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
+					     STARPU_R, data_A[n],
+					     STARPU_R, data_X[n],
+					     STARPU_RW, data_X[N-1],
+					     STARPU_EXECUTE_ON_DATA, data_A[n],
+					     0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+	}
+
+	FPRINTF(stderr, "Waiting ...\n");
+	starpu_task_wait_for_all();
+
+	/* Unregistering writes the final values back to the owner's buffer. */
+	for(n = 0; n < N; n++)
+	{
+		starpu_data_unregister(data_A[n]);
+		starpu_data_unregister(data_X[n]);
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	FPRINTF(stdout, "[%d] X[%d]=%u\n", rank, N-1, X[N-1]);
+
+	if (rank == 2)
+	{
+		STARPU_ASSERT_MSG(X[N-1]==144, "Error when calculating X[N-1]=%u\n", X[N-1]);
+	}
+
+	MPI_Finalize();
+	return 0;
+}

+ 86 - 0
nmad/tests/mpi_detached_tag.c

@@ -0,0 +1,86 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	16
+#else
+#  define NITER	2048
+#endif
+#define SIZE	16
+
+float *tab;
+starpu_data_handle_t tab_handle;
+
+/* Ping-pong NITER blocks of SIZE floats between rank pairs (2k, 2k+1),
+ * using detached send/recv operations that unlock a StarPU tag on
+ * completion; the main thread synchronizes with starpu_tag_wait(). */
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size%2 != 0)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need a even number of processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	tab = malloc(SIZE*sizeof(float));
+
+	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
+
+	int nloops = NITER;
+	int loop;
+	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		/* Both the MPI tag and the StarPU tag derive from the loop counter. */
+		starpu_tag_t tag = (starpu_tag_t)loop;
+
+		/* Roles alternate each iteration: sender on one parity, receiver on the other. */
+		if ((loop % 2) == (rank%2))
+		{
+			starpu_mpi_isend_detached_unlock_tag(tab_handle, other_rank, loop, MPI_COMM_WORLD, tag);
+		}
+		else
+		{
+			starpu_mpi_irecv_detached_unlock_tag(tab_handle, other_rank, loop, MPI_COMM_WORLD, tag);
+		}
+
+		/* Block until the detached operation has unlocked the tag. */
+		starpu_tag_wait(tag);
+	}
+
+	starpu_data_unregister(tab_handle);
+	free(tab);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 85 - 0
nmad/tests/mpi_irecv.c

@@ -0,0 +1,85 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	16
+#else
+#  define NITER	2048
+#endif
+#define SIZE	16
+
+float *tab;
+starpu_data_handle_t tab_handle;
+
+/* Ping-pong NITER blocks of SIZE floats between rank pairs (2k, 2k+1).
+ * Sends are blocking; receptions exercise the non-blocking path
+ * (starpu_mpi_irecv followed by starpu_mpi_wait). */
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size%2 != 0)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need a even number of processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	tab = malloc(SIZE*sizeof(float));
+
+	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
+
+	int nloops = NITER;
+	int loop;
+	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		/* Roles alternate each iteration between the two ranks of a pair. */
+		if ((loop % 2) == (rank%2))
+		{
+			starpu_mpi_send(tab_handle, other_rank, loop, MPI_COMM_WORLD);
+		}
+		else
+		{
+			MPI_Status status;
+			starpu_mpi_req req;
+			starpu_mpi_irecv(tab_handle, &req, other_rank, loop, MPI_COMM_WORLD);
+			starpu_mpi_wait(&req, &status);
+		}
+	}
+
+	starpu_data_unregister(tab_handle);
+	free(tab);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 103 - 0
nmad/tests/mpi_irecv_detached.c

@@ -0,0 +1,103 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2012, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <common/thread.h>
+#include "helper.h"
+
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	16
+#else
+#  define NITER	2048
+#endif
+#define SIZE	16
+
+float *tab;
+starpu_data_handle_t tab_handle;
+
+static starpu_pthread_mutex_t mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
+static starpu_pthread_cond_t cond = STARPU_PTHREAD_COND_INITIALIZER;
+
+/* Detached-reception callback: flag completion and wake the main thread.
+ * arg points to the caller's completion flag (the original declaration
+ * wrongly marked it STARPU_ATTRIBUTE_UNUSED while dereferencing it). */
+void callback(void *arg)
+{
+	unsigned *received = arg;
+
+	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	*received = 1;
+	STARPU_PTHREAD_COND_SIGNAL(&cond);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+}
+
+
+/* Ping-pong NITER blocks of SIZE floats between rank pairs (2k, 2k+1).
+ * Sends are blocking; receptions are detached, with completion signalled
+ * to the main thread through a mutex/condvar pair. */
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size%2 != 0)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need a even number of processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	tab = malloc(SIZE*sizeof(float));
+
+	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
+
+	int nloops = NITER;
+	int loop;
+	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		if ((loop % 2) == (rank%2))
+		{
+			starpu_mpi_send(tab_handle, other_rank, loop, MPI_COMM_WORLD);
+		}
+		else
+		{
+			int received = 0;
+			starpu_mpi_irecv_detached(tab_handle, other_rank, loop, MPI_COMM_WORLD, callback, &received);
+
+			/* Wait for the detached reception's callback to fire. */
+			STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+			while (!received)
+				STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
+			STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+		}
+	}
+
+	starpu_data_unregister(tab_handle);
+	free(tab);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 86 - 0
nmad/tests/mpi_isend.c

@@ -0,0 +1,86 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	16
+#else
+#  define NITER	2048
+#endif
+#define SIZE	16
+
+float *tab;
+starpu_data_handle_t tab_handle;
+
+/* Ping-pong NITER blocks of SIZE floats between rank pairs (2k, 2k+1).
+ * Sends exercise the non-blocking path (starpu_mpi_isend followed by
+ * starpu_mpi_wait); receptions are blocking. */
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size%2 != 0)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need a even number of processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	tab = malloc(SIZE*sizeof(float));
+
+	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
+
+	int nloops = NITER;
+	int loop;
+	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		/* Roles alternate each iteration between the two ranks of a pair. */
+		if ((loop % 2) == (rank%2))
+		{
+			MPI_Status status;
+			starpu_mpi_req req;
+			starpu_mpi_isend(tab_handle, &req, other_rank, loop, MPI_COMM_WORLD);
+			starpu_mpi_wait(&req, &status);
+		}
+		else
+		{
+			MPI_Status status;
+			starpu_mpi_recv(tab_handle, other_rank, loop, MPI_COMM_WORLD, &status);
+		}
+	}
+
+	starpu_data_unregister(tab_handle);
+	free(tab);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 108 - 0
nmad/tests/mpi_isend_detached.c

@@ -0,0 +1,108 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2012, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <common/thread.h>
+
+#include "helper.h"
+
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	16
+#else
+#  define NITER	2048
+#endif
+#define SIZE	16
+
+static starpu_pthread_mutex_t mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
+static starpu_pthread_cond_t cond = STARPU_PTHREAD_COND_INITIALIZER;
+
+void callback(void *arg)
+{
+	unsigned *completed = arg;
+
+	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	*completed = 1;
+	STARPU_PTHREAD_COND_SIGNAL(&cond);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+}
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+	float *tab;
+	starpu_data_handle_t tab_handle;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size%2 != 0)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need a even number of processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	tab = malloc(SIZE*sizeof(float));
+
+	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
+
+	int nloops = NITER;
+	int loop;
+	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		if ((loop % 2) == (rank%2))
+		{
+			int sent = 0;
+			starpu_mpi_isend_detached(tab_handle, other_rank, loop, MPI_COMM_WORLD, callback, &sent);
+
+			STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+			while (!sent)
+				STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
+			STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+		}
+		else
+		{
+			int received = 0;
+			starpu_mpi_irecv_detached(tab_handle, other_rank, loop, MPI_COMM_WORLD, callback, &received);
+
+			STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+			while (!received)
+				STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
+			STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+		}
+	}
+
+	starpu_data_unregister(tab_handle);
+	free(tab);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 173 - 0
nmad/tests/mpi_reduction.c

@@ -0,0 +1,173 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2013  Université de Bordeaux
+ * Copyright (C) 2012, 2013, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+
+extern void init_cpu_func(void *descr[], void *cl_arg);
+extern void redux_cpu_func(void *descr[], void *cl_arg);
+extern void dot_cpu_func(void *descr[], void *cl_arg);
+extern void display_cpu_func(void *descr[], void *cl_arg);
+
+/* Produces the neutral element (0) for the reduction. */
+static struct starpu_codelet init_codelet =
+{
+	.cpu_funcs = {init_cpu_func},
+	.nbuffers = 1,
+	.modes = {STARPU_W},
+	.name = "init_codelet"
+};
+
+/* Merges two partial accumulators: arg0 += arg1. */
+static struct starpu_codelet redux_codelet =
+{
+	.cpu_funcs = {redux_cpu_func},
+	.modes = {STARPU_RW, STARPU_R},
+	.nbuffers = 2,
+	.name = "redux_codelet"
+};
+
+/* Sums a vector chunk into the STARPU_REDUX accumulator. */
+static struct starpu_codelet dot_codelet =
+{
+	.cpu_funcs = {dot_cpu_func},
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_REDUX},
+	.name = "dot_codelet"
+};
+
+/* Prints the current accumulator value. */
+static struct starpu_codelet display_codelet =
+{
+	.cpu_funcs = {display_cpu_func},
+	.nbuffers = 1,
+	.modes = {STARPU_R},
+	.name = "display_codelet"
+};
+
+/* Returns the MPI rank owning block index x (round-robin distribution). */
+int my_distrib(int x, int nb_nodes)
+{
+	return x % nb_nodes;
+}
+
+/* Distributed reduction test: each rank owns every size-th chunk of a
+ * vector; dot_codelet tasks accumulate chunk sums into dot_handle via
+ * STARPU_REDUX, and starpu_mpi_redux_data() merges the per-node partial
+ * results back onto rank 0 after each of the `loops` rounds. */
+int main(int argc, char **argv)
+{
+	int my_rank, size, x, y, i;
+	long int *vector;
+	long int dot, sum=0;
+	starpu_data_handle_t *handles;
+	starpu_data_handle_t dot_handle;
+
+	int nb_elements, step, loops;
+
+	int ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	nb_elements = size*8000;
+	step = 4;
+	loops = 5;
+
+	/* Only the owning rank fills its chunks; vector[k] = k+1. */
+	vector = (long int *) malloc(nb_elements*sizeof(vector[0]));
+	for(x = 0; x < nb_elements; x+=step)
+	{
+		int mpi_rank = my_distrib(x/step, size);
+		if (mpi_rank == my_rank)
+		{
+			for(y=0 ; y<step ; y++)
+			{
+				vector[x+y] = x+y+1;
+			}
+		}
+	}
+	if (my_rank == 0)
+	{
+		/* Seed the accumulator with 14; the expected result is
+		 * loops * sum(1..nb_elements) plus that seed. */
+		dot = 14;
+		sum = (nb_elements * (nb_elements + 1)) / 2;
+		sum *= loops;
+		sum += dot;
+		starpu_variable_data_register(&dot_handle, 0, (uintptr_t)&dot, sizeof(dot));
+	}
+	else
+	{
+		starpu_variable_data_register(&dot_handle, -1, (uintptr_t)NULL, sizeof(dot));
+	}
+
+
+	/* One handle per chunk of `step` elements; real buffer on the owner,
+	 * placeholder (home node -1) elsewhere. */
+	handles = (starpu_data_handle_t *) malloc(nb_elements*sizeof(handles[0]));
+	for(x = 0; x < nb_elements; x+=step)
+	{
+		int mpi_rank = my_distrib(x/step, size);
+		if (mpi_rank == my_rank)
+		{
+			/* Owning data */
+			starpu_vector_data_register(&handles[x], 0, (uintptr_t)&(vector[x]), step, sizeof(vector[0]));
+		}
+		else
+		{
+			starpu_vector_data_register(&handles[x], -1, (uintptr_t)NULL, step, sizeof(vector[0]));
+		}
+		if (handles[x])
+		{
+			starpu_mpi_data_register(handles[x], x, mpi_rank);
+		}
+	}
+
+	starpu_mpi_data_register(dot_handle, nb_elements+1, 0);
+	/* Declare how STARPU_REDUX contributions are initialized and merged. */
+	starpu_data_set_reduction_methods(dot_handle, &redux_codelet, &init_codelet);
+
+	for (i = 0; i < loops; i++)
+	{
+		for (x = 0; x < nb_elements; x+=step)
+		{
+			starpu_mpi_insert_task(MPI_COMM_WORLD,
+					       &dot_codelet,
+					       STARPU_R, handles[x],
+					       STARPU_REDUX, dot_handle,
+					       0);
+		}
+		/* Merge every node's partial accumulator onto the owner (rank 0). */
+		starpu_mpi_redux_data(MPI_COMM_WORLD, dot_handle);
+		starpu_mpi_insert_task(MPI_COMM_WORLD, &display_codelet, STARPU_R, dot_handle, 0);
+	}
+
+	fprintf(stderr, "Waiting ...\n");
+	starpu_task_wait_for_all();
+
+	for(x = 0; x < nb_elements; x+=step)
+	{
+		if (handles[x]) starpu_data_unregister(handles[x]);
+	}
+	if (dot_handle)
+	{
+		starpu_data_unregister(dot_handle);
+	}
+	free(vector);
+	free(handles);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	if (my_rank == 0)
+	{
+		fprintf(stderr, "[%d] sum=%ld\n", my_rank, sum);
+		fprintf(stderr, "[%d] dot=%ld\n", my_rank, dot);
+		fprintf(stderr, "%s when computing reduction\n", (sum == dot) ? "Success" : "Error");
+	}
+
+	return 0;
+}
+

+ 76 - 0
nmad/tests/mpi_reduction_kernels.c

@@ -0,0 +1,76 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <mpi.h>
+
+#define _DISPLAY(fmt, ...) do { \
+		int _display_rank; MPI_Comm_rank(MPI_COMM_WORLD, &_display_rank);	\
+		fprintf(stderr, "[%d][%s] " fmt , _display_rank, __starpu_func__ ,## __VA_ARGS__); 	\
+		fflush(stderr); } while(0)
+
+/*
+ *	Codelet to create a neutral element (0 for a sum reduction)
+ */
+void init_cpu_func(void *descr[], void *cl_arg)
+{
+	long int *dot = (long int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	*dot = 0;
+	_DISPLAY("Init dot\n");
+}
+
+/*
+ *	Codelet to perform the reduction of two elements: dota += dotb
+ */
+void redux_cpu_func(void *descr[], void *cl_arg)
+{
+	long int *dota = (long int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	long int *dotb = (long int *)STARPU_VARIABLE_GET_PTR(descr[1]);
+
+	*dota = *dota + *dotb;
+	_DISPLAY("Calling redux %ld=%ld+%ld\n", *dota, *dota-*dotb, *dotb);
+}
+
+/*
+ *	Dot product codelet: sums the vector chunk into the accumulator
+ */
+void dot_cpu_func(void *descr[], void *cl_arg)
+{
+	long int *local_x = (long int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
+
+	long int *dot = (long int *)STARPU_VARIABLE_GET_PTR(descr[1]);
+
+	unsigned i;
+	for (i = 0; i < n; i++)
+	{
+		*dot += local_x[i];
+	}
+}
+
+/*
+ *	Display codelet: print the accumulator. The handle is registered
+ *	with the *variable* interface, so it must be read through
+ *	STARPU_VARIABLE_GET_PTR (the original used the vector accessor,
+ *	which only worked because both interfaces start with the ptr field).
+ */
+void display_cpu_func(void *descr[], void *cl_arg)
+{
+	long int *dot = (long int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+
+	_DISPLAY("Local=%ld\n", *dot);
+}
+
+

+ 94 - 0
nmad/tests/mpi_redux.c

@@ -0,0 +1,94 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+static starpu_pthread_mutex_t mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
+static starpu_pthread_cond_t cond = STARPU_PTHREAD_COND_INITIALIZER;
+
+/* Reception callback: count one more completed irecv and wake main().
+ * arg points to the shared counter of completed receptions. */
+void callback(void *arg)
+{
+	unsigned *received = arg;
+
+	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	*received = *received + 1;
+	/* %u: *received is unsigned (the original used %d, a format mismatch) */
+	fprintf(stderr, "received = %u\n", *received);
+	STARPU_PTHREAD_COND_SIGNAL(&cond);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+}
+
+/* Rank 0 gathers one integer from every other rank through detached
+ * receptions and checks their sum against (size-1)*size/2; every other
+ * rank sends its own rank number. */
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+	int value=0;
+	starpu_data_handle_t *handles;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	if (rank == 0)
+	{
+		int src, sum;
+		/* Starts at 1 to count rank 0 itself; each callback adds one,
+		 * so the wait below ends once all size-1 receptions landed. */
+		int received = 1;
+
+		handles = malloc(size * sizeof(starpu_data_handle_t));
+
+		for(src=1 ; src<size ; src++)
+		{
+			starpu_variable_data_register(&handles[src], -1, (uintptr_t)NULL, sizeof(int));
+			starpu_mpi_irecv_detached(handles[src], src, 12, MPI_COMM_WORLD, callback, &received);
+		}
+
+		STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+		while (received != size)
+			STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+
+		for(src=1 ; src<size ; src++)
+		{
+			void *ptr = starpu_data_get_local_ptr(handles[src]);
+			value += *((int *)ptr);
+			starpu_data_unregister(handles[src]);
+		}
+		sum = ((size-1) * (size) / 2);
+		STARPU_ASSERT_MSG(sum == value, "Sum of first %d integers is %d, not %d\n", size-1, sum, value);
+	}
+	else
+	{
+		value = rank;
+		handles = malloc(sizeof(starpu_data_handle_t));
+		starpu_variable_data_register(&handles[0], 0, (uintptr_t)&value, sizeof(int));
+		starpu_mpi_send(handles[0], 0, 12, MPI_COMM_WORLD);
+		/* Asynchronous unregister; completion is covered by the wait below. */
+		starpu_data_unregister_submit(handles[0]);
+	}
+
+	starpu_task_wait_for_all();
+	free(handles);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 191 - 0
nmad/tests/mpi_scatter_gather.c

@@ -0,0 +1,191 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+/* Returns the MPI node number where the data with index x is stored */
+int my_distrib(int x, int nb_nodes)
+{
+	return x % nb_nodes;
+}
+
+void cpu_codelet(void *descr[], void *_args)
+{
+	int *vector = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
+	unsigned i;
+	int rank;
+
+	starpu_codelet_unpack_args(_args, &rank);
+	for (i = 0; i < nx; i++)
+	{
+		//fprintf(stderr,"rank %d v[%d] = %d\n", rank, i, vector[i]);
+		vector[i] *= rank+2;
+	}
+}
+
+static struct starpu_codelet cl =
+{
+	.cpu_funcs = {cpu_codelet},
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
+};
+
+void scallback(void *arg STARPU_ATTRIBUTE_UNUSED)
+{
+	char *msg = arg;
+	FPRINTF_MPI(stderr, "Sending completed for <%s>\n", msg);
+}
+
+void rcallback(void *arg STARPU_ATTRIBUTE_UNUSED)
+{
+	char *msg = arg;
+	FPRINTF_MPI(stderr, "Reception completed for <%s>\n", msg);
+}
+
+int main(int argc, char **argv)
+{
+	int rank, nodes, ret, x;
+	int *vector = NULL;
+	starpu_data_handle_t *data_handles;
+	int size=10;
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
+
+	if (rank == 0)
+	{
+		/* Allocate the vector */
+		vector = malloc(size * sizeof(int));
+		for(x=0 ; x<size ; x++)
+		{
+			vector[x] = x+10;
+		}
+
+		// Print vector
+		FPRINTF_MPI(stderr, " Input vector: ");
+		for(x=0 ; x<size ; x++)
+		{
+			FPRINTF(stderr, "%d\t", vector[x]);
+		}
+		FPRINTF(stderr,"\n");
+	}
+
+	/* Allocate data handles and register data to StarPU */
+	data_handles = (starpu_data_handle_t *) calloc(size, sizeof(starpu_data_handle_t));
+	for(x = 0; x < size ; x++)
+	{
+		int mpi_rank = my_distrib(x, nodes);
+		if (rank == 0)
+		{
+			starpu_vector_data_register(&data_handles[x], 0, (uintptr_t)&vector[x], 1, sizeof(int));
+		}
+		else if ((mpi_rank == rank))
+		{
+			/* I do not own that index but I will need it for my computations */
+			starpu_vector_data_register(&data_handles[x], -1, (uintptr_t)NULL, 1, sizeof(int));
+		}
+		else
+		{
+			/* I know it's useless to allocate anything for this */
+			data_handles[x] = NULL;
+		}
+		if (data_handles[x])
+		{
+			starpu_mpi_data_register(data_handles[x], x, 0);
+		}
+	}
+
+	/* Scatter the vector among the nodes */
+	for(x = 0; x < size ; x++)
+	{
+		if (data_handles[x])
+		{
+			int mpi_rank = my_distrib(x, nodes);
+			starpu_mpi_data_set_rank(data_handles[x], mpi_rank);
+		}
+	}
+	starpu_mpi_scatter_detached(data_handles, size, 0, MPI_COMM_WORLD, scallback, "scatter", NULL, NULL);
+
+	/* Calculation */
+	for(x = 0; x < size ; x++)
+	{
+		if (data_handles[x])
+		{
+			int owner = starpu_mpi_data_get_rank(data_handles[x]);
+			if (owner == rank)
+			{
+				FPRINTF_MPI(stderr,"Computing on data[%d]\n", x);
+				starpu_insert_task(&cl,
+						   STARPU_VALUE, &rank, sizeof(rank),
+						   STARPU_RW, data_handles[x],
+						   0);
+			}
+		}
+	}
+
+	/* Gather the vector on the main node */
+	starpu_mpi_gather_detached(data_handles, size, 0, MPI_COMM_WORLD, scallback, "gather", rcallback, "gather");
+	for(x = 0; x < size ; x++)
+	{
+		if (data_handles[x])
+		{
+			starpu_mpi_data_set_rank(data_handles[x], 0);
+		}
+	}
+
+	/* Unregister the vector from StarPU */
+	for(x=0 ; x<size ; x++)
+	{
+		if (data_handles[x])
+		{
+			starpu_data_unregister(data_handles[x]);
+		}
+	}
+
+	// Print vector
+	if (rank == 0)
+	{
+		FPRINTF_MPI(stderr, "Output vector: ");
+		for(x=0 ; x<size ; x++)
+		{
+			FPRINTF(stderr, "%d\t", vector[x]);
+		}
+		FPRINTF(stderr,"\n");
+		for(x=0 ; x<size ; x++)
+		{
+			int mpi_rank = my_distrib(x, nodes);
+			if (vector[x] != (x+10) * (mpi_rank+2))
+			{
+				FPRINTF_MPI(stderr, "Incorrect value for vector[%d]. computed %d != expected %d\n", x, vector[x], (x+10) * (mpi_rank+2));
+				ret = 1;
+			}
+		}
+		free(vector);
+	}
+
+	// Free memory
+	free(data_handles);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+	return (rank == 0) ? ret : 0;
+}

+ 93 - 0
nmad/tests/mpi_test.c

@@ -0,0 +1,93 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	16
+#else
+#  define NITER	2048
+#endif
+
+#define SIZE	16
+
+float *tab;
+starpu_data_handle_t tab_handle;
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size%2 != 0)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need a even number of processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	tab = malloc(SIZE*sizeof(float));
+
+	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
+
+	int nloops = NITER;
+	int loop;
+	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		starpu_mpi_req req;
+
+		if ((loop % 2) == (rank%2))
+		{
+			starpu_mpi_isend(tab_handle, &req, other_rank, loop, MPI_COMM_WORLD);
+		}
+		else
+		{
+			starpu_mpi_irecv(tab_handle, &req, other_rank, loop, MPI_COMM_WORLD);
+		}
+
+		int finished = 0;
+		do
+		{
+			MPI_Status status;
+			starpu_mpi_test(&req, &finished, &status);
+		}
+		while (!finished);
+	}
+
+	starpu_data_unregister(tab_handle);
+	free(tab);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 97 - 0
nmad/tests/multiple_send.c

@@ -0,0 +1,97 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+	unsigned send[2] = {42, 11};
+	unsigned recv[2] = {33, 33};
+	starpu_mpi_req req[2];
+	starpu_data_handle_t send_handle[2];
+	starpu_data_handle_t recv_handle[2];
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size < 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need at least 2 processes.\n");
+
+		starpu_mpi_shutdown();
+		starpu_shutdown();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	starpu_variable_data_register(&send_handle[0], 0, (uintptr_t)&send[0], sizeof(unsigned));
+	starpu_variable_data_register(&send_handle[1], 0, (uintptr_t)&send[1], sizeof(unsigned));
+	starpu_variable_data_register(&recv_handle[0], 0, (uintptr_t)&recv[0], sizeof(unsigned));
+	starpu_variable_data_register(&recv_handle[1], 0, (uintptr_t)&recv[1], sizeof(unsigned));
+
+	if (rank == 0)
+	{
+		starpu_mpi_isend(send_handle[0], &(req[0]), 1, 12, MPI_COMM_WORLD);
+		starpu_mpi_isend(send_handle[1], &(req[1]), 1, 13, MPI_COMM_WORLD);
+	}
+	else if (rank == 1)
+	{
+		starpu_mpi_irecv(recv_handle[0], &(req[0]), 0, 12, MPI_COMM_WORLD);
+		starpu_mpi_irecv(recv_handle[1], &(req[1]), 0, 13, MPI_COMM_WORLD);
+	}
+
+	if (rank == 0 || rank == 1)
+	{
+		int nb_req=2;
+		while (nb_req)
+		{
+			int r=0;
+			for(r=0 ; r<2 ; r++)
+			{
+				if (req[r])
+				{
+					int finished = 0;
+					MPI_Status status;
+					starpu_mpi_test(&req[r], &finished, &status);
+					STARPU_ASSERT(finished != -1);
+					if (finished)
+					{
+						FPRINTF(stderr, "[%d] Request %d finished\n", rank, r);
+						req[r] = NULL;
+						nb_req--;
+					}
+				}
+			}
+		}
+	}
+	FPRINTF(stderr, "[%d] All requests finished\n", rank);
+
+	starpu_data_unregister(send_handle[0]);
+	starpu_data_unregister(send_handle[1]);
+	starpu_data_unregister(recv_handle[0]);
+	starpu_data_unregister(recv_handle[1]);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	return 0;
+}

+ 85 - 0
nmad/tests/pingpong.c

@@ -0,0 +1,85 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	16
+#else
+#  define NITER	2048
+#endif
+
+#define SIZE	16
+
+float *tab;
+starpu_data_handle_t tab_handle;
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size%2 != 0)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need a even number of processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	tab = malloc(SIZE*sizeof(float));
+
+	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
+
+	int nloops = NITER;
+	int loop;
+	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		if ((loop % 2) == (rank%2))
+		{
+			//FPRINTF_MPI(stderr, "Sending to %d\n", other_rank);
+			starpu_mpi_send(tab_handle, other_rank, loop, MPI_COMM_WORLD);
+		}
+		else
+		{
+			MPI_Status status;
+			//FPRINTF_MPI(stderr, "Receiving from %d\n", other_rank);
+			starpu_mpi_recv(tab_handle, other_rank, loop, MPI_COMM_WORLD, &status);
+		}
+	}
+
+	starpu_data_unregister(tab_handle);
+	free(tab);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+	MPI_Finalize();
+
+	return 0;
+}

+ 133 - 0
nmad/tests/ring.c

@@ -0,0 +1,133 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	32
+#else
+#  define NITER	2048
+#endif
+
+int token = 42;
+starpu_data_handle_t token_handle;
+
+#ifdef STARPU_USE_CUDA
+extern void increment_cuda(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args);
+#endif
+
+void increment_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	int *tokenptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	(*tokenptr)++;
+}
+
+static struct starpu_codelet increment_cl =
+{
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {increment_cuda},
+#endif
+	.cpu_funcs = {increment_cpu},
+	.nbuffers = 1,
+	.modes = {STARPU_RW}
+};
+
+void increment_token(void)
+{
+	struct starpu_task *task = starpu_task_create();
+
+	task->cl = &increment_cl;
+	task->handles[0] = token_handle;
+	task->synchronous = 1;
+
+	int ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+}
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size < 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need at least 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(token));
+
+	int nloops = NITER;
+	int loop;
+
+	int last_loop = nloops - 1;
+	int last_rank = size - 1;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		int tag = loop*size + rank;
+
+		if (loop == 0 && rank == 0)
+		{
+			token = 0;
+			FPRINTF(stdout, "Start with token value %u\n", token);
+		}
+		else
+		{
+			MPI_Status status;
+			starpu_mpi_recv(token_handle, (rank+size-1)%size, tag, MPI_COMM_WORLD, &status);
+		}
+
+		increment_token();
+
+		if (loop == last_loop && rank == last_rank)
+		{
+			starpu_data_acquire(token_handle, STARPU_R);
+			FPRINTF(stdout, "Finished : token value %u\n", token);
+			starpu_data_release(token_handle);
+		}
+		else
+		{
+			starpu_mpi_send(token_handle, (rank+1)%size, tag+1, MPI_COMM_WORLD);
+		}
+	}
+
+	starpu_data_unregister(token_handle);
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	if (rank == last_rank)
+	{
+		STARPU_ASSERT(token == nloops*size);
+	}
+
+	return 0;
+}

+ 137 - 0
nmad/tests/ring_async.c

@@ -0,0 +1,137 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	32
+#else
+#  define NITER	2048
+#endif
+
+int token = 42;
+starpu_data_handle_t token_handle;
+
+#ifdef STARPU_USE_CUDA
+extern void increment_cuda(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args);
+#endif
+
+void increment_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	int *tokenptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	(*tokenptr)++;
+}
+
+static struct starpu_codelet increment_cl =
+{
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {increment_cuda},
+#endif
+	.cpu_funcs = {increment_cpu},
+	.nbuffers = 1,
+	.modes = {STARPU_RW}
+};
+
+void increment_token(void)
+{
+	struct starpu_task *task = starpu_task_create();
+
+	task->cl = &increment_cl;
+	task->handles[0] = token_handle;
+	task->synchronous = 1;
+
+	int ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+}
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size < 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need at least 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(token));
+
+	int nloops = NITER;
+	int loop;
+
+	int last_loop = nloops - 1;
+	int last_rank = size - 1;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		int tag = loop*size + rank;
+
+		if (loop == 0 && rank == 0)
+		{
+			token = 0;
+			FPRINTF(stdout, "Start with token value %u\n", token);
+		}
+		else
+		{
+			MPI_Status status;
+			starpu_mpi_req req;
+			starpu_mpi_irecv(token_handle, &req, (rank+size-1)%size, tag, MPI_COMM_WORLD);
+			starpu_mpi_wait(&req, &status);
+		}
+
+		increment_token();
+
+		if (loop == last_loop && rank == last_rank)
+		{
+			starpu_data_acquire(token_handle, STARPU_R);
+			FPRINTF(stdout, "Finished : token value %u\n", token);
+			starpu_data_release(token_handle);
+		}
+		else {
+			starpu_mpi_req req;
+			MPI_Status status;
+			starpu_mpi_isend(token_handle, &req, (rank+1)%size, tag+1, MPI_COMM_WORLD);
+			starpu_mpi_wait(&req, &status);
+		}
+	}
+
+	starpu_data_unregister(token_handle);
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	if (rank == last_rank)
+	{
+		STARPU_ASSERT(token == nloops*size);
+	}
+
+	return 0;
+}

+ 131 - 0
nmad/tests/ring_async_implicit.c

@@ -0,0 +1,131 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	32
+#else
+#  define NITER	2048
+#endif
+
+int token = 42;
+starpu_data_handle_t token_handle;
+
+#ifdef STARPU_USE_CUDA
+extern void increment_cuda(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args);
+#endif
+
+void increment_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	int *tokenptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	(*tokenptr)++;
+}
+
+static struct starpu_codelet increment_cl =
+{
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {increment_cuda},
+#endif
+	.cpu_funcs = {increment_cpu},
+	.nbuffers = 1,
+	.modes = {STARPU_RW}
+};
+
+void increment_token(void)
+{
+	struct starpu_task *task = starpu_task_create();
+
+	task->cl = &increment_cl;
+	task->handles[0] = token_handle;
+
+	int ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+}
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size < 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need at least 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+
+	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(token));
+
+	int nloops = NITER;
+	int loop;
+
+	int last_loop = nloops - 1;
+	int last_rank = size - 1;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		int tag = loop*size + rank;
+
+		if (loop == 0 && rank == 0)
+		{
+			token = 0;
+			FPRINTF(stdout, "Start with token value %u\n", token);
+		}
+		else
+		{
+			starpu_mpi_irecv_detached(token_handle, (rank+size-1)%size, tag, MPI_COMM_WORLD, NULL, NULL);
+		}
+
+		increment_token();
+
+		if (loop == last_loop && rank == last_rank)
+		{
+			starpu_data_acquire(token_handle, STARPU_R);
+			FPRINTF(stdout, "Finished : token value %u\n", token);
+			starpu_data_release(token_handle);
+		}
+		else
+		{
+			starpu_mpi_isend_detached(token_handle, (rank+1)%size, tag+1, MPI_COMM_WORLD, NULL, NULL);
+		}
+	}
+
+	starpu_task_wait_for_all();
+
+	starpu_data_unregister(token_handle);
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	if (rank == last_rank)
+	{
+		FPRINTF(stderr, "[%d] token = %u == %u * %d ?\n", rank, token, nloops, size);
+		STARPU_ASSERT(token == nloops*size);
+	}
+
+	return 0;
+}

+ 0 - 0
nmad/tests/ring_kernel.cu


Някои файлове не бяха показани, защото твърде много файлове са променени