Преглед на файлове

nmad/: Port starpu-mpi-1.1 into a new folder nmad and build it with --enable-nmad
Add functions from starpu-mpi 1.3 (task_insert, select_node) to match the interface.

Guillaume Beauchamp преди 8 години
родител
ревизия
c263463c3c
променени са 100 файла, в които са добавени 13119 реда и са изтрити 1 реда
  1. 4 0
      Makefile.am
  2. 161 1
      configure.ac
  3. 5 0
      examples/stencil/Makefile.am
  4. 1 0
      nmad/.gitignore
  5. 31 0
      nmad/Makefile.am
  6. 193 0
      nmad/examples/Makefile.am
  7. 102 0
      nmad/examples/complex/mpi_complex.c
  8. 72 0
      nmad/examples/matrix_decomposition/mpi_cholesky.c
  9. 260 0
      nmad/examples/matrix_decomposition/mpi_cholesky_codelets.c
  10. 30 0
      nmad/examples/matrix_decomposition/mpi_cholesky_codelets.h
  11. 64 0
      nmad/examples/matrix_decomposition/mpi_cholesky_distributed.c
  12. 247 0
      nmad/examples/matrix_decomposition/mpi_cholesky_kernels.c
  13. 33 0
      nmad/examples/matrix_decomposition/mpi_cholesky_kernels.h
  14. 40 0
      nmad/examples/matrix_decomposition/mpi_cholesky_models.c
  15. 27 0
      nmad/examples/matrix_decomposition/mpi_cholesky_models.h
  16. 110 0
      nmad/examples/matrix_decomposition/mpi_decomposition_matrix.c
  17. 30 0
      nmad/examples/matrix_decomposition/mpi_decomposition_matrix.h
  18. 100 0
      nmad/examples/matrix_decomposition/mpi_decomposition_params.c
  19. 34 0
      nmad/examples/matrix_decomposition/mpi_decomposition_params.h
  20. 42 0
      nmad/examples/mpi_lu/mpi_lu-double.h
  21. 42 0
      nmad/examples/mpi_lu/mpi_lu-float.h
  22. 19 0
      nmad/examples/mpi_lu/pdlu.c
  23. 19 0
      nmad/examples/mpi_lu/pdlu_kernels.c
  24. 581 0
      nmad/examples/mpi_lu/plu_example.c
  25. 19 0
      nmad/examples/mpi_lu/plu_example_double.c
  26. 19 0
      nmad/examples/mpi_lu/plu_example_float.c
  27. 393 0
      nmad/examples/mpi_lu/plu_solve.c
  28. 19 0
      nmad/examples/mpi_lu/plu_solve_double.c
  29. 19 0
      nmad/examples/mpi_lu/plu_solve_float.c
  30. 19 0
      nmad/examples/mpi_lu/pslu.c
  31. 19 0
      nmad/examples/mpi_lu/pslu_kernels.c
  32. 870 0
      nmad/examples/mpi_lu/pxlu.c
  33. 68 0
      nmad/examples/mpi_lu/pxlu.h
  34. 442 0
      nmad/examples/mpi_lu/pxlu_kernels.c
  35. 32 0
      nmad/examples/mpi_lu/pxlu_kernels.h
  36. 19 0
      nmad/examples/mpi_lu/slu_kernels.c
  37. 106 0
      nmad/examples/perf.sh
  38. 258 0
      nmad/examples/stencil/stencil5.c
  39. 134 0
      nmad/include/starpu_mpi.h
  40. 29 0
      nmad/libstarpumpi.pc.in
  41. 58 0
      nmad/src/Makefile.am
  42. 1253 0
      nmad/src/starpu_mpi.c
  43. 292 0
      nmad/src/starpu_mpi_cache.c
  44. 55 0
      nmad/src/starpu_mpi_cache.h
  45. 69 0
      nmad/src/starpu_mpi_cache_stats.c
  46. 40 0
      nmad/src/starpu_mpi_cache_stats.h
  47. 162 0
      nmad/src/starpu_mpi_collective.c
  48. 245 0
      nmad/src/starpu_mpi_datatype.c
  49. 35 0
      nmad/src/starpu_mpi_datatype.h
  50. 116 0
      nmad/src/starpu_mpi_fxt.h
  51. 105 0
      nmad/src/starpu_mpi_helper.c
  52. 25 0
      nmad/src/starpu_mpi_private.c
  53. 173 0
      nmad/src/starpu_mpi_private.h
  54. 117 0
      nmad/src/starpu_mpi_select_node.c
  55. 36 0
      nmad/src/starpu_mpi_select_node.h
  56. 94 0
      nmad/src/starpu_mpi_stats.c
  57. 36 0
      nmad/src/starpu_mpi_stats.h
  58. 775 0
      nmad/src/starpu_mpi_task_insert.c
  59. 32 0
      nmad/src/starpu_mpi_task_insert.h
  60. 29 0
      nmad/starpumpi-1.0.pc.in
  61. 29 0
      nmad/starpumpi-1.1.pc.in
  62. 29 0
      nmad/starpumpi-1.2.pc.in
  63. 29 0
      nmad/starpumpi-1.3.pc.in
  64. 1 0
      nmad/tests/.gitignore
  65. 246 0
      nmad/tests/Makefile.am
  66. 146 0
      nmad/tests/block_interface.c
  67. 150 0
      nmad/tests/block_interface_pinned.c
  68. 110 0
      nmad/tests/cache.c
  69. 93 0
      nmad/tests/cache_disable.c
  70. 112 0
      nmad/tests/comm.c
  71. 333 0
      nmad/tests/datatypes.c
  72. 26 0
      nmad/tests/helper.h
  73. 140 0
      nmad/tests/insert_task.c
  74. 162 0
      nmad/tests/insert_task_block.c
  75. 150 0
      nmad/tests/insert_task_cache.c
  76. 142 0
      nmad/tests/insert_task_compute.c
  77. 116 0
      nmad/tests/insert_task_count.c
  78. 169 0
      nmad/tests/insert_task_owner.c
  79. 126 0
      nmad/tests/insert_task_owner2.c
  80. 107 0
      nmad/tests/insert_task_owner_data.c
  81. 144 0
      nmad/tests/insert_task_recv_cache.c
  82. 150 0
      nmad/tests/insert_task_sent_cache.c
  83. 132 0
      nmad/tests/matrix.c
  84. 140 0
      nmad/tests/matrix2.c
  85. 86 0
      nmad/tests/mpi_detached_tag.c
  86. 85 0
      nmad/tests/mpi_irecv.c
  87. 103 0
      nmad/tests/mpi_irecv_detached.c
  88. 86 0
      nmad/tests/mpi_isend.c
  89. 108 0
      nmad/tests/mpi_isend_detached.c
  90. 173 0
      nmad/tests/mpi_reduction.c
  91. 76 0
      nmad/tests/mpi_reduction_kernels.c
  92. 94 0
      nmad/tests/mpi_redux.c
  93. 191 0
      nmad/tests/mpi_scatter_gather.c
  94. 93 0
      nmad/tests/mpi_test.c
  95. 97 0
      nmad/tests/multiple_send.c
  96. 85 0
      nmad/tests/pingpong.c
  97. 133 0
      nmad/tests/ring.c
  98. 137 0
      nmad/tests/ring_async.c
  99. 131 0
      nmad/tests/ring_async_implicit.c
  100. 0 0
      nmad/tests/ring_kernel.cu

+ 4 - 0
Makefile.am

@@ -39,6 +39,10 @@ if USE_MPI
 SUBDIRS += mpi
 endif
 
+# Recurse into the New Madeleine flavour of libstarpumpi when configure was
+# run with --enable-nmad (USE_NMAD conditional defined in configure.ac).
+if USE_NMAD
+SUBDIRS += nmad
+endif
+
 if BUILD_EXAMPLES
 SUBDIRS += examples
 endif

+ 161 - 1
configure.ac

@@ -382,6 +382,157 @@ AC_ARG_ENABLE(maxmpidev, [AS_HELP_STRING([--enable-maxmpidev=<number>],
 AC_MSG_RESULT($nmaxmpidev)
 AC_DEFINE_UNQUOTED(STARPU_MAXMPIDEVS, [$nmaxmpidev], [maximum number of MPI devices])
 
+
+###############################################################################
+#                                                                             #
+#                                    New Madeleine                            #
+#                                                                             #
+###############################################################################
+
+AC_ARG_ENABLE(nmad, [AS_HELP_STRING([--enable-nmad],
+                              [Enable StarPU NMAD library generation (disable StarPU MPI)])],
+            [enable_nmad=$enableval],
+            [enable_nmad=no])
+
+
+#Check MPICC
+AC_ARG_WITH(mpicc, [AS_HELP_STRING([--with-mpicc[=<path to mpicc>]],
+           [Path of the mpicc compiler])],
+   [
+       if test x$withval = xyes; then
+           AC_MSG_ERROR(--with-mpicc must be given a pathname)
+       else
+           mpicc_path=$withval
+       fi
+   ],
+   [
+       if test x$enable_simgrid = xyes ; then
+           DEFAULT_MPICC=smpicc
+       else
+           DEFAULT_MPICC=mpicc
+       fi
+       # nothing was specified: default value is used
+       AC_PATH_PROG(mpicc_path, $DEFAULT_MPICC, [no], [$simgrid_dir/bin:$PATH])
+   ])
+
+# We test if the MPICC compiler exists
+if test ! -x $mpicc_path; then
+    #MPICC does not exists or is not executable
+    AC_MSG_RESULT(The mpicc compiler '$mpicc_path' does not have the execute permission)
+    use_nmad=no
+else
+    use_nmad=yes
+    if test x$enable_simgrid = xyes ; then
+        AC_ARG_WITH(smpirun, [AS_HELP_STRING([--with-smpirun[=<path to smpirun>]],
+                    [Path of the smpirun helper])],
+            [
+                if test x$withval = xyes; then
+                    AC_MSG_ERROR(--with-smpirun must be given a pathname)
+                else
+                    smpirun_path=$withval
+                fi
+            ],
+            [
+                # nothing was specified: default value is used
+                AC_PATH_PROG(smpirun_path, smpirun, [no], [$simgrid_dir/bin:$PATH])
+            ])
+
+    fi
+fi
+
+AC_MSG_CHECKING(mpicc path)
+AC_MSG_RESULT($mpicc_path)
+AC_SUBST(MPICC, $mpicc_path)
+
+
+#Check MPICXX/MPIC++
+AC_ARG_WITH(mpicxx, [AS_HELP_STRING([--with-mpicxx[=<path to mpicxx>]],
+           [Path of the mpicxx/mpic++ compiler])],
+   [
+       if test x$withval = xyes; then
+           AC_MSG_ERROR(--with-mpicxx must be given a pathname)
+       else
+           mpicxx_path=$withval
+       fi
+   ],
+   [
+       if test x$enable_simgrid = xyes ; then
+           DEFAULT_MPICXX=smpicxx
+       else
+           DEFAULT_MPICXX=mpicxx
+       fi
+       # nothing was specified: default value is used
+       AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
+
+       # try with mpic++ if mpicxx was not found
+       if test x$mpicxx_path = xno ; then
+            DEFAULT_MPICXX=mpic++
+            AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
+       fi
+   ])
+
+# We test if the MPICXX/MPIC++ compiler exists
+if test ! -x $mpicxx_path; then
+    #MPICXX/MPIC++ does not exists or is not executable
+    AC_MSG_RESULT(The mpicxx compiler '$mpicxx_path' does not have the execute permission)
+    use_mpicxx=no
+else
+    use_mpicxx=yes
+fi
+
+AC_MSG_CHECKING(mpicxx/mpic++ path)
+AC_MSG_RESULT($mpicxx_path)
+AC_SUBST(MPICXX, $mpicxx_path)
+
+
+# Pick the compiler used to link MPI-dependent parts.
+# NOTE(review): this tests $use_mpi, but the mpicc check above sets $use_nmad;
+# confirm which variable is intended here.
+if test x$use_mpi = xyes -a \( x$enable_nmad = xyes  \) ; then
+    cc_or_mpicc=$mpicc_path
+        # For some reason, libtool uses gcc instead of mpicc when linking
+        # libstarpumpi.
+        # On Darwin (and maybe other systems ?) the linker will fail (undefined
+        # references to MPI_*). We manually add the required flags to fix this
+        # issue.
+        AC_SUBST(MPICC_LDFLAGS, `$mpicc_path --showme:link`)
+else
+    cc_or_mpicc=$CC
+fi
+
+AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
+
+# NOTE(review): the mpi-pedantic-isend and maxmpidev option blocks below appear
+# to duplicate blocks already present earlier in configure.ac (see the hunk
+# context above); autoconf may warn about duplicate AC_ARG_ENABLE options --
+# confirm whether these copies should be dropped.
+AC_ARG_ENABLE(mpi-pedantic-isend, [AS_HELP_STRING([--enable-mpi-pedantic-isend],
+				   [Enable StarPU MPI pedantic isend])],
+				   enable_mpi_pedantic_isend=$enableval, enable_mpi_pedantic_isend=no)
+if  test x$enable_mpi_pedantic_isend = xyes; then
+	AC_DEFINE(STARPU_MPI_PEDANTIC_ISEND, [1], [enable StarPU MPI pedantic isend])
+fi
+
+AC_ARG_WITH(mpi-master-slave-multiple-thread, [AS_HELP_STRING([--with-mpi-master-slave-multiple-thread])],
+	[AC_DEFINE([STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD], [1], [Use multiple threads to communicate with slaves])])
+
+AC_MSG_CHECKING(whether the master-slave mode should be enabled)
+AC_MSG_RESULT($build_mpi_master_slave)
+AM_CONDITIONAL([STARPU_USE_MPI_MASTER_SLAVE], [test x$build_mpi_master_slave = xyes])
+
+AC_MSG_CHECKING(maximum number of MPI master-slave devices)
+AC_ARG_ENABLE(maxmpidev, [AS_HELP_STRING([--enable-maxmpidev=<number>],
+			[maximum number of MPI master-slave devices])],
+			nmaxmpidev=$enableval,
+            [
+                 nmaxmpidev=0
+            ])
+AC_MSG_RESULT($nmaxmpidev)
+AC_DEFINE_UNQUOTED(STARPU_MAXMPIDEVS, [$nmaxmpidev], [maximum number of MPI devices])
+
+# Building nmad displaces the classic mpi library: enable_mpi is forced off.
+if test x$use_mpi = xyes -a x$enable_nmad = xyes ; then
+    build_nmad_lib=yes
+    enable_mpi=no
+else
+    build_nmad_lib=no
+fi
+
+AM_CONDITIONAL(USE_NMAD, test x$build_nmad_lib = xyes)
+
+
 ###############################################################################
 #                                                                             #
 #                                LIBTOOLS                                     #
@@ -505,7 +656,7 @@ AC_MSG_RESULT($build_mpi_lib)
 
 AC_SUBST(USE_MPI, $build_mpi_lib)
 AM_CONDITIONAL(USE_MPI, test x$build_mpi_lib = xyes)
-if test x$build_mpi_lib = xyes; then
+if test x$build_mpi_lib = xyes || test x$build_nmad_lib = xyes; then
 	AC_DEFINE(STARPU_USE_MPI,[1],[whether the StarPU MPI library is available])
 else
 	running_mpi_check=no
@@ -3151,6 +3302,11 @@ AC_OUTPUT([
 	mpi/starpumpi-1.1.pc
 	mpi/starpumpi-1.2.pc
 	mpi/starpumpi-1.3.pc
+	nmad/libstarpumpi.pc
+	nmad/starpumpi-1.0.pc
+	nmad/starpumpi-1.1.pc
+	nmad/starpumpi-1.2.pc
+	nmad/starpumpi-1.3.pc
 	starpufft/Makefile
 	starpufft/src/Makefile
 	starpufft/tests/Makefile
@@ -3169,6 +3325,10 @@ AC_OUTPUT([
 	mpi/src/Makefile
 	mpi/tests/Makefile
 	mpi/examples/Makefile
+	nmad/Makefile
+	nmad/src/Makefile
+	nmad/tests/Makefile
+	nmad/examples/Makefile
 	starpu-top/StarPU-Top.pro
 	starpu-top/StarPU-Top-qwt-embed.pri
 	starpu-top/StarPU-Top-qwt-system.pri

+ 5 - 0
examples/stencil/Makefile.am

@@ -32,6 +32,11 @@ LIBS += $(top_builddir)/mpi/src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 AM_CPPFLAGS += -I$(top_srcdir)/mpi/include
 endif
 
+# When building against the nmad flavour, link its libstarpumpi and use its
+# headers instead of the mpi/ ones.
+if USE_NMAD
+LIBS += $(top_builddir)/nmad/src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+AM_CPPFLAGS += -I$(top_srcdir)/nmad/include
+endif
+
 CC = $(CC_OR_MPICC)
 
 if STARPU_USE_CUDA

+ 1 - 0
nmad/.gitignore

@@ -0,0 +1 @@
+/.deps

+ 31 - 0
nmad/Makefile.am

@@ -0,0 +1,31 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2013  Université de Bordeaux
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+SUBDIRS=src tests examples
+
+# Install one pkg-config file per supported starpumpi API version.
+pkgconfigdir = $(libdir)/pkgconfig
+pkgconfig_DATA = libstarpumpi.pc starpumpi-1.0.pc starpumpi-1.1.pc  starpumpi-1.2.pc starpumpi-1.3.pc
+
+# Public header installed under the versioned starpu include directory.
+versincludedir = $(includedir)/starpu/$(STARPU_EFFECTIVE_VERSION)
+versinclude_HEADERS = 					\
+	include/starpu_mpi.h
+
+# Run "make showcheck" in every subdirectory, propagating any failure.
+showcheck:
+	RET=0 ; \
+	for i in $(SUBDIRS) ; do \
+		make -C $$i showcheck || RET=1 ; \
+	done ; \
+	exit $$RET

+ 193 - 0
nmad/examples/Makefile.am

@@ -0,0 +1,193 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2013, 2016  Université de Bordeaux
+# Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+include $(top_srcdir)/starpu.mk
+
+# Compile and link every example in this directory with mpicc.
+CC=$(MPICC)
+CCLD=$(MPICC)
+
+if STARPU_HAVE_WINDOWS
+LOADER_BIN		=
+else
+loader_CPPFLAGS 	= 	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
+LOADER			=	loader
+# NOTE(review): LOADER_BIN points into mpi/tests/, not into the nmad tree --
+# looks like a stale copy from mpi/examples; confirm the intended path.
+LOADER_BIN		=	$(abs_top_builddir)/mpi/tests/$(LOADER)
+loader_SOURCES		=	../../tests/loader.c
+endif
+
+if STARPU_HAVE_AM111
+TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
+# NOTE(review): -np 2 here vs -np 4 in the fallback below -- confirm which
+# process count the examples expect.
+LOG_COMPILER	 	=	$(MPIEXEC) -np 2 $(LOADER_BIN)
+else
+TESTS_ENVIRONMENT 	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPIEXEC) -np 4
+endif
+
+# Examples double as the MPI test-suite unless simgrid is enabled or MPI
+# checking was disabled at configure time.
+if !STARPU_SIMGRID
+if STARPU_MPI_CHECK
+TESTS			=	$(starpu_mpi_EXAMPLES)
+endif
+endif
+
+check_PROGRAMS = $(LOADER) $(starpu_mpi_EXAMPLES)
+starpu_mpi_EXAMPLES =
+
+BUILT_SOURCES =
+
+CLEANFILES = *.gcno *.gcda *.linkinfo
+
+EXTRA_DIST = 					\
+	mpi_lu/mpi_lu-float.h		\
+	mpi_lu/mpi_lu-double.h		\
+	mpi_lu/plu_example.c		\
+	mpi_lu/plu_solve.c		\
+	mpi_lu/pxlu.h			\
+	mpi_lu/pxlu.c			\
+	mpi_lu/pxlu_kernels.h		\
+	mpi_lu/pxlu_kernels.c		\
+	matrix_decomposition/mpi_cholesky_codelets.h 	\
+	matrix_decomposition/mpi_cholesky_kernels.h	\
+	matrix_decomposition/mpi_cholesky_models.h 	\
+	matrix_decomposition/mpi_decomposition_params.h	\
+	matrix_decomposition/mpi_decomposition_matrix.h	\
+	../tests/helper.h
+
+examplebindir = $(libdir)/starpu/mpi
+
+examplebin_PROGRAMS =
+
+if STARPU_USE_CUDA
+NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -I$(top_builddir)/include/ $(HWLOC_CFLAGS)
+
+.cu.cubin:
+	$(MKDIR_P) `dirname $@`
+	$(NVCC) -cubin $< -o $@ $(NVCCFLAGS)
+
+.cu.o:
+	$(NVCC) $< -c -o $@ $(NVCCFLAGS)
+endif
+
+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
+LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include -I$(top_srcdir)/mpi/include
+AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS)
+
+###################
+# Stencil example #
+###################
+if BUILD_EXAMPLES
+examplebin_PROGRAMS +=				\
+	stencil/stencil5
+
+stencil_stencil5_LDADD =		\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la -lm
+
+starpu_mpi_EXAMPLES	+=	\
+	stencil/stencil5
+
+##################
+# MPI LU example #
+##################
+
+if !NO_BLAS_LIB
+
+examplebin_PROGRAMS += 			\
+	mpi_lu/plu_example_float	\
+	mpi_lu/plu_example_double
+
+mpi_lu_plu_example_float_LDADD =	\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
+	$(STARPU_LIBNUMA_LDFLAGS)				\
+	$(STARPU_BLAS_LDFLAGS) -lm
+
+mpi_lu_plu_example_float_SOURCES =	\
+	mpi_lu/plu_example_float.c	\
+	mpi_lu/plu_solve_float.c	\
+	mpi_lu/pslu_kernels.c		\
+	mpi_lu/pslu.c			\
+	$(top_srcdir)/examples/common/blas.c
+
+mpi_lu_plu_example_double_LDADD =	\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
+	$(STARPU_LIBNUMA_LDFLAGS)				\
+	$(STARPU_BLAS_LDFLAGS) -lm
+
+mpi_lu_plu_example_double_SOURCES =	\
+	mpi_lu/plu_example_double.c	\
+	mpi_lu/plu_solve_double.c  	\
+	mpi_lu/pdlu_kernels.c	    	\
+	mpi_lu/pdlu.c		    	\
+	$(top_srcdir)/examples/common/blas.c
+endif
+
+########################
+# MPI Cholesky example #
+########################
+
+if !NO_BLAS_LIB
+examplebin_PROGRAMS +=		\
+	matrix_decomposition/mpi_cholesky			\
+	matrix_decomposition/mpi_cholesky_distributed
+
+matrix_decomposition_mpi_cholesky_SOURCES	=		\
+	matrix_decomposition/mpi_cholesky.c		\
+	matrix_decomposition/mpi_cholesky_models.c		\
+	matrix_decomposition/mpi_cholesky_kernels.c	\
+	matrix_decomposition/mpi_cholesky_codelets.c	\
+	matrix_decomposition/mpi_decomposition_params.c	\
+	matrix_decomposition/mpi_decomposition_matrix.c	\
+	$(top_srcdir)/examples/common/blas.c
+
+matrix_decomposition_mpi_cholesky_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
+	$(STARPU_BLAS_LDFLAGS) -lm
+
+matrix_decomposition_mpi_cholesky_distributed_SOURCES =	\
+	matrix_decomposition/mpi_cholesky_distributed.c	\
+	matrix_decomposition/mpi_cholesky_models.c		\
+	matrix_decomposition/mpi_cholesky_kernels.c	\
+	matrix_decomposition/mpi_cholesky_codelets.c	\
+	matrix_decomposition/mpi_decomposition_params.c	\
+	matrix_decomposition/mpi_decomposition_matrix.c	\
+	$(top_srcdir)/examples/common/blas.c
+
+matrix_decomposition_mpi_cholesky_distributed_LDADD =	\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
+	$(STARPU_BLAS_LDFLAGS) -lm
+
+starpu_mpi_EXAMPLES +=				\
+	matrix_decomposition/mpi_cholesky			\
+	matrix_decomposition/mpi_cholesky_distributed
+endif
+
+###################
+# complex example #
+###################
+
+examplebin_PROGRAMS +=			\
+	complex/mpi_complex
+
+complex_mpi_complex_SOURCES =		\
+	complex/mpi_complex.c		\
+	$(top_srcdir)/examples/interface/complex_interface.c
+
+complex_mpi_complex_LDADD =		\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+
+starpu_mpi_EXAMPLES	+=			\
+	complex/mpi_complex
+endif
+
+

+ 102 - 0
nmad/examples/complex/mpi_complex.c

@@ -0,0 +1,102 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <interface/complex_interface.h>
+#include <interface/complex_codelet.h>
+
+/* CPU codelet: print the integer variable held by the first data handle. */
+void display_foo_codelet(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	int *foo = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	fprintf(stderr, "foo = %d\n", *foo);
+}
+
+/* Codelet wrapping display_foo_codelet: reads a single variable handle. */
+struct starpu_codelet foo_display =
+{
+	.cpu_funcs = {display_foo_codelet},
+	.nbuffers = 1,
+	.modes = {STARPU_R}
+};
+
+int main(int argc, char **argv)
+{
+	int rank, nodes;
+	int ret;
+	int compare;
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
+
+	if (nodes < 2)
+	{
+		fprintf(stderr, "This program needs at least 2 nodes (%d available)\n", nodes);
+		ret = 77;
+	}
+	else
+	{
+		starpu_data_handle_t handle;
+		starpu_data_handle_t handle2;
+
+		double real[2] = {4.0, 2.0};
+		double imaginary[2] = {7.0, 9.0};
+
+		double real2[2] = {14.0, 12.0};
+		double imaginary2[2] = {17.0, 19.0};
+
+		if (rank == 1)
+		{
+			real[0] = 0.0;
+			real[1] = 0.0;
+			imaginary[0] = 0.0;
+			imaginary[1] = 0.0;
+		}
+
+		starpu_complex_data_register(&handle, 0, real, imaginary, 2);
+		starpu_complex_data_register(&handle2, -1, real2, imaginary2, 2);
+
+		if (rank == 0)
+		{
+			int *compare_ptr = &compare;
+
+			starpu_insert_task(&cl_display, STARPU_VALUE, "node0 initial value", strlen("node0 initial value")+1, STARPU_R, handle, 0);
+			starpu_mpi_isend_detached(handle, 1, 10, MPI_COMM_WORLD, NULL, NULL);
+			starpu_mpi_irecv_detached(handle2, 1, 20, MPI_COMM_WORLD, NULL, NULL);
+
+			starpu_insert_task(&cl_display, STARPU_VALUE, "node0 received value", strlen("node0 received value")+1, STARPU_R, handle2, 0);
+			starpu_insert_task(&cl_compare, STARPU_R, handle, STARPU_R, handle2, STARPU_VALUE, &compare_ptr, sizeof(compare_ptr), 0);
+		}
+		else if (rank == 1)
+		{
+			starpu_mpi_irecv_detached(handle, 0, 10, MPI_COMM_WORLD, NULL, NULL);
+			starpu_insert_task(&cl_display, STARPU_VALUE, "node1 received value", strlen("node1 received value")+1, STARPU_R, handle, 0);
+			starpu_mpi_isend_detached(handle, 0, 20, MPI_COMM_WORLD, NULL, NULL);
+		}
+
+		starpu_task_wait_for_all();
+
+		starpu_data_unregister(handle);
+		starpu_data_unregister(handle2);
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	if (rank == 0) return !compare; else return ret;
+}

+ 72 - 0
nmad/examples/matrix_decomposition/mpi_cholesky.c

@@ -0,0 +1,72 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2012  Université de Bordeaux
+ * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "mpi_cholesky_models.h"
+#include "mpi_cholesky_codelets.h"
+#include "mpi_decomposition_matrix.h"
+#include "mpi_decomposition_params.h"
+
+/* Distributed tiled Cholesky driver: builds a Hilbert-based matrix,
+ * factorizes it across all MPI nodes, then checks the result locally. */
+int main(int argc, char **argv)
+{
+	/* create a simple symmetric positive definite matrix example
+	 *
+	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
+	 * */
+
+	float ***bmat;
+	int rank, nodes, ret;
+	double timing, flops;
+	int correctness;
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
+	starpu_cublas_init();
+
+	/* Sets the size/nblocks/display/noprio globals from the command line. */
+	parse_args(argc, argv, nodes);
+
+	matrix_init(&bmat, rank, nodes, 1);
+	matrix_display(bmat, rank);
+
+	dw_cholesky(bmat, size/nblocks, rank, nodes, &timing, &flops);
+
+	/* NOTE(review): MPI is shut down before the verification below; the
+	 * check appears purely local, but confirm this ordering is intended. */
+	starpu_mpi_shutdown();
+
+	matrix_display(bmat, rank);
+
+	dw_cholesky_check_computation(bmat, rank, nodes, &correctness, &flops);
+
+	matrix_free(&bmat, rank, nodes, 1);
+	starpu_cublas_shutdown();
+	starpu_shutdown();
+
+	/* Abort with a non-zero exit status if the factorization was wrong. */
+	assert(correctness);
+
+	if (rank == 0)
+	{
+		fprintf(stdout, "Computation time (in ms): %2.2f\n", timing/1000);
+		fprintf(stdout, "Synthetic GFlops : %2.2f\n", (flops/timing/1000.0f));
+	}
+
+	return 0;
+}

+ 260 - 0
nmad/examples/matrix_decomposition/mpi_cholesky_codelets.c

@@ -0,0 +1,260 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <common/blas.h>
+#include "mpi_decomposition_params.h"
+#include "mpi_decomposition_matrix.h"
+#include "mpi_cholesky_models.h"
+#include "mpi_cholesky_codelets.h"
+#include "mpi_cholesky_kernels.h"
+#include <sys/time.h>
+
+/*
+ *	Create the codelets
+ */
+
+/* Codelet for the diagonal-tile factorization step (u11) of the tiled
+ * Cholesky algorithm; works in place on a single tile. */
+static struct starpu_codelet cl11 =
+{
+	.cpu_funcs = {chol_cpu_codelet_update_u11},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {chol_cublas_codelet_update_u11},
+#endif
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
+	.model = &chol_model_11
+};
+
+/* Codelet for the panel step (u21): reads the factorized diagonal tile,
+ * updates one off-diagonal tile. */
+static struct starpu_codelet cl21 =
+{
+	.cpu_funcs = {chol_cpu_codelet_update_u21},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {chol_cublas_codelet_update_u21},
+#endif
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_RW},
+	.model = &chol_model_21
+};
+
+/* Codelet for the trailing-submatrix update step (u22): reads two tiles,
+ * updates a third. */
+static struct starpu_codelet cl22 =
+{
+	.cpu_funcs = {chol_cpu_codelet_update_u22},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {chol_cublas_codelet_update_u22},
+#endif
+	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_R, STARPU_RW},
+	.model = &chol_model_22
+};
+
+/*
+ *	code to bootstrap the factorization
+ *	and construct the DAG
+ */
+/* Build and execute the task DAG for the tiled Cholesky factorization of the
+ * block-distributed matrix matA (nblocks x nblocks tiles, leading dimension
+ * ld).  *timing and *flops are written on rank 0 only. */
+void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing, double *flops)
+{
+	double start;
+	double end;
+	starpu_data_handle_t **data_handles;
+	unsigned x,y,i,j,k;
+
+	/* create all the DAG nodes */
+
+	data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
+	for(x=0 ; x<nblocks ; x++) data_handles[x] = malloc(nblocks*sizeof(starpu_data_handle_t));
+
+	/* Register every tile: locally-owned tiles with their real buffer
+	 * (home node 0), remote tiles with no buffer (home node -1) so that
+	 * StarPU can allocate room for incoming copies. */
+	for(x = 0; x < nblocks ; x++)
+	{
+		for (y = 0; y < nblocks; y++)
+		{
+			int mpi_rank = my_distrib(x, y, nodes);
+			if (mpi_rank == rank)
+			{
+				//fprintf(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
+				starpu_matrix_data_register(&data_handles[x][y], 0, (uintptr_t)matA[x][y],
+						ld, size/nblocks, size/nblocks, sizeof(float));
+			}
+#warning TODO: make better test to only register what is needed
+			else
+			{
+				/* I don't own that index, but will need it for my computations */
+				//fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
+				starpu_matrix_data_register(&data_handles[x][y], -1, (uintptr_t)NULL,
+						ld, size/nblocks, size/nblocks, sizeof(float));
+			}
+			if (data_handles[x][y])
+			{
+				/* The MPI tag is the tile's column-major index. */
+				starpu_mpi_data_register(data_handles[x][y], (y*nblocks)+x, mpi_rank);
+			}
+		}
+	}
+
+	/* Synchronize all ranks before starting the timer. */
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+	start = starpu_timing_now();
+
+	/* Submit the whole DAG: for each step k, factorize the diagonal tile
+	 * (cl11), solve the panel (cl21), then update the trailing submatrix
+	 * (cl22).  Priorities favour the critical path unless noprio is set. */
+	for (k = 0; k < nblocks; k++)
+	{
+		int prio = STARPU_DEFAULT_PRIO;
+		if (!noprio) prio = STARPU_MAX_PRIO;
+
+		starpu_mpi_insert_task(MPI_COMM_WORLD, &cl11,
+				STARPU_PRIORITY, prio,
+				STARPU_RW, data_handles[k][k],
+				0);
+
+		for (j = k+1; j<nblocks; j++)
+		{
+			prio = STARPU_DEFAULT_PRIO;
+			if (!noprio&& (j == k+1)) prio = STARPU_MAX_PRIO;
+			starpu_mpi_insert_task(MPI_COMM_WORLD, &cl21,
+					STARPU_PRIORITY, prio,
+					STARPU_R, data_handles[k][k],
+					STARPU_RW, data_handles[k][j],
+					0);
+
+			/* The diagonal tile is no longer needed remotely. */
+			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][k]);
+
+			for (i = k+1; i<nblocks; i++)
+			{
+				if (i <= j)
+				{
+					prio = STARPU_DEFAULT_PRIO;
+					if (!noprio && (i == k + 1) && (j == k +1) ) prio = STARPU_MAX_PRIO;
+					starpu_mpi_insert_task(MPI_COMM_WORLD, &cl22,
+							STARPU_PRIORITY, prio,
+							STARPU_R, data_handles[k][i],
+							STARPU_R, data_handles[k][j],
+							STARPU_RW, data_handles[i][j],
+							0);
+				}
+			}
+
+			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][j]);
+		}
+	}
+
+	starpu_task_wait_for_all();
+
+	/* Unregister every tile (flushing results back) and free the table. */
+	for(x = 0; x < nblocks ; x++)
+	{
+		for (y = 0; y < nblocks; y++)
+		{
+			if (data_handles[x][y])
+				starpu_data_unregister(data_handles[x][y]);
+		}
+		free(data_handles[x]);
+	}
+	free(data_handles);
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+	end = starpu_timing_now();
+
+	if (rank == 0)
+	{
+		*timing = end - start;
+		/* Standard Cholesky flop count: n^3 / 3. */
+		*flops = (1.0f*size*size*size)/3.0f;
+	}
+}
+
+void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *correctness, double *flops)
+{
+	unsigned i,j,x,y;
+	float *rmat = malloc(size*size*sizeof(float));
+
+	for(x=0 ; x<nblocks ; x++)
+	{
+		for(y=0 ; y<nblocks ; y++)
+		{
+			for (i = 0; i < BLOCKSIZE; i++)
+			{
+				for (j = 0; j < BLOCKSIZE; j++)
+				{
+					rmat[j+(y*BLOCKSIZE)+(i+(x*BLOCKSIZE))*size] = matA[x][y][j +i*BLOCKSIZE];
+				}
+			}
+		}
+	}
+
+	fprintf(stderr, "[%d] compute explicit LLt ...\n", rank);
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i > j)
+			{
+				rmat[j+i*size] = 0.0f; // debug
+			}
+		}
+	}
+	float *test_mat = malloc(size*size*sizeof(float));
+	STARPU_ASSERT(test_mat);
+
+	STARPU_SSYRK("L", "N", size, size, 1.0f,
+			rmat, size, 0.0f, test_mat, size);
+
+	fprintf(stderr, "[%d] comparing results ...\n", rank);
+	if (display)
+	{
+		for (j = 0; j < size; j++)
+		{
+			for (i = 0; i < size; i++)
+			{
+				if (i <= j)
+				{
+					printf("%2.2f\t", test_mat[j +i*size]);
+				}
+				else
+				{
+					printf(".\t");
+				}
+			}
+			printf("\n");
+		}
+	}
+
+	*correctness = 1;
+	for(x = 0; x < nblocks ; x++)
+	{
+		for (y = 0; y < nblocks; y++)
+		{
+			int mpi_rank = my_distrib(x, y, nodes);
+			if (mpi_rank == rank)
+			{
+				for (i = (size/nblocks)*x ; i < (size/nblocks)*x+(size/nblocks); i++)
+				{
+					for (j = (size/nblocks)*y ; j < (size/nblocks)*y+(size/nblocks); j++)
+					{
+						if (i <= j)
+						{
+							float orig = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
+							float err = abs(test_mat[j +i*size] - orig);
+							if (err > 0.00001)
+							{
+								fprintf(stderr, "[%d] Error[%u, %u] --> %2.2f != %2.2f (err %2.2f)\n", rank, i, j, test_mat[j +i*size], orig, err);
+								*correctness = 0;
+								*flops = 0;
+								break;
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+	free(rmat);
+	free(test_mat);
+}

+ 30 - 0
nmad/examples/matrix_decomposition/mpi_cholesky_codelets.h

@@ -0,0 +1,30 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __MPI_CHOLESKY_CODELETS_H__
+#define __MPI_CHOLESKY_CODELETS_H__
+
+
+/*
+ *	code to bootstrap the factorization
+ *	and construct the DAG
+ */
+void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing, double *flops);
+
+/* Rebuild the dense matrix, recompute L*L^T and compare against the input;
+ * sets *correctness to 0 (and *flops to 0) on any mismatch. */
+void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *correctness, double *flops);
+
+#endif /* __MPI_CHOLESKY_CODELETS_H__ */

+ 64 - 0
nmad/examples/matrix_decomposition/mpi_cholesky_distributed.c

@@ -0,0 +1,64 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2011  Université de Bordeaux
+ * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "mpi_cholesky_models.h"
+#include "mpi_cholesky_codelets.h"
+#include "mpi_decomposition_matrix.h"
+#include "mpi_decomposition_params.h"
+
+/* Distributed Cholesky driver: every rank initialises StarPU/MPI, allocates
+ * only the tiles it owns (alloc_everywhere == 0), runs the factorisation and
+ * lets rank 0 report timing.  Unlike the non-distributed variant, no
+ * correctness check is run here. */
+int main(int argc, char **argv)
+{
+	/* create a simple definite positive symmetric matrix example
+	 *
+	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
+	 * */
+
+	float ***bmat;
+	int rank, nodes, ret;
+	double timing, flops;
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
+	starpu_cublas_init();
+
+	parse_args(argc, argv, nodes);
+
+	/* last argument 0: each rank allocates only the tiles it owns */
+	matrix_init(&bmat, rank, nodes, 0);
+
+	/* timing (µs) and flops are filled in by dw_cholesky */
+	dw_cholesky(bmat, size/nblocks, rank, nodes, &timing, &flops);
+
+	/* MPI layer must be shut down before the data is freed */
+	starpu_mpi_shutdown();
+
+	matrix_free(&bmat, rank, nodes, 0);
+	starpu_cublas_shutdown();
+	starpu_shutdown();
+
+	if (rank == 0)
+	{
+		fprintf(stdout, "Computation time (in ms): %2.2f\n", timing/1000);
+		fprintf(stdout, "Synthetic GFlops : %2.2f\n", (flops/timing/1000.0f));
+	}
+
+	return 0;
+}

+ 247 - 0
nmad/examples/matrix_decomposition/mpi_cholesky_kernels.c

@@ -0,0 +1,247 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2012-2014  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <math.h>
+#include "mpi_decomposition_params.h"
+#include "common/blas.h"
+#ifdef STARPU_USE_CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cublas.h>
+#ifdef STARPU_HAVE_MAGMA
+#include "magma.h"
+#include "magma_lapack.h"
+#endif
+#endif
+
+/*
+ * U22
+ */
+
+/* Trailing GEMM update (U22): center -= left * right^T.
+ * s selects the implementation: 0 = CPU BLAS, 1 = CUBLAS (the latter is only
+ * compiled in with STARPU_USE_CUDA; otherwise any s != 0 hits STARPU_ABORT). */
+static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	//printf("22\n");
+	float *left 	= (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+	float *right 	= (float *)STARPU_MATRIX_GET_PTR(descr[1]);
+	float *center 	= (float *)STARPU_MATRIX_GET_PTR(descr[2]);
+
+	/* NOTE(review): dx comes from NY and dy from NX — presumably deliberate
+	 * given the column-major storage used by BLAS; confirm against the data
+	 * registration. */
+	unsigned dx = STARPU_MATRIX_GET_NY(descr[2]);
+	unsigned dy = STARPU_MATRIX_GET_NX(descr[2]);
+	unsigned dz = STARPU_MATRIX_GET_NY(descr[0]);
+
+	unsigned ld21 = STARPU_MATRIX_GET_LD(descr[0]);
+	unsigned ld12 = STARPU_MATRIX_GET_LD(descr[1]);
+	unsigned ld22 = STARPU_MATRIX_GET_LD(descr[2]);
+
+#ifdef STARPU_USE_CUDA
+	cublasStatus st;
+#endif
+
+	switch (s)
+	{
+		case 0:
+			STARPU_SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21,
+				right, ld12, 1.0f, center, ld22);
+			break;
+#ifdef STARPU_USE_CUDA
+		case 1:
+#ifdef STARPU_HAVE_MAGMA
+			/* NOTE(review): the per-worker stream is only selected when
+			 * MAGMA is available; without it cublasSgemm runs on the
+			 * default stream — confirm this is intended. */
+			cublasSetKernelStream(starpu_cuda_get_local_stream());
+#endif
+			cublasSgemm('n', 't', dy, dx, dz,
+					-1.0f, left, ld21, right, ld12,
+					 1.0f, center, ld22);
+			st = cublasGetError();
+			if (STARPU_UNLIKELY(st != CUBLAS_STATUS_SUCCESS))
+				STARPU_CUBLAS_REPORT_ERROR(st);
+
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+}
+
+/* CPU entry point for the trailing-update GEMM task. */
+void chol_cpu_codelet_update_u22(void *descr[], void *_args)
+{
+	chol_common_cpu_codelet_update_u22(descr, 0, _args);
+}
+
+#ifdef STARPU_USE_CUDA
+/* CUDA entry point for the trailing-update GEMM task. */
+void chol_cublas_codelet_update_u22(void *descr[], void *_args)
+{
+	chol_common_cpu_codelet_update_u22(descr, 1, _args);
+}
+#endif// STARPU_USE_CUDA
+
+/*
+ * U21
+ */
+
+/* Panel TRSM update (U21): solve sub21 <- sub21 * inv(L11^T) in place, where
+ * sub11 holds the lower-triangular factor of the diagonal tile.
+ * s: 0 = CPU BLAS, 1 = CUBLAS. */
+static inline void chol_common_codelet_update_u21(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+//	printf("21\n");
+	float *sub11;
+	float *sub21;
+
+	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+	sub21 = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
+
+	unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]);
+	unsigned ld21 = STARPU_MATRIX_GET_LD(descr[1]);
+
+	/* NOTE(review): nx21 comes from NY and ny21 from NX — presumably the
+	 * column-major convention used throughout these kernels; confirm. */
+	unsigned nx21 = STARPU_MATRIX_GET_NY(descr[1]);
+	unsigned ny21 = STARPU_MATRIX_GET_NX(descr[1]);
+
+	switch (s)
+	{
+		case 0:
+			STARPU_STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
+			break;
+#ifdef STARPU_USE_CUDA
+		case 1:
+#ifdef STARPU_HAVE_MAGMA
+			cublasSetKernelStream(starpu_cuda_get_local_stream());
+#endif
+			cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+}
+
+/* CPU entry point for the panel TRSM task. */
+void chol_cpu_codelet_update_u21(void *descr[], void *_args)
+{
+	 chol_common_codelet_update_u21(descr, 0, _args);
+}
+
+#ifdef STARPU_USE_CUDA
+/* CUDA entry point for the panel TRSM task. */
+void chol_cublas_codelet_update_u21(void *descr[], void *_args)
+{
+	chol_common_codelet_update_u21(descr, 1, _args);
+}
+#endif
+
+/*
+ *	U11
+ */
+
+/* Diagonal factorisation (U11): Cholesky-factor the nx x nx tile sub11 in
+ * place.  s: 0 = CPU (MKL POTRF, or an unblocked hand-written loop),
+ * 1 = CUDA (MAGMA POTRF, or a per-column CUBLAS loop without MAGMA). */
+static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+//	printf("11\n");
+	float *sub11;
+
+	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+
+	unsigned nx = STARPU_MATRIX_GET_NY(descr[0]);
+	unsigned ld = STARPU_MATRIX_GET_LD(descr[0]);
+
+	unsigned z;
+
+	switch (s)
+	{
+		case 0:
+
+#ifdef STARPU_MKL
+			STARPU_SPOTRF("L", nx, sub11, ld);
+#else
+			/*
+			 *	- alpha 11 <- lambda 11 = sqrt(alpha11)
+			 *	- alpha 21 <- l 21	= alpha 21 / lambda 11
+			 *	- A22 <- A22 - l21 trans(l21)
+			 */
+
+			for (z = 0; z < nx; z++)
+			{
+				float lambda11;
+				lambda11 = sqrt(sub11[z+z*ld]);
+				sub11[z+z*ld] = lambda11;
+
+				STARPU_ASSERT(lambda11 != 0.0f);
+
+				/* scale the column below the diagonal... */
+				STARPU_SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
+
+				/* ...and rank-1 update the trailing submatrix */
+				STARPU_SSYR("L", nx - z - 1, -1.0f,
+							&sub11[(z+1)+z*ld], 1,
+							&sub11[(z+1)+(z+1)*ld], ld);
+			}
+#endif
+			break;
+#ifdef STARPU_USE_CUDA
+		case 1:
+#ifdef STARPU_HAVE_MAGMA
+			{
+				int ret;
+				int info;
+				ret = magma_spotrf_gpu(MagmaLower, nx, sub11, ld, &info);
+				if (ret != MAGMA_SUCCESS)
+				{
+					fprintf(stderr, "Error in Magma: %d\n", ret);
+					STARPU_ABORT();
+				}
+				/* NOTE(review): cudaThreadSynchronize() is deprecated in
+				 * favour of cudaDeviceSynchronize(); kept as-is. */
+				cudaError_t cures = cudaThreadSynchronize();
+				STARPU_ASSERT(!cures);
+			}
+#else
+			/* Unblocked right-looking factorisation, one column at a
+			 * time; the diagonal element is copied back to the host for
+			 * the square root. */
+			for (z = 0; z < nx; z++)
+			{
+				float lambda11;
+				cudaMemcpyAsync(&lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+				cudaStreamSynchronize(starpu_cuda_get_local_stream());
+
+				STARPU_ASSERT(lambda11 != 0.0f);
+
+				lambda11 = sqrt(lambda11);
+
+				/* n == 1, so the increment arguments are irrelevant here */
+				cublasSetVector(1, sizeof(float), &lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float));
+
+				cublasSscal(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
+
+				/* NOTE(review): uplo is 'U' here while the CPU path uses
+				 * "L" — confirm this matches the data layout used on the
+				 * device. */
+				cublasSsyr('U', nx - z - 1, -1.0f,
+							&sub11[(z+1)+z*ld], 1,
+							&sub11[(z+1)+(z+1)*ld], ld);
+			}
+
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+#endif
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+}
+
+
+/* CPU entry point for the diagonal POTRF task. */
+void chol_cpu_codelet_update_u11(void *descr[], void *_args)
+{
+	chol_common_codelet_update_u11(descr, 0, _args);
+}
+
+#ifdef STARPU_USE_CUDA
+/* CUDA entry point for the diagonal POTRF task. */
+void chol_cublas_codelet_update_u11(void *descr[], void *_args)
+{
+	chol_common_codelet_update_u11(descr, 1, _args);
+}
+#endif// STARPU_USE_CUDA

+ 33 - 0
nmad/examples/matrix_decomposition/mpi_cholesky_kernels.h

@@ -0,0 +1,33 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __MPI_CHOLESKY_KERNELS_H__
+#define __MPI_CHOLESKY_KERNELS_H__
+
+#include <starpu.h>
+
+/* CPU implementations of the three Cholesky tasks:
+ * u11 = diagonal POTRF, u21 = panel TRSM, u22 = trailing GEMM update. */
+void chol_cpu_codelet_update_u11(void **, void *);
+void chol_cpu_codelet_update_u21(void **, void *);
+void chol_cpu_codelet_update_u22(void **, void *);
+
+#ifdef STARPU_USE_CUDA
+/* CUBLAS/MAGMA counterparts of the CPU kernels above. */
+void chol_cublas_codelet_update_u11(void *descr[], void *_args);
+void chol_cublas_codelet_update_u21(void *descr[], void *_args);
+void chol_cublas_codelet_update_u22(void *descr[], void *_args);
+#endif
+
+#endif // __MPI_CHOLESKY_KERNELS_H__

+ 40 - 0
nmad/examples/matrix_decomposition/mpi_cholesky_models.c

@@ -0,0 +1,40 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_cholesky_models.h"
+
+/*
+ *	History-based performance models for the three Cholesky codelets
+ *	(11 = POTRF, 21 = TRSM, 22 = GEMM).  StarPU keys the recorded
+ *	timings on the symbol string and accumulates them across runs.
+ */
+
+struct starpu_perfmodel chol_model_11 =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "chol_model_11"
+};
+
+struct starpu_perfmodel chol_model_21 =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "chol_model_21"
+};
+
+struct starpu_perfmodel chol_model_22 =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "chol_model_22"
+};

+ 27 - 0
nmad/examples/matrix_decomposition/mpi_cholesky_models.h

@@ -0,0 +1,27 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DW_CHOLESKY_MODELS_H__
+#define __DW_CHOLESKY_MODELS_H__
+
+#include <starpu.h>
+
+/* History-based performance models for the three Cholesky codelets. */
+extern struct starpu_perfmodel chol_model_11;
+extern struct starpu_perfmodel chol_model_21;
+extern struct starpu_perfmodel chol_model_22;
+
+#endif // __DW_CHOLESKY_MODELS_H__

+ 110 - 0
nmad/examples/matrix_decomposition/mpi_decomposition_matrix.c

@@ -0,0 +1,110 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2012, 2015  Université de Bordeaux
+ * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "mpi_decomposition_matrix.h"
+#include "mpi_decomposition_params.h"
+#include "mpi_cholesky_codelets.h"
+
+/* Owner rank of tile (x, y): 2D block-cyclic distribution over a
+ * dblockx x dblocky process grid.  nb_nodes is unused but kept for
+ * interface compatibility. */
+int my_distrib(int x, int y, int nb_nodes)
+{
+	(void)nb_nodes;
+	int col = x % dblockx;
+	int row = y % dblocky;
+	return row * dblockx + col;
+}
+
+
+/* Print every tile (lower triangle only) when the -display flag was given.
+ * NOTE(review): blocks are indexed bmat[y][x] here while matrix_init fills
+ * bmat[x][y] — verify the intended orientation against the callers. */
+void matrix_display(float ***bmat, int rank)
+{
+	unsigned i,j,x,y;
+
+	if (display)
+	{
+		printf("[%d] Input :\n", rank);
+
+		for(y=0 ; y<nblocks ; y++)
+		{
+			for(x=0 ; x<nblocks ; x++)
+			{
+				printf("Block %u,%u :\n", x, y);
+				for (j = 0; j < BLOCKSIZE; j++)
+				{
+					for (i = 0; i < BLOCKSIZE; i++)
+					{
+						if (i <= j)
+						{
+							printf("%2.2f\t", bmat[y][x][j +i*BLOCKSIZE]);
+						}
+						else
+						{
+							printf(".\t");
+						}
+					}
+					printf("\n");
+				}
+			}
+		}
+	}
+}
+
+/* Allocate and fill the nblocks x nblocks tile array.  Each rank allocates
+ * only the tiles it owns according to my_distrib(), unless alloc_everywhere
+ * is non-zero (then every rank allocates every tile).  Tiles are filled with
+ * a Hilbert-like matrix plus a dominant diagonal so that the global matrix
+ * is symmetric positive definite.  Allocation failures now abort via
+ * STARPU_ASSERT instead of being silently dereferenced. */
+void matrix_init(float ****bmat, int rank, int nodes, int alloc_everywhere)
+{
+	unsigned i,j,x,y;
+
+	*bmat = malloc(nblocks * sizeof(float **));
+	STARPU_ASSERT(*bmat);
+	for(x=0 ; x<nblocks ; x++)
+	{
+		(*bmat)[x] = malloc(nblocks * sizeof(float *));
+		STARPU_ASSERT((*bmat)[x]);
+		for(y=0 ; y<nblocks ; y++)
+		{
+			int mpi_rank = my_distrib(x, y, nodes);
+			if (alloc_everywhere || (mpi_rank == rank))
+			{
+				/* starpu_malloc provides pinned memory for faster
+				 * GPU/MPI transfers */
+				starpu_malloc((void **)&(*bmat)[x][y], BLOCKSIZE*BLOCKSIZE*sizeof(float));
+				STARPU_ASSERT((*bmat)[x][y]);
+				for (i = 0; i < BLOCKSIZE; i++)
+				{
+					for (j = 0; j < BLOCKSIZE; j++)
+					{
+						(*bmat)[x][y][j +i*BLOCKSIZE] = (1.0f/(1.0f+(i+(x*BLOCKSIZE)+j+(y*BLOCKSIZE)))) + ((i+(x*BLOCKSIZE) == j+(y*BLOCKSIZE))?1.0f*size:0.0f);
+						//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
+					}
+				}
+			}
+		}
+	}
+}
+
+/* Release the tile array built by matrix_init().  The alloc_everywhere flag
+ * must match the one used at allocation time so exactly the tiles that were
+ * allocated get freed. */
+void matrix_free(float ****bmat, int rank, int nodes, int alloc_everywhere)
+{
+	unsigned bx, by;
+
+	for (bx = 0; bx < nblocks; bx++)
+	{
+		for (by = 0; by < nblocks; by++)
+		{
+			int owner = my_distrib(bx, by, nodes);
+			if (alloc_everywhere || owner == rank)
+			{
+				starpu_free((void *)(*bmat)[bx][by]);
+			}
+		}
+		free((*bmat)[bx]);
+	}
+	free(*bmat);
+}
+

+ 30 - 0
nmad/examples/matrix_decomposition/mpi_decomposition_matrix.h

@@ -0,0 +1,30 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2012  Université de Bordeaux
+ * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __MPI_CHOLESKY_MATRIX_H__
+#define __MPI_CHOLESKY_MATRIX_H__
+
+/* Returns the MPI node number where data indexes index is */
+int my_distrib(int x, int y, int nb_nodes);
+
+/* Print, allocate+fill, and free the nblocks x nblocks tile array.
+ * alloc_everywhere must be identical between matrix_init and matrix_free. */
+void matrix_display(float ***bmat, int rank);
+void matrix_init(float ****bmat, int rank, int nodes, int alloc_everywhere);
+void matrix_free(float ****bmat, int rank, int nodes, int alloc_everywhere);
+
+#endif /* __MPI_CHOLESKY_MATRIX_H__ */
+

+ 100 - 0
nmad/examples/matrix_decomposition/mpi_decomposition_params.c

@@ -0,0 +1,100 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2015  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+/* Problem geometry and option flags shared by the MPI Cholesky examples;
+ * set once by parse_args() before any task is submitted. */
+unsigned size = 4*960;	/* matrix order (rows == columns) */
+unsigned nblocks = 16;	/* tiles per dimension */
+unsigned nbigblocks = 2;
+unsigned noprio = 0;	/* 1: submit tasks without priorities */
+unsigned display = 0;	/* 1: print matrices */
+int dblockx = -1;	/* process-grid width; -1 = pick automatically */
+int dblocky = -1;	/* process-grid height; -1 = pick automatically */
+
+/* Parse command-line options into the shared globals, then derive any value
+ * not given explicitly: nblocks is clamped to size, and when no process grid
+ * was requested the most square dblockx x dblocky factorisation of the node
+ * count is chosen. */
+void parse_args(int argc, char **argv, int nodes)
+{
+	int i;
+
+	for (i = 1; i < argc; i++)
+	{
+		char *end;
+		if (strcmp(argv[i], "-size") == 0)
+			size = strtol(argv[++i], &end, 10);
+		else if (strcmp(argv[i], "-dblockx") == 0)
+			dblockx = strtol(argv[++i], &end, 10);
+		else if (strcmp(argv[i], "-dblocky") == 0)
+			dblocky = strtol(argv[++i], &end, 10);
+		else if (strcmp(argv[i], "-nblocks") == 0)
+			nblocks = strtol(argv[++i], &end, 10);
+		else if (strcmp(argv[i], "-nbigblocks") == 0)
+			nbigblocks = strtol(argv[++i], &end, 10);
+		else if (strcmp(argv[i], "-no-prio") == 0)
+			noprio = 1;
+		else if (strcmp(argv[i], "-display") == 0)
+			display = 1;
+		else if (strcmp(argv[i], "-h") == 0)
+			printf("usage : %s [-display] [-size size] [-nblocks nblocks]\n", argv[0]);
+	}
+
+	/* cannot have more tiles per dimension than matrix rows */
+	if (nblocks > size)
+		nblocks = size;
+
+	if (dblockx == -1 || dblocky == -1)
+	{
+		int f;
+		dblockx = nodes;
+		dblocky = 1;
+		for (f = sqrt(nodes); f > 1; f--)
+		{
+			if (nodes % f == 0)
+			{
+				dblockx = nodes / f;
+				dblocky = f;
+				break;
+			}
+		}
+	}
+}
+

+ 34 - 0
nmad/examples/matrix_decomposition/mpi_decomposition_params.h

@@ -0,0 +1,34 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __MPI_CHOLESKY_PARAMS_H__
+#define __MPI_CHOLESKY_PARAMS_H__
+
+/* Size (in elements) of one square tile of the matrix. */
+#define BLOCKSIZE       (size/nblocks)
+
+extern unsigned size;
+extern unsigned nblocks;
+extern unsigned nbigblocks;
+extern unsigned noprio;
+extern unsigned display;
+/* BUG FIX: these were declared "extern unsigned" while the definitions in
+ * mpi_decomposition_params.c are "int dblockx = -1;" — a declaration whose
+ * type differs from its definition is undefined behavior, and the code
+ * relies on comparing them against -1.  Declare them as int. */
+extern int dblockx;
+extern int dblocky;
+
+void parse_args(int argc, char **argv, int nodes);
+
+#endif // __MPI_CHOLESKY_PARAMS_H__

+ 42 - 0
nmad/examples/mpi_lu/mpi_lu-double.h

@@ -0,0 +1,42 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/* Double-precision instantiation of the generic pxlu sources: scalar type,
+ * MPI datatype, symbol prefix, and the BLAS/CUBLAS routines to bind. */
+#define TYPE double
+#define MPI_TYPE	MPI_DOUBLE
+
+#define STARPU_PLU(name)       starpu_pdlu_##name
+
+#define CUBLAS_GEMM	cublasDgemm
+#define CUBLAS_TRSM	cublasDtrsm
+#define CUBLAS_SCAL	cublasDscal
+#define CUBLAS_GER	cublasDger
+#define CUBLAS_SWAP	cublasDswap
+#define CUBLAS_IAMAX	cublasIdamax
+
+#define CPU_GEMM	STARPU_DGEMM
+#define CPU_GEMV	STARPU_DGEMV
+#define CPU_TRSM	STARPU_DTRSM
+#define CPU_SCAL	STARPU_DSCAL
+#define CPU_GER		STARPU_DGER
+#define CPU_SWAP	STARPU_DSWAP
+
+#define CPU_TRMM	STARPU_DTRMM
+#define CPU_AXPY	STARPU_DAXPY
+#define CPU_ASUM	STARPU_DASUM
+#define CPU_IAMAX	STARPU_IDAMAX
+
+/* Pivots with magnitude below this are treated as numerically zero.
+ * ("THRESHHOLD" is a historical misspelling shared with the pxlu sources.) */
+#define PIVOT_THRESHHOLD	10e-10

+ 42 - 0
nmad/examples/mpi_lu/mpi_lu-float.h

@@ -0,0 +1,42 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/* Single-precision instantiation of the generic pxlu sources: scalar type,
+ * MPI datatype, symbol prefix, and the BLAS/CUBLAS routines to bind. */
+#define TYPE float
+#define MPI_TYPE	MPI_FLOAT
+
+#define STARPU_PLU(name)       starpu_pslu_##name
+
+#define CUBLAS_GEMM	cublasSgemm
+#define CUBLAS_TRSM	cublasStrsm
+#define CUBLAS_SCAL	cublasSscal
+#define CUBLAS_GER	cublasSger
+#define CUBLAS_SWAP	cublasSswap
+#define CUBLAS_IAMAX	cublasIsamax
+
+#define CPU_GEMM	STARPU_SGEMM
+#define CPU_GEMV	STARPU_SGEMV
+#define CPU_TRSM	STARPU_STRSM
+#define CPU_SCAL	STARPU_SSCAL
+#define CPU_GER		STARPU_SGER
+#define CPU_SWAP	STARPU_SSWAP
+
+#define CPU_TRMM	STARPU_STRMM
+#define CPU_AXPY	STARPU_SAXPY
+#define CPU_ASUM	STARPU_SASUM
+#define CPU_IAMAX	STARPU_ISAMAX
+
+/* Pivots with magnitude below this are treated as numerically zero; looser
+ * than the double-precision threshold, as expected for float. */
+#define PIVOT_THRESHHOLD	10e-5

+ 19 - 0
nmad/examples/mpi_lu/pdlu.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-double.h"
+#include "pxlu.c"

+ 19 - 0
nmad/examples/mpi_lu/pdlu_kernels.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-double.h"
+#include "pxlu_kernels.c"

+ 581 - 0
nmad/examples/mpi_lu/plu_example.c

@@ -0,0 +1,581 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2011, 2013  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include <math.h>
+#include <starpu.h>
+
+#include "pxlu.h"
+//#include "pxlu_kernels.h"
+
+#ifdef STARPU_HAVE_LIBNUMA
+#include <numaif.h>
+#endif
+
+static unsigned long size = 4096;
+static unsigned nblocks = 16;
+static unsigned check = 0;
+static int p = 1;
+static int q = 1;
+static unsigned display = 0;
+
+#ifdef STARPU_HAVE_LIBNUMA
+static unsigned numa = 0;
+#endif
+
+static size_t allocated_memory = 0;
+static size_t allocated_memory_extra = 0;
+
+static starpu_data_handle_t *dataA_handles;
+static TYPE **dataA;
+
+/* In order to implement the distributed LU decomposition, we allocate
+ * temporary buffers */
+#ifdef SINGLE_TMP11
+static starpu_data_handle_t tmp_11_block_handle;
+static TYPE *tmp_11_block;
+#else
+static starpu_data_handle_t *tmp_11_block_handles;
+static TYPE **tmp_11_block;
+#endif
+#ifdef SINGLE_TMP1221
+static starpu_data_handle_t *tmp_12_block_handles;
+static TYPE **tmp_12_block;
+static starpu_data_handle_t *tmp_21_block_handles;
+static TYPE **tmp_21_block;
+#else
+static starpu_data_handle_t *(tmp_12_block_handles[2]);
+static TYPE **(tmp_12_block[2]);
+static starpu_data_handle_t *(tmp_21_block_handles[2]);
+static TYPE **(tmp_21_block[2]);
+#endif
+
+int get_block_rank(unsigned i, unsigned j);
+
+/* Parse command-line options (run identically on every rank; the rank
+ * argument is only used so that warnings are printed once, on rank 0).
+ * NOTE(review): value-taking options ("-size", "-nblocks", "-p", "-q")
+ * read argv[++i] without checking that a value actually follows — a
+ * trailing bare option dereferences past argv. Acceptable for an
+ * example, but worth confirming. */
+static void parse_args(int rank, int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++) {
+		if (strcmp(argv[i], "-size") == 0) {
+			char *argptr;
+			size = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-nblocks") == 0) {
+			char *argptr;
+			nblocks = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-check") == 0) {
+			check = 1;
+		}
+
+		if (strcmp(argv[i], "-display") == 0) {
+			display = 1;
+		}
+
+		if (strcmp(argv[i], "-numa") == 0) {
+#ifdef STARPU_HAVE_LIBNUMA
+			numa = 1;
+#else
+			if (rank == 0)
+				fprintf(stderr, "Warning: libnuma is not available\n");
+#endif
+		}
+
+		/* p and q define the 2D processor grid; p*q must equal the
+		 * MPI world size (checked in main). */
+		if (strcmp(argv[i], "-p") == 0) {
+			char *argptr;
+			p = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-q") == 0) {
+			char *argptr;
+			q = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0) {
+			fprintf(stderr,"usage: %s [-size n] [-nblocks b] [-check] [-display] [-numa] [-p p] [-q q]\n", argv[0]);
+			fprintf(stderr,"\np * q must be equal to the number of MPI nodes\n");
+			exit(0);
+		}
+	}
+}
+
+/* Expose the -display command-line flag to the other compilation units
+ * of this example (e.g. plu_solve.c). */
+unsigned STARPU_PLU(display_flag)(void)
+{
+	return display;
+}
+
+/* Fill one (psize/pnblocks) x (psize/pnblocks) tile with uniform random
+ * values produced by starpu_drand48(). The tile is stored contiguously,
+ * one row (of length block_size) after another. */
+static void fill_block_with_random(TYPE *blockptr, unsigned psize, unsigned pnblocks)
+{
+	const unsigned block_size = (psize/pnblocks);
+
+	unsigned i, j;
+	for (i = 0; i < block_size; i++)
+	     for (j = 0; j < block_size; j++)
+	     {
+		  blockptr[j+i*block_size] = (TYPE)starpu_drand48();
+	     }
+}
+
+#ifdef SINGLE_TMP11
+/* Single shared temporary buffer for the current diagonal (11) block. */
+starpu_data_handle_t STARPU_PLU(get_tmp_11_block_handle)(void)
+{
+	return tmp_11_block_handle;
+}
+#else
+/* One temporary diagonal-block buffer per iteration k. */
+starpu_data_handle_t STARPU_PLU(get_tmp_11_block_handle)(unsigned k)
+{
+	return tmp_11_block_handles[k];
+}
+#endif
+
+#ifdef SINGLE_TMP1221
+/* One temporary buffer per column j for the received 12 blocks. */
+starpu_data_handle_t STARPU_PLU(get_tmp_12_block_handle)(unsigned j)
+{
+	return tmp_12_block_handles[j];
+}
+
+/* One temporary buffer per row i for the received 21 blocks. */
+starpu_data_handle_t STARPU_PLU(get_tmp_21_block_handle)(unsigned i)
+{
+	return tmp_21_block_handles[i];
+}
+#else
+/* Double-buffered variant: two sets of buffers alternated on the parity
+ * of the iteration number k, so iteration k+1 can start receiving while
+ * iteration k still uses its buffers. */
+starpu_data_handle_t STARPU_PLU(get_tmp_12_block_handle)(unsigned j, unsigned k)
+{
+	return tmp_12_block_handles[k%2][j];
+}
+
+starpu_data_handle_t STARPU_PLU(get_tmp_21_block_handle)(unsigned i, unsigned k)
+{
+	return tmp_21_block_handles[k%2][i];
+}
+#endif
+
+/* The factored diagonal (11) block is broadcast to every rank, so the
+ * temporary buffer is unconditionally needed (parameters kept for
+ * symmetry with the 12/21 predicates below). */
+static unsigned tmp_11_block_is_needed(int rank, unsigned pnblocks, unsigned k)
+{
+	return 1;
+}
+
+/* A temporary 12 buffer for index j is needed iff this rank owns at
+ * least one block (i, j) with i in [1, pnblocks) — i.e. it will consume
+ * the received 12 block in some update. */
+static unsigned tmp_12_block_is_needed(int rank, unsigned pnblocks, unsigned j)
+{
+	unsigned i;
+	for (i = 1; i < pnblocks; i++)
+	{
+		if (get_block_rank(i, j) == rank)
+			return 1;
+	}
+
+	return 0;
+}
+
+/* Mirror of tmp_12_block_is_needed: a temporary 21 buffer for index i is
+ * needed iff this rank owns at least one block (i, j) with j in
+ * [1, pnblocks). */
+static unsigned tmp_21_block_is_needed(int rank, unsigned pnblocks, unsigned i)
+{
+	unsigned j;
+	for (j = 1; j < pnblocks; j++)
+	{
+		if (get_block_rank(i, j) == rank)
+			return 1;
+	}
+
+	return 0;
+}
+
+/* Allocate, fill and register with StarPU every matrix block owned by
+ * this rank (2D block-cyclic layout, see get_block_rank()), then
+ * allocate the temporary 11/12/21 buffers used to receive blocks from
+ * remote ranks during the distributed factorization. Updates the
+ * allocated_memory/allocated_memory_extra counters as it goes. */
+static void init_matrix(int rank)
+{
+#ifdef STARPU_HAVE_LIBNUMA
+	if (numa)
+	{
+		fprintf(stderr, "Using INTERLEAVE policy\n");
+		/* Interleave allocations over NUMA nodes 0 and 1.
+		 * NOTE(review): maxnode is passed as 3 for a 2-bit mask —
+		 * confirm against set_mempolicy(2) semantics. */
+		unsigned long nodemask = ((1<<0)|(1<<1));
+		int ret = set_mempolicy(MPOL_INTERLEAVE, &nodemask, 3);
+		if (ret)
+			perror("set_mempolicy failed");
+	}
+#endif
+
+	/* Allocate a grid of data handles, not all of them have to be allocated later on */
+	dataA_handles = calloc(nblocks*nblocks, sizeof(starpu_data_handle_t));
+	dataA = calloc(nblocks*nblocks, sizeof(TYPE *));
+	allocated_memory_extra += nblocks*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *));
+
+	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
+
+	/* Allocate all the blocks that belong to this mpi node */
+	unsigned long i,j;
+	for (j = 0; j < nblocks; j++)
+	{
+		for (i = 0; i < nblocks; i++)
+		{
+			TYPE **blockptr = &dataA[j+i*nblocks];
+//			starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i];
+			starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i];
+
+			if (get_block_rank(i, j) == rank)
+			{
+				/* This blocks should be treated by the current MPI process */
+				/* Allocate and fill it */
+				starpu_malloc((void **)blockptr, blocksize);
+				allocated_memory += blocksize;
+
+				//fprintf(stderr, "Rank %d : fill block (i = %d, j = %d)\n", rank, i, j);
+				fill_block_with_random(*blockptr, size, nblocks);
+				//fprintf(stderr, "Rank %d : fill block (i = %d, j = %d)\n", rank, i, j);
+				if (i == j)
+				{
+					/* Boost the diagonal entries, presumably to keep
+					 * the factorization numerically stable without
+					 * pivoting — TODO confirm. */
+					unsigned tmp;
+					for (tmp = 0; tmp < size/nblocks; tmp++)
+					{
+						(*blockptr)[tmp*((size/nblocks)+1)] += (TYPE)10*nblocks;
+					}
+				}
+
+				/* Register it to StarPU */
+				starpu_matrix_data_register(handleptr, 0,
+					(uintptr_t)*blockptr, size/nblocks,
+					size/nblocks, size/nblocks, sizeof(TYPE));
+			}
+			else {
+				/* Remote block: poison the slots so that any
+				 * accidental local use faults loudly. */
+				*blockptr = STARPU_POISON_PTR;
+				*handleptr = STARPU_POISON_PTR;
+			}
+		}
+	}
+
+	/* Allocate the temporary buffers required for the distributed algorithm */
+
+	unsigned k;
+
+	/* tmp buffer 11 */
+#ifdef SINGLE_TMP11
+	starpu_malloc((void **)&tmp_11_block, blocksize);
+	allocated_memory_extra += blocksize;
+	starpu_matrix_data_register(&tmp_11_block_handle, 0, (uintptr_t)tmp_11_block,
+			size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
+#else
+	tmp_11_block_handles = calloc(nblocks, sizeof(starpu_data_handle_t));
+	tmp_11_block = calloc(nblocks, sizeof(TYPE *));
+	allocated_memory_extra += nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *));
+
+	for (k = 0; k < nblocks; k++)
+	{
+		if (tmp_11_block_is_needed(rank, nblocks, k))
+		{
+			starpu_malloc((void **)&tmp_11_block[k], blocksize);
+			allocated_memory_extra += blocksize;
+			STARPU_ASSERT(tmp_11_block[k]);
+
+			starpu_matrix_data_register(&tmp_11_block_handles[k], 0,
+				(uintptr_t)tmp_11_block[k],
+				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
+		}
+	}
+#endif
+
+	/* tmp buffers 12 and 21 */
+#ifdef SINGLE_TMP1221
+	tmp_12_block_handles = calloc(nblocks, sizeof(starpu_data_handle_t));
+	tmp_21_block_handles = calloc(nblocks, sizeof(starpu_data_handle_t));
+	tmp_12_block = calloc(nblocks, sizeof(TYPE *));
+	tmp_21_block = calloc(nblocks, sizeof(TYPE *));
+
+	allocated_memory_extra += 2*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *));
+#else
+	/* Double-buffered variant: two full sets, alternated on k%2. */
+	for (i = 0; i < 2; i++) {
+		tmp_12_block_handles[i] = calloc(nblocks, sizeof(starpu_data_handle_t));
+		tmp_21_block_handles[i] = calloc(nblocks, sizeof(starpu_data_handle_t));
+		tmp_12_block[i] = calloc(nblocks, sizeof(TYPE *));
+		tmp_21_block[i] = calloc(nblocks, sizeof(TYPE *));
+
+		allocated_memory_extra += 2*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *));
+	}
+#endif
+
+	for (k = 0; k < nblocks; k++)
+	{
+#ifdef SINGLE_TMP1221
+		if (tmp_12_block_is_needed(rank, nblocks, k))
+		{
+			starpu_malloc((void **)&tmp_12_block[k], blocksize);
+			allocated_memory_extra += blocksize;
+			STARPU_ASSERT(tmp_12_block[k]);
+
+			starpu_matrix_data_register(&tmp_12_block_handles[k], 0,
+				(uintptr_t)tmp_12_block[k],
+				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
+		}
+
+		if (tmp_21_block_is_needed(rank, nblocks, k))
+		{
+			starpu_malloc((void **)&tmp_21_block[k], blocksize);
+			allocated_memory_extra += blocksize;
+			STARPU_ASSERT(tmp_21_block[k]);
+
+			starpu_matrix_data_register(&tmp_21_block_handles[k], 0,
+				(uintptr_t)tmp_21_block[k],
+				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
+		}
+#else
+	for (i = 0; i < 2; i++) {
+		if (tmp_12_block_is_needed(rank, nblocks, k))
+		{
+			starpu_malloc((void **)&tmp_12_block[i][k], blocksize);
+			allocated_memory_extra += blocksize;
+			STARPU_ASSERT(tmp_12_block[i][k]);
+
+			starpu_matrix_data_register(&tmp_12_block_handles[i][k], 0,
+				(uintptr_t)tmp_12_block[i][k],
+				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
+		}
+
+		if (tmp_21_block_is_needed(rank, nblocks, k))
+		{
+			starpu_malloc((void **)&tmp_21_block[i][k], blocksize);
+			allocated_memory_extra += blocksize;
+			STARPU_ASSERT(tmp_21_block[i][k]);
+
+			starpu_matrix_data_register(&tmp_21_block_handles[i][k], 0,
+				(uintptr_t)tmp_21_block[i][k],
+				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
+		}
+	}
+#endif
+	}
+
+	//display_all_blocks(nblocks, size/nblocks);
+}
+
+/* Local pointer of block (i, j); STARPU_POISON_PTR if this rank does
+ * not own the block (see init_matrix()). */
+TYPE *STARPU_PLU(get_block)(unsigned i, unsigned j)
+{
+	return dataA[j+i*nblocks];
+}
+
+/* Owner rank of block (i, j) in the 2D block-cyclic distribution over
+ * the p x q processor grid. */
+int get_block_rank(unsigned i, unsigned j)
+{
+	/* Take a 2D block cyclic distribution */
+	/* NB: p (resp. q) is for "direction" i (resp. j) */
+	return (j % q) * p + (i % p);
+}
+
+/* StarPU handle of block (i, j); STARPU_POISON_PTR if not locally owned. */
+starpu_data_handle_t STARPU_PLU(get_block_handle)(unsigned i, unsigned j)
+{
+	return dataA_handles[j+i*nblocks];
+}
+
+/* Debug dump of the block-to-rank mapping plus local pointers/handles.
+ * No-op unless -display was given. Note: the rank==0 filter is commented
+ * out, so every rank prints its own view. */
+static void display_grid(int rank, unsigned pnblocks)
+{
+	if (!display)
+		return;
+
+	//if (rank == 0)
+	{
+		fprintf(stderr, "2D grid layout (Rank %d): \n", rank);
+
+		unsigned i, j;
+		for (j = 0; j < pnblocks; j++)
+		{
+			for (i = 0; i < pnblocks; i++)
+			{
+				TYPE *blockptr = STARPU_PLU(get_block)(i, j);
+				starpu_data_handle_t handle = STARPU_PLU(get_block_handle)(i, j);
+
+				fprintf(stderr, "%d (data %p handle %p)", get_block_rank(i, j), blockptr, handle);
+			}
+			fprintf(stderr, "\n");
+		}
+	}
+}
+
+/* Driver: initialize MPI (serialized thread support) and StarPU, build
+ * the distributed matrix, run the LU factorization, report min/max/avg
+ * timings and synthetic GFlop/s on rank 0, and optionally (-check)
+ * verify || A - LU ||. */
+int main(int argc, char **argv)
+{
+	int rank;
+	int world_size;
+
+	/*
+	 *	Initialization
+	 */
+	int thread_support;
+	if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS) {
+		fprintf(stderr,"MPI_Init_thread failed\n");
+		exit(1);
+	}
+	/* Degraded thread support is tolerated with a warning only. */
+	if (thread_support == MPI_THREAD_FUNNELED)
+		fprintf(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n");
+	if (thread_support < MPI_THREAD_FUNNELED)
+		fprintf(stderr,"Warning: MPI does not have thread support!\n");
+
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+
+	starpu_srand48((long int)time(NULL));
+
+	parse_args(rank, argc, argv);
+
+	int ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	/* We disable sequential consistency in this example */
+	starpu_data_set_default_sequential_consistency_flag(0);
+
+	starpu_mpi_init(NULL, NULL, 0);
+
+	/* The processor grid must cover the communicator exactly. */
+	STARPU_ASSERT(p*q == world_size);
+
+	starpu_cublas_init();
+
+	int barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
+	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
+
+	/*
+	 * 	Problem Init
+	 */
+
+	init_matrix(rank);
+
+	fprintf(stderr, "Rank %d: allocated (%d + %d) MB = %d MB\n", rank,
+                        (int)(allocated_memory/(1024*1024)),
+			(int)(allocated_memory_extra/(1024*1024)),
+                        (int)((allocated_memory+allocated_memory_extra)/(1024*1024)));
+
+	display_grid(rank, nblocks);
+
+	TYPE *a_r = NULL;
+//	STARPU_PLU(display_data_content)(a_r, size);
+
+	TYPE *x, *y;
+
+	if (check)
+	{
+		/* x is only filled with meaningful data on rank 0; the
+		 * reconstructed matrix a_r is gathered there too. */
+		x = calloc(size, sizeof(TYPE));
+		STARPU_ASSERT(x);
+
+		y = calloc(size, sizeof(TYPE));
+		STARPU_ASSERT(y);
+
+		if (rank == 0)
+		{
+			unsigned ind;
+			for (ind = 0; ind < size; ind++)
+				x[ind] = (TYPE)starpu_drand48();
+		}
+
+		a_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);
+
+		if (rank == 0)
+			STARPU_PLU(display_data_content)(a_r, size);
+
+//		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
+	}
+
+	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
+	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
+
+	/* Run the distributed factorization; returns elapsed time in us. */
+	double timing = STARPU_PLU(plu_main)(nblocks, rank, world_size);
+
+	/*
+	 * 	Report performance
+	 */
+
+	int reduce_ret;
+	double min_timing = timing;
+	double max_timing = timing;
+	double sum_timing = timing;
+
+	reduce_ret = MPI_Reduce(&timing, &min_timing, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);
+
+	reduce_ret = MPI_Reduce(&timing, &max_timing, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);
+
+	reduce_ret = MPI_Reduce(&timing, &sum_timing, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);
+
+	if (rank == 0)
+	{
+		fprintf(stderr, "Computation took: %f ms\n", max_timing/1000);
+		fprintf(stderr, "\tMIN : %f ms\n", min_timing/1000);
+		fprintf(stderr, "\tMAX : %f ms\n", max_timing/1000);
+		fprintf(stderr, "\tAVG : %f ms\n", sum_timing/(world_size*1000));
+
+		/* Standard LU flop count: 2n^3/3. */
+		unsigned n = size;
+		double flop = (2.0f*n*n*n)/3.0f;
+		fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/max_timing/1000.0f));
+	}
+
+	/*
+	 *	Test Result Correctness
+	 */
+
+	if (check)
+	{
+		/*
+		 *	Compute || A - LU ||
+		 */
+
+		STARPU_PLU(compute_lu_matrix)(size, nblocks, a_r);
+
+#if 0
+		/* Dead code: y2 is not declared here, so this block would
+		 * not compile if enabled. Kept as a sketch of the
+		 * || Ax - LUx || check. */
+		/*
+		 *	Compute || Ax - LUx ||
+		 */
+
+		unsigned ind;
+
+		y2 = calloc(size, sizeof(TYPE));
+		STARPU_ASSERT(y);
+
+		if (rank == 0)
+		{
+			for (ind = 0; ind < size; ind++)
+			{
+				y2[ind] = (TYPE)0.0;
+			}
+		}
+
+		STARPU_PLU(compute_lux)(size, x, y2, nblocks, rank);
+
+		/* Compute y2 = y2 - y */
+		CPU_AXPY(size, -1.0, y, 1, y2, 1);
+
+		TYPE err = CPU_ASUM(size, y2, 1);
+		int max = CPU_IAMAX(size, y2, 1);
+
+		fprintf(stderr, "(A - LU)X Avg error : %e\n", err/(size*size));
+		fprintf(stderr, "(A - LU)X Max error : %e\n", y2[max]);
+#endif
+	}
+
+	/*
+	 * 	Termination
+	 */
+
+	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
+	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
+
+	starpu_cublas_shutdown();
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+#if 0
+	/* MPI_Finalize is disabled — presumably starpu_mpi_shutdown()
+	 * finalizes MPI itself; confirm against the StarPU-MPI docs. */
+	MPI_Finalize();
+#endif
+
+	return 0;
+}

+ 19 - 0
nmad/examples/mpi_lu/plu_example_double.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-double.h"
+#include "plu_example.c"

+ 19 - 0
nmad/examples/mpi_lu/plu_example_float.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-float.h"
+#include "plu_example.c"

+ 393 - 0
nmad/examples/mpi_lu/plu_solve.c

@@ -0,0 +1,393 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <math.h>
+#include "pxlu.h"
+
+/*
+ *	Various useful functions
+ */
+
+/* Frobenius norm of an n x n matrix stored contiguously:
+ * sqrt(sum of squared magnitudes of all entries).
+ * NOTE(review): fabsl (long double) on a double value is harmless but
+ * fabs would be the matching function. */
+static double frobenius_norm(TYPE *v, unsigned n)
+{
+	double sum2 = 0.0;
+
+	/* compute sqrt(Sum(|x|^2)) */
+
+	unsigned i,j;
+	for (j = 0; j < n; j++)
+		for (i = 0; i < n; i++)
+		{
+			double a = fabsl((double)v[i+n*j]);
+			sum2 += a*a;
+		}
+
+	return sqrt(sum2);
+}
+
+/* Dump a blocksize x blocksize tile to stderr, one matrix line per text
+ * line. No-op unless the -display flag was given. */
+void STARPU_PLU(display_data_content)(TYPE *data, unsigned blocksize)
+{
+	if (!STARPU_PLU(display_flag)())
+		return;
+
+	fprintf(stderr, "DISPLAY BLOCK\n");
+
+	unsigned i, j;
+	for (j = 0; j < blocksize; j++)
+	{
+		for (i = 0; i < blocksize; i++)
+		{
+			fprintf(stderr, "%f ", data[j+i*blocksize]);
+		}
+		fprintf(stderr, "\n");
+	}
+
+	fprintf(stderr, "****\n");
+}
+
+/* Copy the "upper" factor out of a packed LU tile into outblock:
+ * the diagonal is set to 1 (unit diagonal) and the entries with li > lj
+ * are copied verbatim; the remaining entries of outblock are left
+ * untouched (callers pass a zeroed buffer). */
+void STARPU_PLU(extract_upper)(unsigned block_size, TYPE *inblock, TYPE *outblock)
+{
+	unsigned li, lj;
+	for (lj = 0; lj < block_size; lj++)
+	{
+		/* Upper block diag is 1 */
+		outblock[lj*(block_size + 1)] = (TYPE)1.0;
+
+		for (li = lj + 1; li < block_size; li++)
+		{
+			outblock[lj + li*block_size] = inblock[lj + li*block_size];
+		}
+	}
+}
+
+/* Copy the "lower" factor out of a packed LU tile into outblock:
+ * entries with li <= lj (diagonal included) are copied verbatim; the
+ * rest of outblock is left untouched (callers pass a zeroed buffer). */
+void STARPU_PLU(extract_lower)(unsigned block_size, TYPE *inblock, TYPE *outblock)
+{
+	unsigned li, lj;
+	for (lj = 0; lj < block_size; lj++)
+	{
+		for (li = 0; li <= lj; li++)
+		{
+			outblock[lj + li*block_size] = inblock[lj + li*block_size];
+		}
+	}
+}
+
+/*
+ *	Compute Ax = y
+ */
+
+/* sub_y += block_data * sub_x for one block_size x block_size tile,
+ * via the BLAS GEMV wrapper (beta = 1.0, so contributions accumulate).
+ * The fprintf is debug tracing — it fires on every call. */
+static void STARPU_PLU(compute_ax_block)(unsigned block_size, TYPE *block_data, TYPE *sub_x, TYPE *sub_y)
+{
+	fprintf(stderr, "block data %p sub x %p sub y %p\n", block_data, sub_x, sub_y);
+	CPU_GEMV("N", block_size, block_size, 1.0, block_data, block_size, sub_x, 1, 1.0, sub_y, 1);
+}
+
+/* As compute_ax_block(), but for a diagonal tile: only the upper factor
+ * (with unit diagonal) of the packed LU tile participates. Works on a
+ * zero-initialized temporary copy so block_data is not modified. */
+static void STARPU_PLU(compute_ax_block_upper)(unsigned size, unsigned nblocks,
+				 TYPE *block_data, TYPE *sub_x, TYPE *sub_y)
+{
+	unsigned block_size = size/nblocks;
+
+	/* Take a copy of the upper part of the diagonal block */
+	TYPE *upper_block_copy = calloc((block_size)*(block_size), sizeof(TYPE));
+	STARPU_PLU(extract_upper)(block_size, block_data, upper_block_copy);
+
+	STARPU_PLU(compute_ax_block)(block_size, upper_block_copy, sub_x, sub_y);
+
+	free(upper_block_copy);
+}
+
+/* As compute_ax_block(), but for a diagonal tile: only the lower factor
+ * of the packed LU tile participates. Works on a zero-initialized
+ * temporary copy so block_data is not modified. */
+static void STARPU_PLU(compute_ax_block_lower)(unsigned size, unsigned nblocks,
+				 TYPE *block_data, TYPE *sub_x, TYPE *sub_y)
+{
+	/* Take a copy of the lower part of the diagonal block */
+	TYPE *lower_block_copy = calloc((block_size)*(block_size), sizeof(TYPE));
+	STARPU_PLU(extract_lower)(block_size, block_data, lower_block_copy);
+
+	STARPU_PLU(compute_ax_block)(size/nblocks, lower_block_copy, sub_x, sub_y);
+
+	free(lower_block_copy);
+}
+
+/* Compute y = L * (U * x), i.e. (LU)x, on the distributed factored
+ * matrix. Each rank accumulates the contribution of its own blocks into
+ * a private buffer yi; partial vectors are summed onto rank 0 with
+ * MPI_Reduce. x and y need only be meaningful on rank 0; x is
+ * overwritten with U*x as an intermediate. */
+void STARPU_PLU(compute_lux)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, int rank)
+{
+	/* Create temporary buffers where all MPI processes are going to
+	 * compute Ui x = yi where Ai is the matrix containing the blocks of U
+	 * affected to process i, and 0 everywhere else. We then have y as the
+	 * sum of all yi. */
+	TYPE *yi = calloc(size, sizeof(TYPE));
+
+	fprintf(stderr, "Compute LU\n");
+
+	unsigned block_size = size/nblocks;
+
+	/* Compute UiX = Yi */
+	unsigned long i,j;
+	for (j = 0; j < nblocks; j++)
+	{
+		if (get_block_rank(j, j) == rank)
+		{
+			/* Diagonal tile: only its upper (unit-diagonal) factor. */
+			TYPE *block_data = STARPU_PLU(get_block)(j, j);
+			TYPE *sub_x = &x[j*(block_size)];
+			TYPE *sub_yi = &yi[j*(block_size)];
+
+			STARPU_PLU(compute_ax_block_upper)(size, nblocks, block_data, sub_x, sub_yi);
+		}
+
+		for (i = j + 1; i < nblocks; i++)
+		{
+			if (get_block_rank(i, j) == rank)
+			{
+				/* That block belongs to the current MPI process */
+				TYPE *block_data = STARPU_PLU(get_block)(i, j);
+				TYPE *sub_x = &x[i*(block_size)];
+				TYPE *sub_yi = &yi[j*(block_size)];
+
+				STARPU_PLU(compute_ax_block)(size/nblocks, block_data, sub_x, sub_yi);
+			}
+		}
+	}
+
+	/* Grab Sum Yi in X */
+	MPI_Reduce(yi, x, size, MPI_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
+	memset(yi, 0, size*sizeof(TYPE));
+
+//	unsigned ind;
+//	if (rank == 0)
+//	{
+//		fprintf(stderr, "INTERMEDIATE\n");
+//		for (ind = 0; ind < STARPU_MIN(10, size); ind++)
+//		{
+//			fprintf(stderr, "x[%d] = %f\n", ind, (float)x[ind]);
+//		}
+//		fprintf(stderr, "****\n");
+//	}
+
+	/* Everyone needs x */
+	int bcst_ret;
+	/* BUG FIX: broadcast the data buffer x itself, not &x (the address
+	 * of the local pointer variable), which would read/overwrite stack
+	 * memory instead of the vector. */
+	bcst_ret = MPI_Bcast(x, size, MPI_TYPE, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(bcst_ret == MPI_SUCCESS);
+
+	/* Compute LiX = Yi (with X = UX) */
+	for (j = 0; j < nblocks; j++)
+	{
+		if (j > 0)
+		for (i = 0; i < j; i++)
+		{
+			if (get_block_rank(i, j) == rank)
+			{
+				/* That block belongs to the current MPI process */
+				TYPE *block_data = STARPU_PLU(get_block)(i, j);
+				TYPE *sub_x = &x[i*(block_size)];
+				TYPE *sub_yi = &yi[j*(block_size)];
+
+				STARPU_PLU(compute_ax_block)(size/nblocks, block_data, sub_x, sub_yi);
+			}
+		}
+
+		if (get_block_rank(j, j) == rank)
+		{
+			/* Diagonal tile: only its lower factor. */
+			TYPE *block_data = STARPU_PLU(get_block)(j, j);
+			TYPE *sub_x = &x[j*(block_size)];
+			TYPE *sub_yi = &yi[j*(block_size)];
+
+			STARPU_PLU(compute_ax_block_lower)(size, nblocks, block_data, sub_x, sub_yi);
+		}
+	}
+
+	/* Grab Sum Yi in Y */
+	MPI_Reduce(yi, y, size, MPI_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
+
+	free(yi);
+}
+
+
+
+/*
+ *	Allocate a contiguous matrix on node 0 and fill it with the whole
+ *	content of the matrix distributed accross all nodes.
+ */
+
+/* Gather the whole distributed matrix into one size x size buffer.
+ * Collective: every rank must call it. Remote blocks are sent to rank 0
+ * with plain MPI point-to-point messages; on ranks != 0 the returned
+ * buffer stays (mostly) zero. Caller frees the result.
+ * NOTE(review): on ranks that are neither 0 nor the block owner,
+ * `block` stays uninitialized — harmless today because it is only read
+ * under `rank == 0`, but fragile. */
+TYPE *STARPU_PLU(reconstruct_matrix)(unsigned size, unsigned nblocks)
+{
+//	fprintf(stderr, "RECONSTRUCT MATRIX size %d nblocks %d\n", size, nblocks);
+
+	TYPE *bigmatrix = calloc(size*size, sizeof(TYPE));
+
+	unsigned block_size = size/nblocks;
+
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+	unsigned bi, bj;
+	for (bj = 0; bj < nblocks; bj++)
+	for (bi = 0; bi < nblocks; bi++)
+	{
+		TYPE *block;
+
+		int block_rank = get_block_rank(bi, bj);
+
+		if (block_rank == 0)
+		{
+			block = STARPU_PLU(get_block)(bi, bj);
+		}
+		else {
+			MPI_Status status;
+
+			if (rank == 0)
+			{
+				/* Receive the remote block into a scratch buffer. */
+				block = calloc(block_size*block_size, sizeof(TYPE));
+
+				int ret = MPI_Recv(block, block_size*block_size, MPI_TYPE, block_rank, 0, MPI_COMM_WORLD, &status);
+				STARPU_ASSERT(ret == MPI_SUCCESS);
+			}
+			else if (rank == block_rank) {
+				block = STARPU_PLU(get_block)(bi, bj);
+				int ret = MPI_Send(block, block_size*block_size, MPI_TYPE, 0, 0, MPI_COMM_WORLD);
+				STARPU_ASSERT(ret == MPI_SUCCESS);
+			}
+		}
+
+		if (rank == 0)
+		{
+			/* Scatter the tile into its place in the full matrix. */
+			unsigned j, i;
+			for (j = 0; j < block_size; j++)
+			for (i = 0; i < block_size; i++)
+			{
+				bigmatrix[(j + bj*block_size)+(i+bi*block_size)*size] =
+									block[j+i*block_size];
+			}
+
+			if (get_block_rank(bi, bj) != 0)
+				free(block);
+		}
+	}
+
+	return bigmatrix;
+}
+
+/* x and y must be valid (at least) on 0 */
+/* Compute y = A*x on the distributed (unfactored) matrix. x is
+ * broadcast from rank 0; every rank accumulates the contribution of its
+ * own blocks into a private buffer yi, and the partial vectors are
+ * summed onto rank 0 into y. Collective: all ranks must call it. */
+void STARPU_PLU(compute_ax)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, int rank)
+{
+	unsigned block_size = size/nblocks;
+
+	/* Send x to everyone */
+	int bcst_ret;
+	/* BUG FIX: broadcast the data buffer x itself, not &x (the address
+	 * of the local pointer variable), which would read/overwrite stack
+	 * memory instead of the vector. */
+	bcst_ret = MPI_Bcast(x, size, MPI_TYPE, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(bcst_ret == MPI_SUCCESS);
+
+	/* Create temporary buffers where all MPI processes are going to
+	 * compute Ai x = yi where Ai is the matrix containing the blocks of A
+	 * affected to process i, and 0 everywhere else. We then have y as the
+	 * sum of all yi. */
+	TYPE *yi = calloc(size, sizeof(TYPE));
+
+	/* Compute Aix = yi */
+	unsigned long i,j;
+	for (j = 0; j < nblocks; j++)
+	{
+		for (i = 0; i < nblocks; i++)
+		{
+			if (get_block_rank(i, j) == rank)
+			{
+				/* That block belongs to the current MPI process */
+				TYPE *block_data = STARPU_PLU(get_block)(i, j);
+				TYPE *sub_x = &x[i*block_size];
+				TYPE *sub_yi = &yi[j*block_size];
+
+				STARPU_PLU(compute_ax_block)(block_size, block_data, sub_x, sub_yi);
+			}
+		}
+	}
+
+	/* Compute the Sum of all yi = y */
+	MPI_Reduce(yi, y, size, MPI_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
+
+	fprintf(stderr, "RANK %d - FOO 1 y[0] %f\n", rank, y[0]);
+
+	free(yi);
+}
+
+/* Verify the factorization: gather the factored matrix, split it on
+ * rank 0 into the lower factor L (diagonal kept) and the unit-diagonal
+ * upper factor U, recompute L*U with BLAS TRMM, subtract the saved
+ * original matrix Asaved, and report average / max entry error and the
+ * relative Frobenius residual. Collective (reconstruct_matrix), but all
+ * reporting happens on rank 0.
+ * NOTE(review): the two malloc results are not checked before use. */
+void STARPU_PLU(compute_lu_matrix)(unsigned size, unsigned nblocks, TYPE *Asaved)
+{
+	TYPE *all_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);
+
+	unsigned display = STARPU_PLU(display_flag)();
+
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+	if (rank == 0)
+	{
+		TYPE *L = malloc((size_t)size*size*sizeof(TYPE));
+		TYPE *U = malloc((size_t)size*size*sizeof(TYPE));
+
+		memset(L, 0, size*size*sizeof(TYPE));
+		memset(U, 0, size*size*sizeof(TYPE));
+
+		/* only keep the lower part */
+		unsigned i, j;
+		for (j = 0; j < size; j++)
+		{
+			for (i = 0; i < j; i++)
+			{
+				L[j+i*size] = all_r[j+i*size];
+			}
+
+			/* diag i = j */
+			L[j+j*size] = all_r[j+j*size];
+			U[j+j*size] = 1.0;
+
+			for (i = j+1; i < size; i++)
+			{
+				U[j+i*size] = all_r[j+i*size];
+			}
+		}
+
+		STARPU_PLU(display_data_content)(L, size);
+		STARPU_PLU(display_data_content)(U, size);
+
+		/* now A_err = L, compute L*U */
+		CPU_TRMM("R", "U", "N", "U", size, size, 1.0f, U, size, L, size);
+
+		if (display)
+			fprintf(stderr, "\nLU\n");
+
+		STARPU_PLU(display_data_content)(L, size);
+
+		/* compute "LU - A" in L*/
+		CPU_AXPY(size*size, -1.0, Asaved, 1, L, 1);
+
+		/* err: 1-norm of the error; max: index of its largest entry. */
+		TYPE err = CPU_ASUM(size*size, L, 1);
+		int max = CPU_IAMAX(size*size, L, 1);
+
+		if (display)
+			fprintf(stderr, "DISPLAY ERROR\n");
+
+		STARPU_PLU(display_data_content)(L, size);
+
+		fprintf(stderr, "(A - LU) Avg error : %e\n", err/(size*size));
+		fprintf(stderr, "(A - LU) Max error : %e\n", L[max]);
+
+		double residual = frobenius_norm(L, size);
+		double matnorm = frobenius_norm(Asaved, size);
+
+		fprintf(stderr, "||A-LU|| / (||A||*N) : %e\n", residual/(matnorm*size));
+	}
+}

+ 19 - 0
nmad/examples/mpi_lu/plu_solve_double.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-double.h"
+#include "plu_solve.c"

+ 19 - 0
nmad/examples/mpi_lu/plu_solve_float.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-float.h"
+#include "plu_solve.c"

+ 19 - 0
nmad/examples/mpi_lu/pslu.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-float.h"
+#include "pxlu.c"

+ 19 - 0
nmad/examples/mpi_lu/pslu_kernels.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-float.h"
+#include "pxlu_kernels.c"

+ 870 - 0
nmad/examples/mpi_lu/pxlu.c

@@ -0,0 +1,870 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "pxlu.h"
+#include "pxlu_kernels.h"
+#include <sys/time.h>
+
+#define MPI_TAG11(k)	((1U << 16) | (k))
+#define MPI_TAG12(k, j)	((2U << 16) | (k)<<8 | (j))
+#define MPI_TAG21(k, i)	((3U << 16) | (i)<<8 | (k))
+
+// 11 21
+// 12 22
+
+#define TAG11(k)	((starpu_tag_t)( (1ULL<<50) | (unsigned long long)(k)))
+#define TAG12(k,j)	((starpu_tag_t)(((2ULL<<50) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(j))))
+#define TAG21(k,i)	((starpu_tag_t)(((3ULL<<50) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(i))))
+#define TAG22(k,i,j)	((starpu_tag_t)(((4ULL<<50) | ((unsigned long long)(k)<<32) 	\
+					| ((unsigned long long)(i)<<16)	\
+					| (unsigned long long)(j))))
+#define TAG11_SAVE(k)	((starpu_tag_t)( (5ULL<<50) | (unsigned long long)(k)))
+#define TAG12_SAVE(k,j)	((starpu_tag_t)(((6ULL<<50) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(j))))
+#define TAG21_SAVE(k,i)	((starpu_tag_t)(((7ULL<<50) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(i))))
+
+#define TAG11_SAVE_PARTIAL(k)	((starpu_tag_t)( (8ULL<<50) | (unsigned long long)(k)))
+#define TAG12_SAVE_PARTIAL(k,j)	((starpu_tag_t)(((9ULL<<50) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(j))))
+#define TAG21_SAVE_PARTIAL(k,i)	((starpu_tag_t)(((10ULL<<50) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(i))))
+
+#define STARPU_TAG_INIT	((starpu_tag_t)(11ULL<<50))
+
+//#define VERBOSE_INIT	1
+
+//#define DEBUG	1
+
+static unsigned no_prio = 0;
+
+static unsigned nblocks = 0;
+static int rank = -1;
+static int world_size = -1;
+
+struct callback_arg {
+	unsigned i, j, k;
+};
+
+/*
+ *	Various
+ */
+
+static struct debug_info *create_debug_info(unsigned i, unsigned j, unsigned k)
+{
+	struct debug_info *info = malloc(sizeof(struct debug_info));
+
+	info->i = i;
+	info->j = j;
+	info->k = k;
+
+	return info;
+}
+
+static struct starpu_task *create_task(starpu_tag_t id)
+{
+	struct starpu_task *task = starpu_task_create();
+		task->cl_arg = NULL;
+
+	task->use_tag = 1;
+	task->tag_id = id;
+
+	return task;
+}
+
+/* Send handle to every node appearing in the mask, and unlock tag once the
+ * transfers are done. */
+static void send_data_to_mask(starpu_data_handle_t handle, int *rank_mask, int mpi_tag, starpu_tag_t tag)
+{
+	unsigned cnt = 0;
+
+	STARPU_ASSERT(handle != STARPU_POISON_PTR);
+
+	int rank_array[world_size];
+	MPI_Comm comm_array[world_size];
+	int mpi_tag_array[world_size];
+	starpu_data_handle_t handle_array[world_size];
+
+	int r;
+	for (r = 0; r < world_size; r++)
+	{
+		if (rank_mask[r]) {
+			rank_array[cnt] = r;
+
+			comm_array[cnt] = MPI_COMM_WORLD;
+			mpi_tag_array[cnt] = mpi_tag;
+			handle_array[cnt] = handle;
+			cnt++;
+		}
+	}
+
+	if (cnt == 0)
+	{
+		/* In case there is no message to send, we release the tag at
+		 * once */
+		starpu_tag_notify_from_apps(tag);
+	}
+	else {
+		starpu_mpi_isend_array_detached_unlock_tag(cnt, handle_array,
+				rank_array, mpi_tag_array, comm_array, tag);
+	}
+}
+
+/* Initiate a receive request once all dependencies are fulfilled and unlock
+ * tag 'unlocked_tag' once it's done. */
+
/* Argument bundle for callback_receive_when_done(); heap-allocated by
 * receive_when_deps_are_done() and freed by the callback. */
struct recv_when_done_callback_arg {
	int source;		/* MPI rank the block is received from */
	int mpi_tag;		/* MPI message tag for the transfer */
	starpu_data_handle_t handle;	/* destination (temporary) buffer */
	starpu_tag_t unlocked_tag;	/* StarPU tag notified when the recv completes */
};

/* Fired once all local consumers of the temporary buffer are done: post the
 * detached MPI receive, which will notify 'unlocked_tag' on completion. */
static void callback_receive_when_done(void *_arg)
{
	struct recv_when_done_callback_arg *arg = _arg;

	starpu_mpi_irecv_detached_unlock_tag(arg->handle, arg->source,
			arg->mpi_tag, MPI_COMM_WORLD, arg->unlocked_tag);

	/* The argument was allocated by receive_when_deps_are_done(); this
	 * callback owns it and must release it. */
	free(arg);
}

/* Post a receive of 'handle' from 'source' only after the 'ndeps' tags in
 * 'deps_tags' are all released (they guard reuse of the temporary buffer).
 * 'partial_tag' names the synchronization task used to wait for those deps;
 * 'unlocked_tag' is notified once the data has actually arrived.
 * With ndeps == 0 the receive is posted immediately. */
static void receive_when_deps_are_done(unsigned ndeps, starpu_tag_t *deps_tags,
				int source, int mpi_tag,
				starpu_data_handle_t handle,
				starpu_tag_t partial_tag,
				starpu_tag_t unlocked_tag)
{
	STARPU_ASSERT(handle != STARPU_POISON_PTR);

	struct recv_when_done_callback_arg *arg =
		malloc(sizeof(struct recv_when_done_callback_arg));
	
	arg->source = source;
	arg->mpi_tag = mpi_tag;
	arg->handle = handle;
	arg->unlocked_tag = unlocked_tag;

	if (ndeps == 0)
	{
		/* Nothing guards the buffer: receive right away (the callback
		 * frees 'arg'). */
		callback_receive_when_done(arg);
		return;
	}

	/* Empty synchronization task whose callback posts the receive once
	 * every dependency tag is released. */
	starpu_create_sync_task(partial_tag, ndeps, deps_tags,
					callback_receive_when_done, arg);
}
+
+/*
+ *	Task 11 (diagonal factorization)
+ */
+
/* Reception side for the diagonal block 11_k: this node does not compute it,
 * so the factorized block is received over MPI into a temporary buffer once
 * every local consumer of that buffer's previous content is done. */
static void create_task_11_recv(unsigned k)
{
	/* The current node is not computing that task, so we receive the block
	 * with MPI */

	/* We don't issue a MPI receive request until everyone using the
	 * temporary buffer is done : 11_(k-1) can be used by 12_(k-1)j and
	 * 21(k-1)i with i,j >= k */
	unsigned ndeps = 0;
	starpu_tag_t tag_array[2*nblocks];
	
#ifdef SINGLE_TMP11
	/* A single temporary 11 buffer is shared across iterations, so the
	 * local users of iteration k-1 must have finished before reuse. */
	unsigned i, j;
	if (k > 0)
	for (i = (k-1)+1; i < nblocks; i++)
	{
		if (rank == get_block_rank(i, k-1))
			tag_array[ndeps++] = TAG21(k-1, i);
	}

	if (k > 0)
	for (j = (k-1)+1; j < nblocks; j++)
	{
		if (rank == get_block_rank(k-1, j))
			tag_array[ndeps++] = TAG12(k-1, j);
	}
#endif
	
	int source = get_block_rank(k, k);
#ifdef SINGLE_TMP11
	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_11_block_handle)();
#else
	/* One temporary buffer per iteration: no reuse dependencies needed. */
	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_11_block_handle)(k);
#endif
	int mpi_tag = MPI_TAG11(k);
	starpu_tag_t partial_tag = TAG11_SAVE_PARTIAL(k);
	starpu_tag_t unlocked_tag = TAG11_SAVE(k);

//	fprintf(stderr, "NODE %d - 11 (%d) - recv when done ndeps %d - tag array %lx\n", rank, k, ndeps, tag_array[0]);
	receive_when_deps_are_done(ndeps, tag_array, source, mpi_tag, block_handle, partial_tag, unlocked_tag);
}

/* Mark in 'rank_mask' (size world_size, zeroed here) every node that owns a
 * block updated from 11_k, i.e. the k-th row and k-th column tails. */
static void find_nodes_using_11(unsigned k, int *rank_mask)
{
	memset(rank_mask, 0, world_size*sizeof(int));

	/* Block 11_k is used to compute 12_kj + 12ki with i,j > k */
	unsigned i;
	for (i = k+1; i < nblocks; i++)
	{
		int r = get_block_rank(i, k);
		rank_mask[r] = 1;
	}

	unsigned j;
	for (j = k+1; j < nblocks; j++)
	{
		int r = get_block_rank(k, j);
		rank_mask[r] = 1;
	}
}

/* Completion callback of the local 11_k task: broadcast the factorized
 * diagonal block to every remote node that needs it. */
static void callback_task_11_real(void *_arg)
{
	struct callback_arg *arg = _arg;

	unsigned k = arg->k;

	/* Find all the nodes potentially requiring this block */
	int rank_mask[world_size];
	find_nodes_using_11(k, rank_mask);
	/* Do not send the block to ourselves. */
	rank_mask[rank] = 0;

	/* Send the block to those nodes */
	starpu_data_handle_t block_handle = STARPU_PLU(get_block_handle)(k, k);
	starpu_tag_t tag = TAG11_SAVE(k);
	int mpi_tag = MPI_TAG11(k);
	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
	
	free(arg);
}

/* Submit the local diagonal factorization task 11_k (getrf on block (k,k)).
 * Depends on 22_(k-1)kk, or on the global INIT tag for k == 0. */
static void create_task_11_real(unsigned k)
{
	struct starpu_task *task = create_task(TAG11(k));

	task->cl = &STARPU_PLU(cl11);

	task->cl_arg = create_debug_info(k, k, k);

	/* which sub-data is manipulated ? */
	task->handles[0] = STARPU_PLU(get_block_handle)(k, k);

	struct callback_arg *arg = malloc(sizeof(struct callback_arg));
		arg->k = k;

	task->callback_func = callback_task_11_real;
	task->callback_arg = arg;

	/* this is an important task */
	if (!no_prio)
		task->priority = STARPU_MAX_PRIO;

	/* enforce dependencies ... */
	if (k > 0) {
		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
	}
	else {
		starpu_tag_declare_deps(TAG11(k), 1, STARPU_TAG_INIT);
	}

	int ret = starpu_task_submit(task);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
}

/* Dispatch for step k of the diagonal: run 11_k locally if this node owns
 * block (k,k), otherwise post a receive if any local block consumes it. */
static void create_task_11(unsigned k)
{
	if (get_block_rank(k, k) == rank)
	{
#ifdef VERBOSE_INIT
		fprintf(stderr, "CREATE real task 11(%d) (TAG11_SAVE(%d) = %lx) on node %d\n", k, k, TAG11_SAVE(k), rank);
#endif
		create_task_11_real(k);
	}
	else {
		/* We don't handle the task, but perhaps we have to generate MPI transfers. */
		int rank_mask[world_size];
		find_nodes_using_11(k, rank_mask);
		
		if (rank_mask[rank])
		{
#ifdef VERBOSE_INIT
			fprintf(stderr, "create RECV task 11(%d) on node %d\n", k, rank);
#endif
			create_task_11_recv(k);
		}
		else {
#ifdef VERBOSE_INIT
			fprintf(stderr, "Node %d needs not 11(%d)\n", rank, k);
#endif
		}
	}
}
+
+
+
+/*
+ *	Task 12 (Update lower left (TRSM))
+ */
+
/* Reception side for block 12_kj: posted only once local 22 consumers of the
 * temporary buffer's previous content are done.  Note: without
 * SINGLE_TMP1221 there are two buffers per (j) rotated across iterations,
 * hence the k-2 generation in the dependency tags. */
static void create_task_12_recv(unsigned k, unsigned j)
{
	unsigned i;

	/* The current node is not computing that task, so we receive the block
	 * with MPI */

	/* We don't issue a MPI receive request until everyone using the
	 * temporary buffer is done : 12_(k-1)j can be used by 22_(k-1)ij with
	 * i >= k */
	unsigned ndeps = 0;
	starpu_tag_t tag_array[nblocks];
	
#ifdef SINGLE_TMP1221
	if (k > 0)
	for (i = (k-1)+1; i < nblocks; i++)
#else
	if (k > 1)
	for (i = (k-2)+1; i < nblocks; i++)
#endif
	{
		if (rank == get_block_rank(i, j))
#ifdef SINGLE_TMP1221
			tag_array[ndeps++] = TAG22(k-1, i, j);
#else
			tag_array[ndeps++] = TAG22(k-2, i, j);
#endif
	}
	
	int source = get_block_rank(k, j);
#ifdef SINGLE_TMP1221
	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_12_block_handle)(j);
#else
	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_12_block_handle)(j,k);
#endif
	int mpi_tag = MPI_TAG12(k, j);
	starpu_tag_t partial_tag = TAG12_SAVE_PARTIAL(k, j);
	starpu_tag_t unlocked_tag = TAG12_SAVE(k, j);

	receive_when_deps_are_done(ndeps, tag_array, source, mpi_tag, block_handle, partial_tag, unlocked_tag);
}

/* Mark in 'rank_mask' every node owning a 22_kij block (i > k) that is
 * updated from 12_kj. */
static void find_nodes_using_12(unsigned k, unsigned j, int *rank_mask)
{
	memset(rank_mask, 0, world_size*sizeof(int));

	/* Block 12_kj is used to compute 22_kij with i > k */
	unsigned i;
	for (i = k+1; i < nblocks; i++)
	{
		int r = get_block_rank(i, j);
		rank_mask[r] = 1;
	}
}

/* Completion callback of the local 12_kj task: broadcast the updated block
 * to the remote nodes that consume it. */
static void callback_task_12_real(void *_arg)
{
	struct callback_arg *arg = _arg;

	unsigned k = arg->k;
	unsigned j = arg->j;

	/* Find all the nodes potentially requiring this block */
	int rank_mask[world_size];
	find_nodes_using_12(k, j, rank_mask);
	rank_mask[rank] = 0;

	/* Send the block to those nodes */
	starpu_data_handle_t block_handle = STARPU_PLU(get_block_handle)(k, j);
	starpu_tag_t tag = TAG12_SAVE(k, j);
	int mpi_tag = MPI_TAG12(k, j);
	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
	
	free(arg);
}

/* Submit the local TRSM task 12_kj on block (k,j), using the 11_k diagonal
 * block (local handle or received temporary).
 * NOTE(review): cl21 is deliberately used here (see the #warning below); the
 * 12/21 codelets are swapped consistently with pxlu_kernels.c — do not
 * "fix" one side without the other. */
static void create_task_12_real(unsigned k, unsigned j)
{
	struct starpu_task *task = create_task(TAG12(k, j));
	
#warning temporary fix :/
//	task->cl = &STARPU_PLU(cl12);
	task->cl = &STARPU_PLU(cl21);

	task->cl_arg = create_debug_info(j, j, k);

	unsigned diag_block_is_local = (get_block_rank(k, k) == rank);

	starpu_tag_t tag_11_dep; 

	/* which sub-data is manipulated ? */
	starpu_data_handle_t diag_block;
	if (diag_block_is_local)
	{
		diag_block = STARPU_PLU(get_block_handle)(k, k);
		tag_11_dep = TAG11(k);
	}
	else 
	{
#ifdef SINGLE_TMP11
		diag_block = STARPU_PLU(get_tmp_11_block_handle)();
#else
		diag_block = STARPU_PLU(get_tmp_11_block_handle)(k);
#endif
		/* Depend on the completed reception instead of the local task. */
		tag_11_dep = TAG11_SAVE(k);
	}

	task->handles[0] = diag_block; 
	task->handles[1] = STARPU_PLU(get_block_handle)(k, j); 

	STARPU_ASSERT(get_block_rank(k, j) == rank);

	STARPU_ASSERT(task->handles[0] != STARPU_POISON_PTR);
	STARPU_ASSERT(task->handles[1] != STARPU_POISON_PTR);

	struct callback_arg *arg = malloc(sizeof(struct callback_arg));
		arg->j = j;
		arg->k = k;

	task->callback_func = callback_task_12_real;
	task->callback_arg = arg;

	/* The block adjacent to the diagonal is on the critical path. */
	if (!no_prio && (j == k+1)) {
		task->priority = STARPU_MAX_PRIO;
	}

	/* enforce dependencies ... */
	if (k > 0) {
		starpu_tag_declare_deps(TAG12(k, j), 2, tag_11_dep, TAG22(k-1, k, j));
	}
	else {
		starpu_tag_declare_deps(TAG12(k, j), 1, tag_11_dep);
	}

	int ret = starpu_task_submit(task);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
}

/* Dispatch for 12_kj: run locally if this node owns block (k,j), otherwise
 * post a receive when a local 22 block consumes it. */
static void create_task_12(unsigned k, unsigned j)
{
	if (get_block_rank(k, j) == rank)
	{
#ifdef VERBOSE_INIT
		fprintf(stderr, "CREATE real task 12(k = %d, j = %d) on node %d\n", k, j, rank);
#endif
		create_task_12_real(k, j);
	}
	else {
		/* We don't handle the task, but perhaps we have to generate MPI transfers. */
		int rank_mask[world_size];
		find_nodes_using_12(k, j, rank_mask);
		
		if (rank_mask[rank])
		{
#ifdef VERBOSE_INIT
			fprintf(stderr, "create RECV task 12(k = %d, j = %d) on node %d\n", k, j, rank);
#endif
			create_task_12_recv(k, j);
		}
		else {
#ifdef VERBOSE_INIT
			fprintf(stderr, "Node %d needs not 12(k=%d, i=%d)\n", rank, k, j);
#endif
		}
	}
}
+
+/*
+ *	Task 21 (Update upper right (TRSM))
+ */
+
/* Reception side for block 21_ki: mirror of create_task_12_recv(), guarding
 * the temporary 21 buffer against reuse before local 22 consumers finish. */
static void create_task_21_recv(unsigned k, unsigned i)
{
	unsigned j;

	/* The current node is not computing that task, so we receive the block
	 * with MPI */

	/* We don't issue a MPI receive request until everyone using the
	 * temporary buffer is done : 21_(k-1)i can be used by 22_(k-1)ij with
	 * j >= k */
	unsigned ndeps = 0;
	starpu_tag_t tag_array[nblocks];
	
#ifdef SINGLE_TMP1221
	if (k > 0)
	for (j = (k-1)+1; j < nblocks; j++)
#else
	/* Two rotated buffers per (i): the generation before last matters. */
	if (k > 1)
	for (j = (k-2)+1; j < nblocks; j++)
#endif
	{
		if (rank == get_block_rank(i, j))
#ifdef SINGLE_TMP1221
			tag_array[ndeps++] = TAG22(k-1, i, j);
#else
			tag_array[ndeps++] = TAG22(k-2, i, j);
#endif
	}

	int source = get_block_rank(i, k);
#ifdef SINGLE_TMP1221
	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_21_block_handle)(i);
#else
	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_21_block_handle)(i, k);
#endif
	int mpi_tag = MPI_TAG21(k, i);
	starpu_tag_t partial_tag = TAG21_SAVE_PARTIAL(k, i);
	starpu_tag_t unlocked_tag = TAG21_SAVE(k, i);

//	fprintf(stderr, "NODE %d - 21 (%d, %d) - recv when done ndeps %d - tag array %lx\n", rank, k, i, ndeps, tag_array[0]);
	receive_when_deps_are_done(ndeps, tag_array, source, mpi_tag, block_handle, partial_tag, unlocked_tag);
}

/* Mark in 'rank_mask' every node owning a 22_kij block (j > k) that is
 * updated from 21_ki. */
static void find_nodes_using_21(unsigned k, unsigned i, int *rank_mask)
{
	memset(rank_mask, 0, world_size*sizeof(int));

	/* Block 21_ki is used to compute 22_kij with j > k */
	unsigned j;
	for (j = k+1; j < nblocks; j++)
	{
		int r = get_block_rank(i, j);
		rank_mask[r] = 1;
	}
}

/* Completion callback of the local 21_ki task: broadcast the updated block
 * to the remote nodes that consume it. */
static void callback_task_21_real(void *_arg)
{
	struct callback_arg *arg = _arg;

	unsigned k = arg->k;
	unsigned i = arg->i;

	/* Find all the nodes potentially requiring this block */
	int rank_mask[world_size];
	find_nodes_using_21(k, i, rank_mask);
	rank_mask[rank] = 0;

	/* Send the block to those nodes */
	starpu_data_handle_t block_handle = STARPU_PLU(get_block_handle)(i, k);
	starpu_tag_t tag = TAG21_SAVE(k, i);
	int mpi_tag = MPI_TAG21(k, i);
	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
	
	free(arg);
}

/* Submit the local TRSM task 21_ki on block (i,k), using the 11_k diagonal
 * block (local handle or received temporary).
 * NOTE(review): cl12 is deliberately used here (see the #warning below),
 * matching the swapped 12/21 codelets in pxlu_kernels.c. */
static void create_task_21_real(unsigned k, unsigned i)
{
	struct starpu_task *task = create_task(TAG21(k, i));

#warning temporary fix 
//	task->cl = &STARPU_PLU(cl21);
	task->cl = &STARPU_PLU(cl12);

	task->cl_arg = create_debug_info(i, i, k);

	unsigned diag_block_is_local = (get_block_rank(k, k) == rank);

	starpu_tag_t tag_11_dep; 
	
	/* which sub-data is manipulated ? */
	starpu_data_handle_t diag_block;
	if (diag_block_is_local)
	{
		diag_block = STARPU_PLU(get_block_handle)(k, k);
		tag_11_dep = TAG11(k);
	}
	else 
	{
#ifdef SINGLE_TMP11
		diag_block = STARPU_PLU(get_tmp_11_block_handle)();
#else
		diag_block = STARPU_PLU(get_tmp_11_block_handle)(k);
#endif
		/* Depend on the completed reception instead of the local task. */
		tag_11_dep = TAG11_SAVE(k);
	}

	task->handles[0] = diag_block; 
	task->handles[1] = STARPU_PLU(get_block_handle)(i, k);

	STARPU_ASSERT(task->handles[0] != STARPU_POISON_PTR);
	STARPU_ASSERT(task->handles[1] != STARPU_POISON_PTR);

	struct callback_arg *arg = malloc(sizeof(struct callback_arg));
		arg->i = i;
		arg->k = k;

	task->callback_func = callback_task_21_real;
	task->callback_arg = arg;

	/* The block adjacent to the diagonal is on the critical path. */
	if (!no_prio && (i == k+1)) {
		task->priority = STARPU_MAX_PRIO;
	}

	/* enforce dependencies ... */
	if (k > 0) {
		starpu_tag_declare_deps(TAG21(k, i), 2, tag_11_dep, TAG22(k-1, i, k));
	}
	else {
		starpu_tag_declare_deps(TAG21(k, i), 1, tag_11_dep);
	}

	int ret = starpu_task_submit(task);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
}

/* Dispatch for 21_ki: run locally if this node owns block (i,k), otherwise
 * post a receive when a local 22 block consumes it. */
static void create_task_21(unsigned k, unsigned i)
{
	if (get_block_rank(i, k) == rank)
	{
#ifdef VERBOSE_INIT
		fprintf(stderr, "CREATE real task 21(k = %d, i = %d) on node %d\n", k, i, rank);
#endif
		create_task_21_real(k, i);
	}
	else {
		/* We don't handle the task, but perhaps we have to generate MPI transfers. */
		int rank_mask[world_size];
		find_nodes_using_21(k, i, rank_mask);
		
		if (rank_mask[rank])
		{
#ifdef VERBOSE_INIT
			fprintf(stderr, "create RECV task 21(k = %d, i = %d) on node %d\n", k, i, rank);
#endif
			create_task_21_recv(k, i);
		}
		else {
#ifdef VERBOSE_INIT
			fprintf(stderr, "Node %d needs not 21(k=%d, i=%d)\n", rank, k,i);
#endif
		}
	}
}
+
+/*
+ *	Task 22 (GEMM)
+ */
+
/* Submit the local GEMM update 22_kij on block (i,j), consuming the 21_ki
 * and 12_kj blocks (each either a local handle or a received temporary).
 * NOTE(review): handles[0]/handles[1] are intentionally swapped (block12
 * first) to match the swapped 12/21 codelets — see the #warning below. */
static void create_task_22_real(unsigned k, unsigned i, unsigned j)
{
//	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j));

	struct starpu_task *task = create_task(TAG22(k, i, j));

	task->cl = &STARPU_PLU(cl22);

	task->cl_arg = create_debug_info(i, j, k);

	/* which sub-data is manipulated ? */

	/* produced by TAG21_SAVE(k, i) */ 
	unsigned block21_is_local = (get_block_rank(i, k) == rank);
	starpu_tag_t tag_21_dep;

	starpu_data_handle_t block21;
	if (block21_is_local)
	{
		block21 = STARPU_PLU(get_block_handle)(i, k);
		tag_21_dep = TAG21(k, i);
	}
	else 
	{
#ifdef SINGLE_TMP1221
		block21 = STARPU_PLU(get_tmp_21_block_handle)(i);
#else
		block21 = STARPU_PLU(get_tmp_21_block_handle)(i, k);
#endif
		/* Depend on the completed reception instead of the local task. */
		tag_21_dep = TAG21_SAVE(k, i);
	}

	/* produced by TAG12_SAVE(k, j) */
	unsigned block12_is_local = (get_block_rank(k, j) == rank);
	starpu_tag_t tag_12_dep;

	starpu_data_handle_t block12;
	if (block12_is_local)
	{
	//	block12 = STARPU_PLU(get_block_handle)(j, k);
		block12 = STARPU_PLU(get_block_handle)(k, j);
		tag_12_dep = TAG12(k, j);
	}
	else 
	{
#ifdef SINGLE_TMP1221
		block12 = STARPU_PLU(get_tmp_12_block_handle)(j);
#else
		block12 = STARPU_PLU(get_tmp_12_block_handle)(j, k);
#endif
		tag_12_dep = TAG12_SAVE(k, j);
	}



#warning temporary fix :/
	//task->handles[0] = block21;
	task->handles[0] = block12;

	//task->handles[1] = block12;
	task->handles[1] = block21;

	/* produced by TAG22(k-1, i, j) */
	task->handles[2] = STARPU_PLU(get_block_handle)(i, j);

	STARPU_ASSERT(task->handles[0] != STARPU_POISON_PTR);
	STARPU_ASSERT(task->handles[1] != STARPU_POISON_PTR);
	STARPU_ASSERT(task->handles[2] != STARPU_POISON_PTR);

	/* The trailing block next to the diagonal is on the critical path. */
	if (!no_prio && (i == k + 1) && (j == k +1) ) {
		task->priority = STARPU_MAX_PRIO;
	}

	/* enforce dependencies ... */
	if (k > 0) {
		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), tag_12_dep, tag_21_dep);
	}
	else {
		starpu_tag_declare_deps(TAG22(k, i, j), 2, tag_12_dep, tag_21_dep);
	}

	int ret = starpu_task_submit(task);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
}

/* Dispatch for 22_kij: only the node owning block (i,j) runs the update;
 * all inputs are pushed to it by the 11/12/21 broadcast callbacks. */
static void create_task_22(unsigned k, unsigned i, unsigned j)
{
	if (get_block_rank(i, j) == rank)
	{
	//	fprintf(stderr, "CREATE real task 22(k = %d, i = %d, j = %d) on node %d\n", k, i, j, rank);
		create_task_22_real(k, i, j);
	}
//	else {
//		fprintf(stderr, "Node %d needs not 22(k=%d, i=%d, j = %d)\n", rank, k,i,j);
//	}
}
+
+static void wait_tag_and_fetch_handle(starpu_tag_t tag, starpu_data_handle_t handle)
+{
+	STARPU_ASSERT(handle != STARPU_POISON_PTR);
+
+	starpu_tag_wait(tag);
+//	fprintf(stderr, "Rank %d : tag %lx is done\n", rank, tag);
+
+	starpu_data_acquire(handle, STARPU_R);
+
+//	starpu_data_unregister(handle);
+}
+
+static void wait_termination(void)
+{
+	unsigned k, i, j;
+	for (k = 0; k < nblocks; k++)
+	{
+		/* Wait task 11k if needed */
+		if (get_block_rank(k, k) == rank)
+		{
+			starpu_data_handle_t diag_block = STARPU_PLU(get_block_handle)(k, k);
+			wait_tag_and_fetch_handle(TAG11_SAVE(k), diag_block);
+		}
+		
+
+		for (i = k + 1; i < nblocks; i++)
+		{
+			/* Wait task 21ki if needed */
+			if (get_block_rank(i, k) == rank)
+			{
+				starpu_data_handle_t block21 = STARPU_PLU(get_block_handle)(i, k);
+				//starpu_data_handle_t block21 = STARPU_PLU(get_block_handle)(k, i);
+				//fprintf(stderr, "BLOCK21 i %d k %d -> handle %p\n", i, k, block21);
+				wait_tag_and_fetch_handle(TAG21_SAVE(k, i), block21);
+			}
+		}
+
+		for (j = k + 1; j < nblocks; j++)
+		{
+			/* Wait task 12kj if needed */
+			if (get_block_rank(k, j) == rank)
+			{
+				//starpu_data_handle_t block12 = STARPU_PLU(get_block_handle)(j, k);
+				starpu_data_handle_t block12 = STARPU_PLU(get_block_handle)(k, j);
+				//fprintf(stderr, "BLOCK12 j %d k %d -> handle %p\n", j, k, block12);
+				wait_tag_and_fetch_handle(TAG12_SAVE(k, j), block12);
+			}
+		}
+	}	
+}
+
+/*
+ *	code to bootstrap the factorization 
+ */
+
/* Entry point of the distributed LU factorization: build the whole task DAG
 * for an _nblocks x _nblocks grid on this node, release the INIT tag to
 * start execution, and wait for all locally-owned results.
 * Returns the elapsed time as measured by starpu_timing_now() (the
 * commented trace below divides by 1000 to print milliseconds). */
double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
{
	double start;
	double end;

	/* Publish the problem geometry in the file-scope globals used by all
	 * the task-creation helpers. */
	nblocks = _nblocks;
	rank = _rank;
	world_size = _world_size;

	/* create all the DAG nodes */
	unsigned i,j,k;

	for (k = 0; k < nblocks; k++)
	{
		create_task_11(k);

		for (i = k+1; i<nblocks; i++)
		{
			create_task_12(k, i);
			create_task_21(k, i);
		}

		for (i = k+1; i<nblocks; i++)
		{
			for (j = k+1; j<nblocks; j++)
			{
				create_task_22(k, i, j);
			}
		}
	}

	/* Make sure every node has submitted its DAG before timing starts. */
	int barrier_ret = starpu_mpi_barrier(MPI_COMM_WORLD);
	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);

	/* schedule the codelet */
	start = starpu_timing_now();

	/* Releasing INIT unlocks the 11_0 task and thus the whole DAG. */
	starpu_tag_notify_from_apps(STARPU_TAG_INIT);

	wait_termination();
	
	end = starpu_timing_now();

	double timing = end - start;
	
//	fprintf(stderr, "RANK %d -> took %f ms\n", rank, timing/1000);
	
	return timing;
}

+ 68 - 0
nmad/examples/mpi_lu/pxlu.h

@@ -0,0 +1,68 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2012, 2014  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __PXLU_H__
+#define __PXLU_H__
+
+#include <starpu.h>
+#include <common/blas.h>
+#include <starpu_mpi.h>
+#ifdef STARPU_USE_CUDA
+#include <cublas.h>
+#endif
+
+#define BLAS3_FLOP(n1,n2,n3)    \
+        (2*((uint64_t)n1)*((uint64_t)n2)*((uint64_t)n3))
+
+//#define SINGLE_TMP11	1
+//#define SINGLE_TMP1221	1
+
/* Block coordinates attached to each task as cl_arg for verbose tracing. */
struct debug_info {
	unsigned i;
	unsigned j;
	unsigned k;
};

/* Build and run the distributed LU DAG; returns the elapsed time (see
 * pxlu.c). */
double STARPU_PLU(plu_main)(unsigned nblocks, int rank, int world_size);

/* Verification helpers: rebuild the full matrix from the distributed blocks
 * and recompute/check the LU factors (defined in the example's support
 * code). */
TYPE *STARPU_PLU(reconstruct_matrix)(unsigned size, unsigned nblocks);
void STARPU_PLU(compute_lu_matrix)(unsigned size, unsigned nblocks, TYPE *Asaved);

unsigned STARPU_PLU(display_flag)(void);

void STARPU_PLU(compute_ax)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, int rank);
void STARPU_PLU(compute_lux)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, int rank);
/* Accessors for the locally-registered block (i,j). */
starpu_data_handle_t STARPU_PLU(get_block_handle)(unsigned i, unsigned j);
TYPE *STARPU_PLU(get_block)(unsigned i, unsigned j);
/* Temporary reception buffers; their signatures depend on whether a single
 * shared buffer (SINGLE_TMP11 / SINGLE_TMP1221) or per-iteration buffers
 * are used. */
#ifdef SINGLE_TMP11
starpu_data_handle_t STARPU_PLU(get_tmp_11_block_handle)(void);
#else
starpu_data_handle_t STARPU_PLU(get_tmp_11_block_handle)(unsigned k);
#endif
#ifdef SINGLE_TMP1221
starpu_data_handle_t STARPU_PLU(get_tmp_12_block_handle)(unsigned j);
starpu_data_handle_t STARPU_PLU(get_tmp_21_block_handle)(unsigned i);
#else
starpu_data_handle_t STARPU_PLU(get_tmp_12_block_handle)(unsigned j, unsigned k);
starpu_data_handle_t STARPU_PLU(get_tmp_21_block_handle)(unsigned i, unsigned k);
#endif

void STARPU_PLU(display_data_content)(TYPE *data, unsigned blocksize);

/* Owner rank of block (i,j) in the data distribution. */
int get_block_rank(unsigned i, unsigned j);

#endif // __PXLU_H__

+ 442 - 0
nmad/examples/mpi_lu/pxlu_kernels.c

@@ -0,0 +1,442 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2012  Université de Bordeaux
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "pxlu.h"
+#include "pxlu_kernels.h"
+#include <math.h>
+
+///#define VERBOSE_KERNELS	1
+
+/*
+ * U22
+ */
+
/* GEMM update kernel (22): center -= right * left, dispatched to the CPU
 * BLAS or CUBLAS implementation by 's' (0 = CPU, 1 = CUDA). */
static inline void STARPU_PLU(common_u22)(void *descr[],
				int s, STARPU_ATTRIBUTE_UNUSED void *_args)
{
	TYPE *right 	= (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
	TYPE *left 	= (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
	TYPE *center 	= (TYPE *)STARPU_MATRIX_GET_PTR(descr[2]);

	unsigned dx = STARPU_MATRIX_GET_NX(descr[2]);
	unsigned dy = STARPU_MATRIX_GET_NY(descr[2]);
	unsigned dz = STARPU_MATRIX_GET_NY(descr[0]);

	unsigned ld12 = STARPU_MATRIX_GET_LD(descr[0]);
	unsigned ld21 = STARPU_MATRIX_GET_LD(descr[1]);
	unsigned ld22 = STARPU_MATRIX_GET_LD(descr[2]);

#ifdef VERBOSE_KERNELS
	struct debug_info *info = _args;

	int rank;
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	fprintf(stderr, "KERNEL 22 %d - k = %d i = %d j = %d\n", rank, info->k, info->i, info->j);
#endif

#ifdef STARPU_USE_CUDA
	cublasStatus status;
	cudaError_t cures;
#endif

	switch (s) {
		case 0:
			CPU_GEMM("N", "N", dy, dx, dz,
				(TYPE)-1.0, right, ld21, left, ld12,
				(TYPE)1.0, center, ld22);
			break;

#ifdef STARPU_USE_CUDA
		case 1:
			/* NOTE(review): dx/dy are passed in the opposite order
			 * to the CPU call above — confirm this is the intended
			 * m/n convention for the CUBLAS wrapper. */
			CUBLAS_GEMM('n', 'n', dx, dy, dz,
				(TYPE)-1.0, right, ld21, left, ld12,
				(TYPE)1.0f, center, ld22);

			status = cublasGetError();
			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
				STARPU_CUBLAS_REPORT_ERROR(status);

			/* The legacy CUBLAS call is asynchronous: wait for it. */
			if (STARPU_UNLIKELY((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess))
				STARPU_CUDA_REPORT_ERROR(cures);

			break;
#endif
		default:
			STARPU_ABORT();
			break;
	}
#ifdef VERBOSE_KERNELS
	fprintf(stderr, "KERNEL 22 %d - k = %d i = %d j = %d done\n", rank, info->k, info->i, info->j);
#endif
}

/* CPU entry point for the 22 codelet. */
static void STARPU_PLU(cpu_u22)(void *descr[], void *_args)
{
	STARPU_PLU(common_u22)(descr, 0, _args);
}

#ifdef STARPU_USE_CUDA
/* CUDA entry point for the 22 codelet. */
static void STARPU_PLU(cublas_u22)(void *descr[], void *_args)
{
	STARPU_PLU(common_u22)(descr, 1, _args);
}
#endif// STARPU_USE_CUDA

/* History-based performance model; the symbol encodes the BLAS backend so
 * calibrations are not mixed across libraries. */
static struct starpu_perfmodel STARPU_PLU(model_22) = {
	.type = STARPU_HISTORY_BASED,
#ifdef STARPU_ATLAS
	.symbol = STARPU_PLU_STR(lu_model_22_atlas)
#elif defined(STARPU_GOTO)
	.symbol = STARPU_PLU_STR(lu_model_22_goto)
#else
	.symbol = STARPU_PLU_STR(lu_model_22)
#endif
};

/* Codelet for the trailing update: reads the 12 and 21 blocks, updates the
 * 22 block in place. */
struct starpu_codelet STARPU_PLU(cl22) = {
	.where = STARPU_CPU|STARPU_CUDA,
	.cpu_funcs = {STARPU_PLU(cpu_u22)},
#ifdef STARPU_USE_CUDA
	.cuda_funcs = {STARPU_PLU(cublas_u22)},
#endif
	.nbuffers = 3,
	.modes = {STARPU_R, STARPU_R, STARPU_RW},
	.model = &STARPU_PLU(model_22)
};
+
+
+/*
+ * U12
+ */
+
/* TRSM kernel (12): solve L11 * U12 = A12 for U12 in place, dispatched to
 * CPU BLAS or CUBLAS by 's' (0 = CPU, 1 = CUDA).
 * NOTE(review): due to the "temporary fix" in pxlu.c, this codelet is
 * submitted for the 21-type updates — the debug traces below are relabelled
 * accordingly. */
static inline void STARPU_PLU(common_u12)(void *descr[],
				int s, STARPU_ATTRIBUTE_UNUSED void *_args)
{
	TYPE *sub11;
	TYPE *sub12;

	sub11 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
	sub12 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);

	unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]);
	unsigned ld12 = STARPU_MATRIX_GET_LD(descr[1]);

	unsigned nx12 = STARPU_MATRIX_GET_NX(descr[1]);
	unsigned ny12 = STARPU_MATRIX_GET_NY(descr[1]);

#ifdef VERBOSE_KERNELS
	struct debug_info *info = _args;

	int rank;
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
#warning fixed debugging according to other tweak
	//fprintf(stderr, "KERNEL 12 %d - k = %d i %d\n", rank, info->k, info->i);
	fprintf(stderr, "KERNEL 21 %d - k = %d i %d\n", rank, info->k, info->j);

	//fprintf(stderr, "INPUT 12 U11\n");
	fprintf(stderr, "INPUT 21 U11\n");
	STARPU_PLU(display_data_content)(sub11, nx12);
	//fprintf(stderr, "INPUT 12 U12\n");
	fprintf(stderr, "INPUT 21 U21\n");
	STARPU_PLU(display_data_content)(sub12, nx12);
#endif

#ifdef STARPU_USE_CUDA
	cublasStatus status;
	cudaError_t cures;
#endif

	/* solve L11 U12 = A12 (find U12) */
	switch (s) {
		case 0:
			CPU_TRSM("L", "L", "N", "N", nx12, ny12,
					(TYPE)1.0, sub11, ld11, sub12, ld12);
			break;
#ifdef STARPU_USE_CUDA
		case 1:
			/* NOTE(review): nx/ny swapped versus the CPU call —
			 * presumably the m/n convention of the wrapper;
			 * confirm. */
			CUBLAS_TRSM('L', 'L', 'N', 'N', ny12, nx12,
					(TYPE)1.0, sub11, ld11, sub12, ld12);

			status = cublasGetError();
			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
				STARPU_CUBLAS_REPORT_ERROR(status);

			/* The legacy CUBLAS call is asynchronous: wait for it. */
			if (STARPU_UNLIKELY((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess))
				STARPU_CUDA_REPORT_ERROR(cures);

			break;
#endif
		default:
			STARPU_ABORT();
			break;
	}

#ifdef VERBOSE_KERNELS
	//fprintf(stderr, "OUTPUT 12 U12\n");
	fprintf(stderr, "OUTPUT 21 U21\n");
	STARPU_PLU(display_data_content)(sub12, nx12);
#endif
}

/* CPU entry point for the 12 codelet. */
static void STARPU_PLU(cpu_u12)(void *descr[], void *_args)
{
	STARPU_PLU(common_u12)(descr, 0, _args);
}

#ifdef STARPU_USE_CUDA
/* CUDA entry point for the 12 codelet. */
static void STARPU_PLU(cublas_u12)(void *descr[], void *_args)
{
	STARPU_PLU(common_u12)(descr, 1, _args);
}
#endif // STARPU_USE_CUDA

/* History-based performance model; the symbol encodes the BLAS backend. */
static struct starpu_perfmodel STARPU_PLU(model_12) = {
	.type = STARPU_HISTORY_BASED,
#ifdef STARPU_ATLAS
	.symbol = STARPU_PLU_STR(lu_model_12_atlas)
#elif defined(STARPU_GOTO)
	.symbol = STARPU_PLU_STR(lu_model_12_goto)
#else
	.symbol = STARPU_PLU_STR(lu_model_12)
#endif
};

/* Codelet: reads the diagonal block, updates the panel block in place. */
struct starpu_codelet STARPU_PLU(cl12) = {
	.where = STARPU_CPU|STARPU_CUDA,
	.cpu_funcs = {STARPU_PLU(cpu_u12)},
#ifdef STARPU_USE_CUDA
	.cuda_funcs = {STARPU_PLU(cublas_u12)},
#endif
	.nbuffers = 2,
	.modes = {STARPU_R, STARPU_RW},
	.model = &STARPU_PLU(model_12)
};
+
+
+/*
+ * U21
+ */
+
+/* Common implementation of the U21 update: a right-side triangular solve
+ * (TRSM) that overwrites the panel block sub21 using the factorized
+ * diagonal block sub11.  descr[0] is read-only, descr[1] is read-write;
+ * s selects the backend (0 = CPU BLAS, 1 = CUBLAS). */
+static inline void STARPU_PLU(common_u21)(void *descr[],
+				int s, STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	TYPE *sub11;
+	TYPE *sub21;
+
+	sub11 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
+	sub21 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
+
+	unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]);
+	unsigned ld21 = STARPU_MATRIX_GET_LD(descr[1]);
+
+	unsigned nx21 = STARPU_MATRIX_GET_NX(descr[1]);
+	unsigned ny21 = STARPU_MATRIX_GET_NY(descr[1]);
+
+#ifdef VERBOSE_KERNELS
+	struct debug_info *info = _args;
+
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	/* NOTE(review): the traces below deliberately print "12" labels while
+	 * this is the u21 kernel (see the #warning): the 12/21 naming was
+	 * swapped elsewhere — confirm against the matching tweak before
+	 * renaming anything here. */
+#warning fixed debugging according to other tweak
+	//fprintf(stderr, "KERNEL 21 %d (k = %d, i = %d)\n", rank, info->k, info->i);
+	fprintf(stderr, "KERNEL 12 %d (k = %d, j = %d)\n", rank, info->k, info->j);
+
+	//fprintf(stderr, "INPUT 21 U11\n");
+	fprintf(stderr, "INPUT 12 U11\n");
+	STARPU_PLU(display_data_content)(sub11, nx21);
+	//fprintf(stderr, "INPUT 21 U21\n");
+	fprintf(stderr, "INPUT 12 U12\n");
+	STARPU_PLU(display_data_content)(sub21, nx21);
+#endif
+
+#ifdef STARPU_USE_CUDA
+	cublasStatus status;
+#endif
+
+
+	switch (s) {
+		case 0:
+			/* Right side, upper triangular, no transpose, unit diagonal. */
+			CPU_TRSM("R", "U", "N", "U", nx21, ny21,
+					(TYPE)1.0, sub11, ld11, sub21, ld21);
+			break;
+#ifdef STARPU_USE_CUDA
+		case 1:
+			/* NOTE(review): dimensions are (ny21, nx21) here but
+			 * (nx21, ny21) on the CPU path — presumably compensating for
+			 * a row/column-major convention difference; confirm the two
+			 * paths compute the same update. */
+			CUBLAS_TRSM('R', 'U', 'N', 'U', ny21, nx21,
+					(TYPE)1.0, sub11, ld11, sub21, ld21);
+
+			status = cublasGetError();
+			if (status != CUBLAS_STATUS_SUCCESS)
+				STARPU_CUBLAS_REPORT_ERROR(status);
+
+			/* CUBLAS calls are asynchronous: wait for completion before
+			 * StarPU hands the data to dependent tasks. */
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+
+#ifdef VERBOSE_KERNELS
+	//fprintf(stderr, "OUTPUT 21 U11\n");
+	fprintf(stderr, "OUTPUT 12 U11\n");
+	STARPU_PLU(display_data_content)(sub11, nx21);
+	//fprintf(stderr, "OUTPUT 21 U21\n");
+	fprintf(stderr, "OUTPUT 12 U12\n");
+	STARPU_PLU(display_data_content)(sub21, nx21);
+#endif
+}
+
+/* CPU entry point: run the common U21 kernel with the CPU BLAS backend. */
+static void STARPU_PLU(cpu_u21)(void *descr[], void *arg)
+{
+	STARPU_PLU(common_u21)(descr, 0, arg);
+}
+
+#ifdef STARPU_USE_CUDA
+/* CUDA entry point: run the common U21 kernel with the CUBLAS backend. */
+static void STARPU_PLU(cublas_u21)(void *descr[], void *arg)
+{
+	STARPU_PLU(common_u21)(descr, 1, arg);
+}
+#endif /* STARPU_USE_CUDA */
+
+/* History-based performance model for the U21 kernel; the symbol is
+ * suffixed with the BLAS backend so calibrations are kept separate. */
+static struct starpu_perfmodel STARPU_PLU(model_21) = {
+	.type = STARPU_HISTORY_BASED,
+#ifdef STARPU_ATLAS
+	.symbol = STARPU_PLU_STR(lu_model_21_atlas)
+#elif defined(STARPU_GOTO)
+	.symbol = STARPU_PLU_STR(lu_model_21_goto)
+#else
+	.symbol = STARPU_PLU_STR(lu_model_21)
+#endif
+};
+
+/* Codelet for the U21 (TRSM) update: buffer 0 is the factorized diagonal
+ * block (read-only), buffer 1 is the panel block being updated
+ * (read-write).  Only advertise CUDA in .where when CUDA support is
+ * compiled in, so the mask never claims an implementation missing from
+ * .cuda_funcs. */
+struct starpu_codelet STARPU_PLU(cl21) = {
+	.where = STARPU_CPU
+#ifdef STARPU_USE_CUDA
+		|STARPU_CUDA
+#endif
+		,
+	.cpu_funcs = {STARPU_PLU(cpu_u21)},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {STARPU_PLU(cublas_u21)},
+#endif
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_RW},
+	.model = &STARPU_PLU(model_21)
+};
+
+
+/*
+ *	U11
+ */
+
+/* Common implementation of the U11 kernel: unblocked, non-pivoting LU
+ * factorization of the diagonal block, performed in place on descr[0].
+ * s selects the backend (0 = CPU, 1 = CUDA). */
+static inline void STARPU_PLU(common_u11)(void *descr[],
+				int s, STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	TYPE *sub11;
+
+	sub11 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
+
+	unsigned long nx = STARPU_MATRIX_GET_NX(descr[0]);
+	unsigned long ld = STARPU_MATRIX_GET_LD(descr[0]);
+
+	unsigned long z;
+
+#ifdef VERBOSE_KERNELS
+	struct debug_info *info = _args;
+
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	fprintf(stderr, "KERNEL 11 %d - k = %d\n", rank, info->k);
+#endif
+
+	switch (s) {
+		case 0:
+			for (z = 0; z < nx; z++)
+			{
+				TYPE pivot;
+				pivot = sub11[z+z*ld];
+				/* No pivoting: a zero diagonal entry cannot be factorized. */
+				STARPU_ASSERT(pivot != 0.0);
+
+				/* Scale the pivot row, then apply a rank-1 update to the
+				 * trailing (nx-z-1) x (nx-z-1) submatrix. */
+				CPU_SCAL(nx - z - 1, (1.0/pivot), &sub11[z+(z+1)*ld], ld);
+
+				CPU_GER(nx - z - 1, nx - z - 1, -1.0,
+						&sub11[(z+1)+z*ld], 1,
+						&sub11[z+(z+1)*ld], ld,
+						&sub11[(z+1) + (z+1)*ld],ld);
+			}
+			break;
+#ifdef STARPU_USE_CUDA
+		case 1:
+			for (z = 0; z < nx; z++)
+			{
+				TYPE pivot;
+				/* The pivot lives in device memory: copy it back to the
+				 * host to test it (one stream sync per column). */
+				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+				cudaStreamSynchronize(starpu_cuda_get_local_stream());
+
+				STARPU_ASSERT(pivot != 0.0);
+
+				CUBLAS_SCAL(nx - z - 1, 1.0/pivot, &sub11[z+(z+1)*ld], ld);
+
+				CUBLAS_GER(nx - z - 1, nx - z - 1, -1.0,
+						&sub11[(z+1)+z*ld], 1,
+						&sub11[z+(z+1)*ld], ld,
+						&sub11[(z+1) + (z+1)*ld],ld);
+			}
+
+			/* Wait for the asynchronous CUBLAS work before returning. */
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+#ifdef VERBOSE_KERNELS
+	/* Same message as on entry: marks kernel completion in the trace. */
+	fprintf(stderr, "KERNEL 11 %d - k = %d\n", rank, info->k);
+#endif
+}
+
+/* CPU entry point: run the common U11 kernel with the CPU backend. */
+static void STARPU_PLU(cpu_u11)(void *descr[], void *arg)
+{
+	STARPU_PLU(common_u11)(descr, 0, arg);
+}
+
+#ifdef STARPU_USE_CUDA
+/* CUDA entry point: run the common U11 kernel with the CUDA backend. */
+static void STARPU_PLU(cublas_u11)(void *descr[], void *arg)
+{
+	STARPU_PLU(common_u11)(descr, 1, arg);
+}
+#endif /* STARPU_USE_CUDA */
+
+/* History-based performance model for the U11 kernel; the symbol is
+ * suffixed with the BLAS backend so calibrations are kept separate. */
+static struct starpu_perfmodel STARPU_PLU(model_11) = {
+	.type = STARPU_HISTORY_BASED,
+#ifdef STARPU_ATLAS
+	.symbol = STARPU_PLU_STR(lu_model_11_atlas)
+#elif defined(STARPU_GOTO)
+	.symbol = STARPU_PLU_STR(lu_model_11_goto)
+#else
+	.symbol = STARPU_PLU_STR(lu_model_11)
+#endif
+};
+
+/* Codelet for the U11 diagonal-block factorization: a single read-write
+ * buffer holding the block to factorize in place.  Only advertise CUDA
+ * in .where when CUDA support is compiled in, so the mask never claims
+ * an implementation missing from .cuda_funcs. */
+struct starpu_codelet STARPU_PLU(cl11) = {
+	.where = STARPU_CPU
+#ifdef STARPU_USE_CUDA
+		|STARPU_CUDA
+#endif
+		,
+	.cpu_funcs = {STARPU_PLU(cpu_u11)},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {STARPU_PLU(cublas_u11)},
+#endif
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
+	.model = &STARPU_PLU(model_11)
+};

+ 32 - 0
nmad/examples/mpi_lu/pxlu_kernels.h

@@ -0,0 +1,32 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2012, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __PXLU_KERNELS_H__
+#define __PXLU_KERNELS_H__
+
+#include <starpu.h>
+
+/* Build the performance-model symbol string from the STARPU_PLU()
+ * name-mangling macro. */
+#define str(s) #s
+#define xstr(s)        str(s)
+#define STARPU_PLU_STR(name)  xstr(STARPU_PLU(name))
+
+/* Codelets defined in pxlu_kernels.c.  'extern' is required: without it
+ * each translation unit including this header creates a tentative
+ * definition, which fails to link with -fno-common toolchains. */
+extern struct starpu_codelet STARPU_PLU(cl11);
+extern struct starpu_codelet STARPU_PLU(cl12);
+extern struct starpu_codelet STARPU_PLU(cl21);
+extern struct starpu_codelet STARPU_PLU(cl22);
+
+#endif // __PXLU_KERNELS_H__

+ 19 - 0
nmad/examples/mpi_lu/slu_kernels.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-float.h"
+#include "xlu_kernels.c"

+ 106 - 0
nmad/examples/perf.sh

@@ -0,0 +1,106 @@
+#!/bin/bash
+
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+# 
+# Copyright (C) 2010  Université de Bordeaux
+# Copyright (C) 2010  Centre National de la Recherche Scientifique
+# 
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+# 
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# 
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+# 4G x np = 4 * (k*1K) ^ 2
+# A G * np = 4 * k^2 * 1M
+# A * 250 * np = k^2
+# A = 6
+# k = sqrt(1500*np)
+# np = 1 => k = 32
+# np = 2 => k = 48
+# np = 3 => k = 64 
+# np = 4 => k = 64
+
+# Problem size
+NBLOCKS=16
+BLOCKSIZE=1024
+SIZE=$(($NBLOCKS*$BLOCKSIZE))
+
+echo "JOB ID ${PBS_JOBID}"
+
+nnodes=$(cat machinefile.${PBS_JOBID}|wc -l)
+echo "got $nnodes mpi nodes"
+
+# Calibrate
+ncalibrate=0
+for i in `seq 1 $ncalibrate`
+do
+echo "STARPU_CALIBRATE $i/$ncalibrate"
+STARPU_CALIBRATE=1 STARPU_SCHED="dmda" STARPU_PREFETCH=1 mpirun -machinefile machinefile.${PBS_JOBID} -np $nnodes ./mpi_lu/plu_example_float -p 2 -q 2 -nblocks 32 -size $((32*$BLOCKSIZE)) -numa
+done
+
+# func NGPUS NP P Q NBLOCKS
+# Runs one LU decomposition with the given process grid (p x q) and
+# problem size; output goes to ./log (per run) and ./log.all (appended).
+func()
+{
+ngpus=$1
+np=$2
+p=$3
+q=$4
+nblocks=$5
+
+# Banner: the first redirection truncates the per-run log
+echo "*******************************************"> log
+echo "*************** NGPUS $ngpus - np $np - nblocks $nblocks **************">> log
+echo "*******************************************">> log
+cat log
+cat log >> log.all
+
+# CPU workers disabled (STARPU_NCPUS=0): measure GPU-only performance
+STARPU_NCPUS=0 STARPU_NCUDA=$ngpus STARPU_SCHED="dmda" STARPU_PREFETCH=1 mpirun -machinefile machinefile.${PBS_JOBID} -np $np ./mpi_lu/plu_example_float -p $p -q $q -nblocks $nblocks -size $(($nblocks * $BLOCKSIZE)) -numa > log.out 2> log.err
+cat log.out > log
+cat log.err >> log
+cat log
+cat log >> log.all
+}
+
+rm -f log.all
+
+#how many time do we repeat each experiment ?
+nloops=3
+
+# Per-node memory budget in MB; larger problems are skipped below
+per_node_max_memory=7000
+
+# Sweep node counts, problem sizes and GPUs per node
+for np in 1 2 4
+do
+	for nblocks in 16 32 48 64 80
+	do
+		for ngpus_per_node in 1 2 3 4
+		do
+			for loop in `seq 1 $nloops`
+			do
+				# Compute p and q from np
+				case $np in
+				  1) p=1; q=1;;
+				  2) p=2; q=1;;
+				  4) p=2; q=2;;
+				  *) echo -n "does not support $np nodes yet";;
+				esac
+
+				# Does the problem fit into memory ?
+				# 4 bytes per float element, spread across np nodes (MB)
+				matrix_size=$(($nblocks * $BLOCKSIZE))
+				per_node_memory=$(($((4*$matrix_size*$matrix_size/(1024*1024))) / $np))
+
+				echo "NP $np P $p Q $q SIZE $per_node_memory NBLOCKS $nblocks"
+
+				if test $per_node_memory -ge $per_node_max_memory; then
+						echo "Problem is too large !"
+				else
+					func $ngpus_per_node $np $p $q $nblocks
+					echo "go !"
+				fi
+			done
+		done
+	done
+done

+ 258 - 0
nmad/examples/stencil/stencil5.c

@@ -0,0 +1,258 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2015              Université Bordeaux
+ * Copyright (C) 2011, 2012, 2013, 2014, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+#define FPRINTF_MPI(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) { \
+    						int _disp_rank; MPI_Comm_rank(MPI_COMM_WORLD, &_disp_rank);       \
+                                                fprintf(ofile, "[%d][starpu_mpi][%s] " fmt , _disp_rank, __starpu_func__ ,## __VA_ARGS__); \
+                                                fflush(ofile); }} while(0);
+
+/* 5-point stencil kernel: replace the centre value by the average of
+ * itself and its four direct neighbours. */
+void stencil5_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	float *center = (float *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	float *west   = (float *)STARPU_VARIABLE_GET_PTR(descr[1]);
+	float *east   = (float *)STARPU_VARIABLE_GET_PTR(descr[2]);
+	float *south  = (float *)STARPU_VARIABLE_GET_PTR(descr[3]);
+	float *north  = (float *)STARPU_VARIABLE_GET_PTR(descr[4]);
+
+	*center = (*center + *west + *east + *south + *north) / 5;
+}
+
+/* Codelet wrapping the CPU stencil kernel: one read-write centre value
+ * followed by four read-only neighbours. */
+struct starpu_codelet stencil5_cl =
+{
+	.cpu_funcs = {stencil5_cpu},
+	.nbuffers = 5,
+	.modes = {STARPU_RW, STARPU_R, STARPU_R, STARPU_R, STARPU_R}
+};
+
+#ifdef STARPU_QUICK_CHECK
+#  define NITER_DEF	100
+#  define X         	5
+#  define Y         	5
+#else
+#  define NITER_DEF	100
+#  define X         	20
+#  define Y         	20
+#endif
+
+int display = 0;	/* -display: dump the matrix before and after */
+int niter = NITER_DEF;	/* -iter <n>: number of stencil iterations per phase */
+
+/* Returns the MPI node number where data indexes index is */
+int my_distrib(int x, int y, int nb_nodes)
+{
+	/* Block distrib */
+	return ((int)(x / sqrt(nb_nodes) + (y / sqrt(nb_nodes)) * sqrt(nb_nodes))) % nb_nodes;
+}
+
+/* Shifted distribution, for migration example */
+int my_distrib2(int x, int y, int nb_nodes)
+{
+	return (my_distrib(x, y, nb_nodes) + 1) % nb_nodes;
+}
+
+/* Parse command-line options:
+ *   -iter <n>   set the number of stencil iterations per phase
+ *   -display    print the matrix before and after the computation
+ * Unknown options are silently ignored. */
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-iter") == 0)
+		{
+			/* Guard against "-iter" given as the last argument: the
+			 * original code read argv[argc] (out of bounds). */
+			if (i + 1 < argc)
+			{
+				char *argptr;
+				niter = strtol(argv[++i], &argptr, 10);
+			}
+		}
+		else if (strcmp(argv[i], "-display") == 0)
+		{
+			display = 1;
+		}
+	}
+}
+
+/* Distributed 5-point stencil demo: runs niter iterations with an
+ * initial block distribution, migrates the data to a shifted
+ * distribution, runs niter more iterations, then gathers the data back.
+ * Returns 0 on success (initialization failures abort). */
+int main(int argc, char **argv)
+{
+	int my_rank, size, x, y, loop;
+	float mean=0;
+	float matrix[X][Y];
+	starpu_data_handle_t data_handles[X][Y];
+
+	int ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	/* Check the MPI-layer initialization too (its return value was
+	 * previously ignored, unlike starpu_init's). */
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	parse_args(argc, argv);
+
+	/* Initial data values */
+	starpu_srand48((long int)time(NULL));
+	for(x = 0; x < X; x++)
+	{
+		for (y = 0; y < Y; y++)
+		{
+			matrix[x][y] = (float)starpu_drand48();
+			mean += matrix[x][y];
+		}
+	}
+	mean /= (X*Y);
+
+	if (display)
+	{
+		FPRINTF_MPI(stdout, "mean=%2.2f\n", mean);
+		for(x = 0; x < X; x++)
+		{
+			fprintf(stdout, "[%d] ", my_rank);
+			for (y = 0; y < Y; y++)
+			{
+				fprintf(stdout, "%2.2f ", matrix[x][y]);
+			}
+			fprintf(stdout, "\n");
+		}
+	}
+
+	/* Initial distribution: register only what this rank owns or needs
+	 * as a direct neighbour; the MPI tag encodes the (x, y) coordinate. */
+	for(x = 0; x < X; x++)
+	{
+		for (y = 0; y < Y; y++)
+		{
+			int mpi_rank = my_distrib(x, y, size);
+			if (mpi_rank == my_rank)
+			{
+				//fprintf(stderr, "[%d] Owning data[%d][%d]\n", my_rank, x, y);
+				starpu_variable_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[x][y]), sizeof(float));
+			}
+			else if (my_rank == my_distrib(x+1, y, size) || my_rank == my_distrib(x-1, y, size)
+				 || my_rank == my_distrib(x, y+1, size) || my_rank == my_distrib(x, y-1, size))
+			{
+				/* I don't own that index, but will need it for my computations */
+				//fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", my_rank, x, y);
+				starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(float));
+			}
+			else
+			{
+				/* I know it's useless to allocate anything for this */
+				data_handles[x][y] = NULL;
+			}
+			if (data_handles[x][y])
+			{
+				starpu_mpi_data_register(data_handles[x][y], (y*X)+x, mpi_rank);
+			}
+		}
+	}
+
+	/* First computation with initial distribution; interior points only
+	 * (the border is never updated). */
+	for(loop=0 ; loop<niter; loop++)
+	{
+		for (x = 1; x < X-1; x++)
+		{
+			for (y = 1; y < Y-1; y++)
+			{
+				starpu_mpi_insert_task(MPI_COMM_WORLD, &stencil5_cl, STARPU_RW, data_handles[x][y],
+						       STARPU_R, data_handles[x-1][y], STARPU_R, data_handles[x+1][y],
+						       STARPU_R, data_handles[x][y-1], STARPU_R, data_handles[x][y+1],
+						       0);
+			}
+		}
+	}
+	fprintf(stderr, "Waiting ...\n");
+	starpu_task_wait_for_all();
+
+	/* Now migrate data to a new distribution */
+
+	/* First register newly needed data */
+	for(x = 0; x < X; x++)
+	{
+		for (y = 0; y < Y; y++)
+		{
+			int mpi_rank = my_distrib2(x, y, size);
+			if (!data_handles[x][y] && (mpi_rank == my_rank
+				 || my_rank == my_distrib2(x+1, y, size) || my_rank == my_distrib2(x-1, y, size)
+				 || my_rank == my_distrib2(x, y+1, size) || my_rank == my_distrib2(x, y-1, size)))
+			{
+				/* Register newly-needed data */
+				starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(float));
+				starpu_mpi_data_register(data_handles[x][y], (y*X)+x, mpi_rank);
+			}
+			if (data_handles[x][y] && mpi_rank != starpu_mpi_data_get_rank(data_handles[x][y]))
+			{
+				/* Migrate the data */
+				starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, data_handles[x][y], mpi_rank, NULL, NULL);
+				/* And register new rank of the matrix */
+				starpu_mpi_data_set_rank(data_handles[x][y], mpi_rank);
+			}
+		}
+	}
+
+	/* Second computation with new distribution */
+	for(loop=0 ; loop<niter; loop++)
+	{
+		for (x = 1; x < X-1; x++)
+		{
+			for (y = 1; y < Y-1; y++)
+			{
+				starpu_mpi_insert_task(MPI_COMM_WORLD, &stencil5_cl, STARPU_RW, data_handles[x][y],
+						       STARPU_R, data_handles[x-1][y], STARPU_R, data_handles[x+1][y],
+						       STARPU_R, data_handles[x][y-1], STARPU_R, data_handles[x][y+1],
+						       0);
+			}
+		}
+	}
+	fprintf(stderr, "Waiting ...\n");
+	starpu_task_wait_for_all();
+
+
+	/* Unregister data */
+	for(x = 0; x < X; x++)
+	{
+		for (y = 0; y < Y; y++)
+		{
+			if (data_handles[x][y])
+			{
+				int mpi_rank = my_distrib(x, y, size);
+				/* Get back data to original place where the user-provided buffer is. */
+				starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, data_handles[x][y], mpi_rank, NULL, NULL);
+				/* Register original rank of the matrix (although useless) */
+				starpu_mpi_data_set_rank(data_handles[x][y], mpi_rank);
+				/* And unregister it */
+				starpu_data_unregister(data_handles[x][y]);
+			}
+		}
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	/* NOTE: matrix[][] only holds up-to-date values for the entries this
+	 * rank registered and fetched back above; other entries still show
+	 * their initial random values. */
+	if (display)
+	{
+		fprintf(stdout, "[%d] mean=%2.2f\n", my_rank, mean);
+		for(x = 0; x < X; x++)
+		{
+			fprintf(stdout, "[%d] ", my_rank);
+			for (y = 0; y < Y; y++)
+			{
+				fprintf(stdout, "%2.2f ", matrix[x][y]);
+			}
+			fprintf(stdout, "\n");
+		}
+	}
+
+	return 0;
+}

+ 134 - 0
nmad/include/starpu_mpi.h

@@ -0,0 +1,134 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2012, 2014-2017  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
+ * Copyright (C) 2016  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_H__
+#define __STARPU_MPI_H__
+
+#include <starpu.h>
+
+#if defined(STARPU_USE_MPI)
+
+#include <mpi.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+typedef void *starpu_mpi_req;
+
+int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *req, int dest, int mpi_tag, MPI_Comm comm);
+int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *req, int source, int mpi_tag, MPI_Comm comm);
+int starpu_mpi_send(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm);
+int starpu_mpi_recv(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, MPI_Status *status);
+int starpu_mpi_isend_detached(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg);
+int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg);
+int starpu_mpi_issend(starpu_data_handle_t data_handle, starpu_mpi_req *req, int dest, int mpi_tag, MPI_Comm comm);
+int starpu_mpi_issend_detached(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg);
+int starpu_mpi_wait(starpu_mpi_req *req, MPI_Status *status);
+int starpu_mpi_test(starpu_mpi_req *req, int *flag, MPI_Status *status);
+int starpu_mpi_barrier(MPI_Comm comm);
+
+int starpu_mpi_irecv_detached_sequential_consistency(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg, int sequential_consistency);
+
+int starpu_mpi_init_comm(int *argc, char ***argv, int initialize_mpi, MPI_Comm comm);
+int starpu_mpi_init(int *argc, char ***argv, int initialize_mpi);
+int starpu_mpi_initialize(void) STARPU_DEPRECATED;
+int starpu_mpi_initialize_extended(int *rank, int *world_size) STARPU_DEPRECATED;
+int starpu_mpi_shutdown(void);
+
+struct starpu_task *starpu_mpi_task_build(MPI_Comm comm, struct starpu_codelet *codelet, ...);
+int starpu_mpi_task_post_build(MPI_Comm comm, struct starpu_codelet *codelet, ...);
+int starpu_mpi_task_insert(MPI_Comm comm, struct starpu_codelet *codelet, ...);
+/* the function starpu_mpi_insert_task has the same semantics as starpu_mpi_task_insert, it is kept to avoid breaking old codes */
+int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...);
+
+void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle, int node);
+void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg);
+void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle);
+
+int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg);
+int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg);
+
+int starpu_mpi_isend_detached_unlock_tag(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm, starpu_tag_t tag);
+int starpu_mpi_irecv_detached_unlock_tag(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, starpu_tag_t tag);
+
+int starpu_mpi_isend_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *dest, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag);
+int starpu_mpi_irecv_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *source, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag);
+
+void starpu_mpi_comm_amounts_retrieve(size_t *comm_amounts);
+
+void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle);
+void starpu_mpi_cache_flush_all_data(MPI_Comm comm);
+
+int starpu_mpi_cached_receive(starpu_data_handle_t data_handle);
+int starpu_mpi_cached_send(starpu_data_handle_t data_handle, int dest);
+
+int starpu_mpi_comm_size(MPI_Comm comm, int *size);
+int starpu_mpi_comm_rank(MPI_Comm comm, int *rank);
+int starpu_mpi_world_rank(void);
+int starpu_mpi_world_size(void);
+
+int starpu_mpi_get_communication_tag(void);
+void starpu_mpi_set_communication_tag(int tag);
+
+void starpu_mpi_data_register_comm(starpu_data_handle_t data_handle, int tag, int rank, MPI_Comm comm);
+#define starpu_mpi_data_register(data_handle, tag, rank) starpu_mpi_data_register_comm(data_handle, tag, rank, MPI_COMM_WORLD)
+
+void starpu_mpi_data_set_rank_comm(starpu_data_handle_t handle, int rank, MPI_Comm comm);
+#define starpu_mpi_data_set_rank(handle, rank) starpu_mpi_data_set_rank_comm(handle, rank, MPI_COMM_WORLD)
+void starpu_mpi_data_set_tag(starpu_data_handle_t handle, int tag);
+#define starpu_data_set_rank starpu_mpi_data_set_rank
+#define starpu_data_set_tag starpu_mpi_data_set_tag
+
+int starpu_mpi_data_get_rank(starpu_data_handle_t handle);
+int starpu_mpi_data_get_tag(starpu_data_handle_t handle);
+#define starpu_data_get_rank starpu_mpi_data_get_rank
+#define starpu_data_get_tag starpu_mpi_data_get_tag
+
+void starpu_mpi_data_migrate(MPI_Comm comm, starpu_data_handle_t handle, int new_rank);
+
+/* Node-selection policies for starpu_mpi_task_insert() execution-node
+ * choice. */
+#define STARPU_MPI_NODE_SELECTION_CURRENT_POLICY -1
+#define STARPU_MPI_NODE_SELECTION_MOST_R_DATA    0
+
+typedef int (*starpu_mpi_select_node_policy_func_t)(int me, int nb_nodes, struct starpu_data_descr *descr, int nb_data);
+int starpu_mpi_node_selection_register_policy(starpu_mpi_select_node_policy_func_t policy_func);
+int starpu_mpi_node_selection_unregister_policy(int policy);
+
+/* '(void)' gives these no-argument functions a real prototype; the
+ * previous '()' declared unspecified arguments (CERT DCL20-C). */
+int starpu_mpi_node_selection_get_current_policy(void);
+int starpu_mpi_node_selection_set_current_policy(int policy);
+
+int starpu_mpi_cache_is_enabled(void);
+int starpu_mpi_cache_set(int enabled);
+
+int starpu_mpi_wait_for_all(MPI_Comm comm);
+
+/* User-provided MPI datatype construction/destruction hooks for a
+ * registered data handle. */
+typedef void (*starpu_mpi_datatype_allocate_func_t)(starpu_data_handle_t, MPI_Datatype *);
+typedef void (*starpu_mpi_datatype_free_func_t)(MPI_Datatype *);
+int starpu_mpi_datatype_register(starpu_data_handle_t handle, starpu_mpi_datatype_allocate_func_t allocate_datatype_func, starpu_mpi_datatype_free_func_t free_datatype_func);
+int starpu_mpi_datatype_unregister(starpu_data_handle_t handle);
+
+int starpu_mpi_pre_submit_hook_register(void (*f)(struct starpu_task *));
+int starpu_mpi_pre_submit_hook_unregister(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // STARPU_USE_MPI
+#endif // __STARPU_MPI_H__

+ 29 - 0
nmad/libstarpumpi.pc.in

@@ -0,0 +1,29 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2011, 2016  Université de Bordeaux
+# Copyright (C) 2010, 2011, 2012  CNRS
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: starpumpi
+Description: offers MPI support for heterogeneous multicore architecture
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ -DSTARPU_USE_DEPRECATED_API
+Libs: -L${libdir} -lstarpumpi-@STARPU_EFFECTIVE_VERSION@
+Libs.private: @LDFLAGS@ @LIBS@ @STARPU_EXPORTED_LIBS@
+Requires: libstarpu
+Requires.private:

+ 58 - 0
nmad/src/Makefile.am

@@ -0,0 +1,58 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2012  Université de Bordeaux
+# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+CC=$(MPICC)
+CCLD=$(MPICC)
+
+BUILT_SOURCES =
+
+CLEANFILES = *.gcno *.gcda *.linkinfo
+
+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS)
+LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/src/ -I$(top_builddir)/src -I$(top_builddir)/include -I$(top_srcdir)/mpi/include -I$(top_srcdir)/mpi/src
+AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS)
+
+lib_LTLIBRARIES = libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+
+libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_LIBADD = $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la
+libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_LDFLAGS = $(ldflags) -no-undefined					\
+  -version-info $(LIBSTARPUMPI_INTERFACE_CURRENT):$(LIBSTARPUMPI_INTERFACE_REVISION):$(LIBSTARPUMPI_INTERFACE_AGE) \
+  $(MPICC_LDFLAGS) $(FXT_LDFLAGS)
+noinst_HEADERS =					\
+	starpu_mpi_private.h				\
+	starpu_mpi_fxt.h				\
+	starpu_mpi_stats.h				\
+	starpu_mpi_datatype.h				\
+	starpu_mpi_cache.h				\
+	starpu_mpi_cache_stats.h			\
+	starpu_mpi_collective.c				\
+	starpu_mpi_select_node.h
+
+libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_SOURCES =	\
+	starpu_mpi.c					\
+	starpu_mpi_helper.c				\
+	starpu_mpi_datatype.c				\
+	starpu_mpi_task_insert.c			\
+	starpu_mpi_collective.c				\
+	starpu_mpi_stats.c				\
+	starpu_mpi_private.c				\
+	starpu_mpi_cache.c				\
+	starpu_mpi_select_node.c			\
+	starpu_mpi_cache_stats.c
+
+showcheck:
+	-cat /dev/null

Файловите разлики са ограничени, защото са твърде много
+ 1253 - 0
nmad/src/starpu_mpi.c


+ 292 - 0
nmad/src/starpu_mpi_cache.c

@@ -0,0 +1,292 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012, 2013, 2014, 2015  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011-2014  Université de Bordeaux
+ * Copyright (C) 2014 INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <common/uthash.h>
+#include <datawizard/coherency.h>
+
+#include <starpu_mpi_cache.h>
+#include <starpu_mpi_cache_stats.h>
+#include <starpu_mpi_private.h>
+
+/* Hash-table entry tracking one data handle per peer; entries are keyed
+ * by the handle pointer itself (HASH_FIND_PTR on 'data'). */
+struct _starpu_data_entry
+{
+	UT_hash_handle hh;	/* uthash bookkeeping */
+	void *data;		/* the starpu_data_handle_t this entry tracks */
+};
+
+/* Per-destination-rank tables of handles already sent. */
+static struct _starpu_data_entry **_cache_sent_data = NULL;
+/* Per-owner-rank tables of handles already received. */
+static struct _starpu_data_entry **_cache_received_data = NULL;
+/* Whether we are allowed to keep copies of remote data. */
+int _starpu_cache_enabled=1;
+
+/* Returns 1 if the StarPU-MPI communication cache is active, 0 otherwise.
+ * '(void)' gives the definition a proper prototype; '()' declared
+ * unspecified arguments (CERT DCL20-C). */
+int starpu_mpi_cache_is_enabled(void)
+{
+	return _starpu_cache_enabled==1;
+}
+
+/* Runtime switch for the communication cache.  Enabling just raises the
+ * flag; disabling first flushes and frees everything currently cached.
+ * Always returns 0.
+ * NOTE(review): only the exact value 1 enables — any other non-zero
+ * value disables; confirm callers only ever pass 0 or 1. */
+int starpu_mpi_cache_set(int enabled)
+{
+	if (enabled == 1)
+	{
+		_starpu_cache_enabled = 1;
+	}
+	else
+	{
+		if (_starpu_cache_enabled)
+		{
+			// We need to clean the cache
+			int world_size;
+			starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
+			MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+			_starpu_mpi_cache_free(world_size);
+		}
+		_starpu_cache_enabled = 0;
+	}
+	return 0;
+}
+
+/* Allocates the per-rank cache tables for 'comm' and initializes the
+ * statistics module.  Honours the STARPU_MPI_CACHE environment variable
+ * (0 disables the cache; unset defaults to enabled). */
+void _starpu_mpi_cache_init(MPI_Comm comm)
+{
+	int nb_nodes;
+	int i;
+
+	_starpu_cache_enabled = starpu_get_env_number("STARPU_MPI_CACHE");
+	if (_starpu_cache_enabled == -1)
+	{
+		_starpu_cache_enabled = 1;
+	}
+
+	if (_starpu_cache_enabled == 0)
+	{
+		if (!_starpu_silent) fprintf(stderr,"Warning: StarPU MPI Communication cache is disabled\n");
+		return;
+	}
+
+	MPI_Comm_size(comm, &nb_nodes);
+	_STARPU_MPI_DEBUG(2, "Initialising htable for cache\n");
+	/* One hash table per peer rank, in each direction.  Fail fast on an
+	 * allocation failure instead of dereferencing NULL later. */
+	_cache_sent_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
+	STARPU_ASSERT(_cache_sent_data);
+	for(i=0 ; i<nb_nodes ; i++) _cache_sent_data[i] = NULL;
+	_cache_received_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
+	STARPU_ASSERT(_cache_received_data);
+	for(i=0 ; i<nb_nodes ; i++) _cache_received_data[i] = NULL;
+	_starpu_mpi_cache_stats_init(comm);
+}
+
+/* Removes and frees every entry of both directions' hash tables,
+ * updating the reception statistics.  The table arrays themselves are
+ * kept (they are released by _starpu_mpi_cache_free). */
+static
+void _starpu_mpi_cache_empty_tables(int world_size)
+{
+	int i;
+
+	if (_starpu_cache_enabled == 0) return;
+
+	_STARPU_MPI_DEBUG(2, "Clearing htable for cache\n");
+
+	for(i=0 ; i<world_size ; i++)
+	{
+		struct _starpu_data_entry *entry, *tmp;
+		HASH_ITER(hh, _cache_sent_data[i], entry, tmp)
+		{
+			HASH_DEL(_cache_sent_data[i], entry);
+			free(entry);
+		}
+		HASH_ITER(hh, _cache_received_data[i], entry, tmp)
+		{
+			HASH_DEL(_cache_received_data[i], entry);
+			_starpu_mpi_cache_stats_dec(i, (starpu_data_handle_t) entry->data);
+			free(entry);
+		}
+	}
+}
+
+/* Releases every cache entry and the per-rank table arrays themselves.
+ * The pointers are reset to NULL so a later re-initialization or a
+ * stray access fails cleanly instead of using freed memory. */
+void _starpu_mpi_cache_free(int world_size)
+{
+	if (_starpu_cache_enabled == 0) return;
+
+	_starpu_mpi_cache_empty_tables(world_size);
+	free(_cache_sent_data);
+	_cache_sent_data = NULL;
+	free(_cache_received_data);
+	_cache_received_data = NULL;
+	_starpu_mpi_cache_stats_free();
+}
+
+/* Forgets that 'data' was ever sent, for every destination rank of its
+ * communicator.
+ * NOTE(review): unlike the flush functions this does not test
+ * _starpu_cache_enabled — presumably callers only invoke it while the
+ * cache is on; confirm, since the tables are NULL/freed otherwise. */
+void _starpu_mpi_cache_sent_data_clear(starpu_data_handle_t data)
+{
+	int n, size;
+	MPI_Comm comm = ((struct _starpu_mpi_data *) data->mpi_data)->comm;
+
+	MPI_Comm_size(comm, &size);
+
+	for(n=0 ; n<size ; n++)
+	{
+		struct _starpu_data_entry *already_sent;
+		HASH_FIND_PTR(_cache_sent_data[n], &data, already_sent);
+		if (already_sent)
+		{
+			_STARPU_MPI_DEBUG(2, "Clearing send cache for data %p\n", data);
+			HASH_DEL(_cache_sent_data[n], already_sent);
+			free(already_sent);
+		}
+	}
+}
+
+/* Drops our cached copy of 'data' (received from its owner rank) and
+ * invalidates the local replicate, so the next access fetches a fresh
+ * version.  No-op when the handle is not in the receive cache. */
+void _starpu_mpi_cache_received_data_clear(starpu_data_handle_t data)
+{
+	int mpi_rank = starpu_mpi_data_get_rank(data);
+	struct _starpu_data_entry *already_received;
+
+	HASH_FIND_PTR(_cache_received_data[mpi_rank], &data, already_received);
+	if (already_received)
+	{
+#ifdef STARPU_DEVEL
+#  warning TODO: Somebody else will write to the data, so discard our cached copy if any. starpu_mpi could just remember itself.
+#endif
+		_STARPU_MPI_DEBUG(2, "Clearing receive cache for data %p\n", data);
+		HASH_DEL(_cache_received_data[mpi_rank], already_received);
+		_starpu_mpi_cache_stats_dec(mpi_rank, data);
+		free(already_received);
+		starpu_data_invalidate_submit(data);
+	}
+}
+
+/* Flush every entry of both caches; local replicates of data owned by
+ * another node are invalidated on the way out. */
+void starpu_mpi_cache_flush_all_data(MPI_Comm comm)
+{
+	int nb_nodes, node;
+	int owner, my_rank;
+
+	if (_starpu_cache_enabled == 0) return;
+
+	MPI_Comm_size(comm, &nb_nodes);
+	MPI_Comm_rank(comm, &my_rank);
+
+	for(node=0 ; node<nb_nodes ; node++)
+	{
+		struct _starpu_data_entry *e, *next;
+		HASH_ITER(hh, _cache_sent_data[node], e, next)
+		{
+			owner = starpu_mpi_data_get_rank((starpu_data_handle_t) e->data);
+			if (owner != my_rank && owner != -1)
+				starpu_data_invalidate_submit((starpu_data_handle_t) e->data);
+			HASH_DEL(_cache_sent_data[node], e);
+			free(e);
+		}
+		HASH_ITER(hh, _cache_received_data[node], e, next)
+		{
+			owner = starpu_mpi_data_get_rank((starpu_data_handle_t) e->data);
+			if (owner != my_rank && owner != -1)
+				starpu_data_invalidate_submit((starpu_data_handle_t) e->data);
+			HASH_DEL(_cache_received_data[node], e);
+			_starpu_mpi_cache_stats_dec(node, (starpu_data_handle_t) e->data);
+			free(e);
+		}
+	}
+}
+
+/* Remove any send/receive cache entry referring to data_handle, on every
+ * node slot.  Does not invalidate the data itself (the public
+ * starpu_mpi_cache_flush takes care of that). */
+void _starpu_mpi_cache_flush(starpu_data_handle_t data_handle)
+{
+	struct _starpu_data_entry *avail;
+	int i, nb_nodes;
+	MPI_Comm comm = ((struct _starpu_mpi_data *) data_handle->mpi_data)->comm;
+
+	if (_starpu_cache_enabled == 0) return;
+
+	/* The previous version also queried MPI_Comm_rank and the data owner
+	 * here, but never used either value; both have been removed. */
+	MPI_Comm_size(comm, &nb_nodes);
+
+	for(i=0 ; i<nb_nodes ; i++)
+	{
+		HASH_FIND_PTR(_cache_sent_data[i], &data_handle, avail);
+		if (avail)
+		{
+			_STARPU_MPI_DEBUG(2, "Clearing send cache for data %p\n", data_handle);
+			HASH_DEL(_cache_sent_data[i], avail);
+			free(avail);
+		}
+		HASH_FIND_PTR(_cache_received_data[i], &data_handle, avail);
+		if (avail)
+		{
+			/* Fixed: this message previously said "send cache" */
+			_STARPU_MPI_DEBUG(2, "Clearing receive cache for data %p\n", data_handle);
+			HASH_DEL(_cache_received_data[i], avail);
+			_starpu_mpi_cache_stats_dec(i, data_handle);
+			free(avail);
+		}
+	}
+}
+
+/* Public entry point: flush the cache for data_handle and, when this node is
+ * not the owner, drop the now-useless local replicate. */
+void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle)
+{
+	int my_rank, mpi_rank;
+
+	/* Consistent with every other public cache function: when the cache
+	 * is disabled nothing was ever cached, so there is nothing to flush
+	 * nor to invalidate. */
+	if (_starpu_cache_enabled == 0) return;
+
+	_starpu_mpi_cache_flush(data_handle);
+
+	MPI_Comm_rank(comm, &my_rank);
+	mpi_rank = starpu_mpi_data_get_rank(data_handle);
+	if (mpi_rank != my_rank && mpi_rank != -1)
+		starpu_data_invalidate_submit(data_handle);
+}
+
+/* Record that data (owned by another node) is now available locally.
+ * Returns the existing cache entry when a copy was already recorded,
+ * or NULL when a fresh entry was just added (i.e. a receive is needed). */
+void *_starpu_mpi_cache_received_data_set(starpu_data_handle_t data)
+{
+	struct _starpu_data_entry *already_received;
+	int mpi_rank;
+
+	if (_starpu_cache_enabled == 0) return NULL;
+
+	mpi_rank = starpu_mpi_data_get_rank(data);
+	HASH_FIND_PTR(_cache_received_data[mpi_rank], &data, already_received);
+	if (already_received != NULL)
+	{
+		_STARPU_MPI_DEBUG(2, "Do not receive data %p from node %d as it is already available\n", data, mpi_rank);
+		return already_received;
+	}
+
+	{
+		struct _starpu_data_entry *new_entry = malloc(sizeof(*new_entry));
+		new_entry->data = data;
+		HASH_ADD_PTR(_cache_received_data[mpi_rank], data, new_entry);
+		_starpu_mpi_cache_stats_inc(mpi_rank, data);
+	}
+	return NULL;
+}
+
+/* Return the receive-cache entry for data if we already hold a copy,
+ * NULL otherwise (or when the cache is disabled). */
+void *_starpu_mpi_cache_received_data_get(starpu_data_handle_t data)
+{
+	struct _starpu_data_entry *entry;
+	int owner;
+
+	if (_starpu_cache_enabled == 0) return NULL;
+
+	owner = starpu_mpi_data_get_rank(data);
+	HASH_FIND_PTR(_cache_received_data[owner], &data, entry);
+	return entry;
+}
+
+/* Record that data has been sent to node dest.  Returns the existing cache
+ * entry when the send was already recorded (no need to send again), or NULL
+ * when a fresh entry was just added. */
+void *_starpu_mpi_cache_sent_data_set(starpu_data_handle_t data, int dest)
+{
+	struct _starpu_data_entry *already_sent;
+
+	if (_starpu_cache_enabled == 0) return NULL;
+
+	HASH_FIND_PTR(_cache_sent_data[dest], &data, already_sent);
+	if (already_sent != NULL)
+	{
+		_STARPU_MPI_DEBUG(2, "Do not send data %p to node %d as it has already been sent\n", data, dest);
+		return already_sent;
+	}
+
+	{
+		struct _starpu_data_entry *new_entry = malloc(sizeof(*new_entry));
+		new_entry->data = data;
+		HASH_ADD_PTR(_cache_sent_data[dest], data, new_entry);
+		_STARPU_MPI_DEBUG(2, "Noting that data %p has already been sent to %d\n", data, dest);
+	}
+	return NULL;
+}
+

+ 55 - 0
nmad/src/starpu_mpi_cache.h

@@ -0,0 +1,55 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012, 2013, 2014, 2015  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011-2014  Université de Bordeaux
+ * Copyright (C) 2014 INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_CACHE_H__
+#define __STARPU_MPI_CACHE_H__
+
+#include <starpu.h>
+#include <stdlib.h>
+#include <mpi.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Non-zero when the MPI communication cache is active. */
+extern int _starpu_cache_enabled;
+/* Allocate the per-node send/receive cache tables for the given communicator. */
+void _starpu_mpi_cache_init(MPI_Comm comm);
+/* Empty and release the cache tables allocated by _starpu_mpi_cache_init. */
+void _starpu_mpi_cache_free(int world_size);
+
+/*
+ * If the data is already available in the cache, return a pointer to the data
+ * If the data is NOT available in the cache, add it to the cache and return NULL
+ */
+void *_starpu_mpi_cache_received_data_set(starpu_data_handle_t data);
+void *_starpu_mpi_cache_received_data_get(starpu_data_handle_t data);
+void _starpu_mpi_cache_received_data_clear(starpu_data_handle_t data);
+
+/*
+ * If the data is already available in the cache, return a pointer to the data
+ * If the data is NOT available in the cache, add it to the cache and return NULL
+ */
+void *_starpu_mpi_cache_sent_data_set(starpu_data_handle_t data, int dest);
+void _starpu_mpi_cache_sent_data_clear(starpu_data_handle_t data);
+
+/* Remove all cache entries referring to data_handle (internal variant of
+ * the public starpu_mpi_cache_flush, without invalidation). */
+void _starpu_mpi_cache_flush(starpu_data_handle_t data_handle);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __STARPU_MPI_CACHE_H__

+ 69 - 0
nmad/src/starpu_mpi_cache_stats.c

@@ -0,0 +1,69 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi_cache_stats.h>
+#include <common/config.h>
+#include <stdio.h>
+#include <starpu_mpi_private.h>
+
+/* measure the amount of data transfers between each pair of MPI nodes */
+static size_t *comm_cache_amount;
+static int world_size;
+static int stats_enabled=0;
+
+/* Enable cache statistics when the STARPU_MPI_CACHE_STATS environment
+ * variable is set, and allocate one byte counter per node of comm. */
+void _starpu_mpi_cache_stats_init(MPI_Comm comm)
+{
+	stats_enabled = starpu_get_env_number("STARPU_MPI_CACHE_STATS");
+	if (stats_enabled == -1)
+	{
+		stats_enabled = 0;
+	}
+	if (stats_enabled == 0) return;
+
+	if (!_starpu_silent) fprintf(stderr,"Warning: StarPU is executed with STARPU_MPI_CACHE_STATS=1, which slows down a bit\n");
+
+	MPI_Comm_size(comm, &world_size);
+	_STARPU_MPI_DEBUG(1, "allocating for %d nodes\n", world_size);
+
+	comm_cache_amount = (size_t *) calloc(world_size, sizeof(size_t));
+	/* Fail loudly rather than crash later in _starpu_mpi_cache_stats_update */
+	STARPU_ASSERT_MSG(comm_cache_amount != NULL, "Cannot allocate cache statistics array");
+}
+
+/* Release the statistics counters; the pointer is reset to NULL so a stray
+ * later call cannot double-free it. */
+void _starpu_mpi_cache_stats_free(void)
+{
+	if (stats_enabled == 0) return;
+	free(comm_cache_amount);
+	comm_cache_amount = NULL;
+}
+
+/* Adjust the cached-byte accounting for node dst by the size of data_handle.
+ * count is +1 when an entry is added and -1 when it is removed (see the
+ * _starpu_mpi_cache_stats_inc/_dec wrapper macros). No-op unless
+ * STARPU_MPI_CACHE_STATS was set at init time. */
+void _starpu_mpi_cache_stats_update(unsigned dst, starpu_data_handle_t data_handle, int count)
+{
+	size_t size;
+
+	if (stats_enabled == 0) return;
+
+	size = starpu_data_get_size(data_handle);
+
+	if (count == 1)
+	{
+		_STARPU_MPI_MSG("[communication cache] + %10ld to   %d\n", (long)size, dst);
+	}
+	else // count == -1
+	{
+		_STARPU_MPI_MSG("[communication cache] - %10ld from %d\n", (long)size, dst);
+	}
+
+	comm_cache_amount[dst] += count * size;
+}
+

+ 40 - 0
nmad/src/starpu_mpi_cache_stats.h

@@ -0,0 +1,40 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_CACHE_STATS_H__
+#define __STARPU_MPI_CACHE_STATS_H__
+
+#include <starpu.h>
+#include <stdlib.h>
+#include <mpi.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Enable and allocate cache statistics (driven by STARPU_MPI_CACHE_STATS). */
+void _starpu_mpi_cache_stats_init(MPI_Comm comm);
+/* Release the statistics counters.  Declared (void): an empty parameter
+ * list in C means "unspecified arguments", not "no arguments". */
+void _starpu_mpi_cache_stats_free(void);
+
+/* Add (count=+1) or remove (count=-1) the size of data_handle from the
+ * per-node accounting for node dst. */
+void _starpu_mpi_cache_stats_update(unsigned dst, starpu_data_handle_t data_handle, int count);
+
+#define _starpu_mpi_cache_stats_inc(dst, data_handle) _starpu_mpi_cache_stats_update(dst, data_handle, +1)
+#define _starpu_mpi_cache_stats_dec(dst, data_handle) _starpu_mpi_cache_stats_update(dst, data_handle, -1)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __STARPU_MPI_CACHE_STATS_H__

+ 162 - 0
nmad/src/starpu_mpi_collective.c

@@ -0,0 +1,162 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012, 2013, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <mpi.h>
+#include <starpu.h>
+#include <starpu_mpi.h>
+#include <starpu_mpi_private.h>
+
+/* Shared completion state for a collective built out of detached
+ * point-to-point transfers: the user callback fires once nb reaches count. */
+struct _callback_arg
+{
+	void (*callback)(void *);	/* user callback fired on completion */
+	void *arg;			/* user argument passed to callback */
+	int nb;				/* transfers completed so far */
+	int count;			/* total transfers expected on this node */
+};
+
+/* Per-transfer completion callback: counts finished transfers and, when the
+ * last expected one completes, invokes the user callback and releases the
+ * shared state.
+ * NOTE(review): nb is incremented without synchronization — this assumes all
+ * detached-request callbacks run from a single progression thread; confirm. */
+void _callback_collective(void *arg)
+{
+	struct _callback_arg *callback_arg = arg;
+	callback_arg->nb ++;
+	if (callback_arg->nb == callback_arg->count)
+	{
+		callback_arg->callback(callback_arg->arg);
+		free(callback_arg);
+	}
+}
+
+/* Scatter the data_handles from the root node to their respective owners.
+ * On the root, scallback(sarg) is called once every send has completed; on
+ * the other nodes, rcallback(rarg) is called once every receive has
+ * completed.  Always returns 0. */
+int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg)
+{
+	int rank;
+	int x;
+	struct _callback_arg *callback_arg = NULL;
+	void (*callback_func)(void *) = NULL;
+	void (*callback)(void *);
+
+	MPI_Comm_rank(comm, &rank);
+
+	callback = (rank == root) ? scallback : rcallback;
+	if (callback)
+	{
+		callback_func = _callback_collective;
+		callback_arg = malloc(sizeof(struct _callback_arg));
+		STARPU_ASSERT_MSG(callback_arg != NULL, "Cannot allocate memory");
+		callback_arg->count = 0;
+		callback_arg->nb = 0;
+		callback_arg->callback = callback;
+		callback_arg->arg = (rank == root) ? sarg : rarg;
+
+		/* Count the transfers this node takes part in, so that
+		 * _callback_collective knows when the last one completes */
+		for(x = 0; x < count ; x++)
+		{
+			if (data_handles[x])
+			{
+				int owner = starpu_mpi_data_get_rank(data_handles[x]);
+				int mpi_tag = starpu_mpi_data_get_tag(data_handles[x]);
+				STARPU_ASSERT_MSG(mpi_tag >= 0, "Invalid tag for data handle");
+				if ((rank == root) && (owner != root))
+				{
+					callback_arg->count ++;
+				}
+				if ((rank != root) && (owner == rank))
+				{
+					callback_arg->count ++;
+				}
+			}
+		}
+
+		if (callback_arg->count == 0)
+		{
+			/* This node takes part in no transfer: the completion
+			 * callback would never run, so fire the user callback
+			 * now and avoid leaking callback_arg */
+			callback_arg->callback(callback_arg->arg);
+			free(callback_arg);
+			callback_arg = NULL;
+			callback_func = NULL;
+		}
+	}
+
+	for(x = 0; x < count ; x++)
+	{
+		if (data_handles[x])
+		{
+			int owner = starpu_mpi_data_get_rank(data_handles[x]);
+			int mpi_tag = starpu_mpi_data_get_tag(data_handles[x]);
+			STARPU_ASSERT_MSG(mpi_tag >= 0, "Invalid tag for data handle");
+			if ((rank == root) && (owner != root))
+			{
+				starpu_mpi_isend_detached(data_handles[x], owner, mpi_tag, comm, callback_func, callback_arg);
+			}
+			if ((rank != root) && (owner == rank))
+			{
+				starpu_mpi_irecv_detached(data_handles[x], root, mpi_tag, comm, callback_func, callback_arg);
+			}
+		}
+	}
+	return 0;
+}
+
+/* Gather the data_handles from their respective owners onto the root node.
+ * On the root, scallback(sarg) is called once every receive has completed;
+ * on the other nodes, rcallback(rarg) is called once every send has
+ * completed.  Always returns 0. */
+int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg)
+{
+	int rank;
+	int x;
+	struct _callback_arg *callback_arg = NULL;
+	void (*callback_func)(void *) = NULL;
+	void (*callback)(void *);
+
+	MPI_Comm_rank(comm, &rank);
+
+	callback = (rank == root) ? scallback : rcallback;
+	if (callback)
+	{
+		callback_func = _callback_collective;
+
+		callback_arg = malloc(sizeof(struct _callback_arg));
+		STARPU_ASSERT_MSG(callback_arg != NULL, "Cannot allocate memory");
+		callback_arg->count = 0;
+		callback_arg->nb = 0;
+		callback_arg->callback = callback;
+		callback_arg->arg = (rank == root) ? sarg : rarg;
+
+		/* Count the transfers this node takes part in, so that
+		 * _callback_collective knows when the last one completes */
+		for(x = 0; x < count ; x++)
+		{
+			if (data_handles[x])
+			{
+				int owner = starpu_mpi_data_get_rank(data_handles[x]);
+				int mpi_tag = starpu_mpi_data_get_tag(data_handles[x]);
+				STARPU_ASSERT_MSG(mpi_tag >= 0, "Invalid tag for data handle");
+				if ((rank == root) && (owner != root))
+				{
+					callback_arg->count ++;
+				}
+				if ((rank != root) && (owner == rank))
+				{
+					callback_arg->count ++;
+				}
+			}
+		}
+
+		if (callback_arg->count == 0)
+		{
+			/* This node takes part in no transfer: the completion
+			 * callback would never run, so fire the user callback
+			 * now and avoid leaking callback_arg */
+			callback_arg->callback(callback_arg->arg);
+			free(callback_arg);
+			callback_arg = NULL;
+			callback_func = NULL;
+		}
+	}
+
+	for(x = 0; x < count ; x++)
+	{
+		if (data_handles[x])
+		{
+			int owner = starpu_mpi_data_get_rank(data_handles[x]);
+			int mpi_tag = starpu_mpi_data_get_tag(data_handles[x]);
+			STARPU_ASSERT_MSG(mpi_tag >= 0, "Invalid tag for data handle");
+			if ((rank == root) && (owner != root))
+			{
+				starpu_mpi_irecv_detached(data_handles[x], owner, mpi_tag, comm, callback_func, callback_arg);
+			}
+			if ((rank != root) && (owner == rank))
+			{
+				starpu_mpi_isend_detached(data_handles[x], root, mpi_tag, comm, callback_func, callback_arg);
+			}
+		}
+	}
+	return 0;
+}

+ 245 - 0
nmad/src/starpu_mpi_datatype.c

@@ -0,0 +1,245 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2011  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi_datatype.h>
+
+/* Builder / destructor signatures for the MPI datatype matching a given
+ * StarPU data interface */
+typedef void (*handle_to_datatype_func)(starpu_data_handle_t, MPI_Datatype *);
+typedef void (*handle_free_datatype_func)(MPI_Datatype *);
+
+/*
+ * 	Matrix
+ */
+
+/* Build an MPI datatype describing a (possibly strided) matrix: ny rows of
+ * nx*elemsize bytes each, separated by a stride of ld*elemsize bytes. */
+static void handle_to_datatype_matrix(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+{
+	int ret;
+
+	unsigned nx = starpu_matrix_get_nx(data_handle);
+	unsigned ny = starpu_matrix_get_ny(data_handle);
+	unsigned ld = starpu_matrix_get_local_ld(data_handle);
+	size_t elemsize = starpu_matrix_get_elemsize(data_handle);
+
+	ret = MPI_Type_vector(ny, nx*elemsize, ld*elemsize, MPI_BYTE, datatype);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_vector failed");
+
+	ret = MPI_Type_commit(datatype);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_commit failed");
+}
+
+/*
+ * 	Block
+ */
+
+/* Build an MPI datatype describing a 3D block: nz 2D layers (each ny rows of
+ * nx*elemsize bytes with row stride ldy*elemsize), with a layer stride of
+ * ldz*elemsize bytes. */
+static void handle_to_datatype_block(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+{
+	int ret;
+
+	unsigned nx = starpu_block_get_nx(data_handle);
+	unsigned ny = starpu_block_get_ny(data_handle);
+	unsigned nz = starpu_block_get_nz(data_handle);
+	unsigned ldy = starpu_block_get_local_ldy(data_handle);
+	unsigned ldz = starpu_block_get_local_ldz(data_handle);
+	size_t elemsize = starpu_block_get_elemsize(data_handle);
+
+	MPI_Datatype datatype_2dlayer;
+	ret = MPI_Type_vector(ny, nx*elemsize, ldy*elemsize, MPI_BYTE, &datatype_2dlayer);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_vector failed");
+
+	ret = MPI_Type_commit(&datatype_2dlayer);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_commit failed");
+
+	ret = MPI_Type_hvector(nz, 1, ldz*elemsize, datatype_2dlayer, datatype);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_hvector failed");
+
+	ret = MPI_Type_commit(datatype);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_commit failed");
+
+	/* The intermediate layer type is no longer needed once the outer type
+	 * has been built; per the MPI standard, freeing it does not affect
+	 * datatypes derived from it.  Not freeing it leaked one MPI datatype
+	 * per call. */
+	ret = MPI_Type_free(&datatype_2dlayer);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_free failed");
+}
+
+/*
+ * 	Vector
+ */
+
+/* Build an MPI datatype for a vector: nx contiguous elements of elemsize
+ * bytes, expressed as raw bytes. */
+static void handle_to_datatype_vector(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+{
+	size_t elemsize = starpu_vector_get_elemsize(data_handle);
+	unsigned nx = starpu_vector_get_nx(data_handle);
+	int ret;
+
+	ret = MPI_Type_contiguous(nx*elemsize, MPI_BYTE, datatype);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_contiguous failed");
+	ret = MPI_Type_commit(datatype);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_commit failed");
+}
+
+/*
+ * 	Variable
+ */
+
+/* Build an MPI datatype for a single variable: elemsize raw bytes. */
+static void handle_to_datatype_variable(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+{
+	size_t elemsize = starpu_variable_get_elemsize(data_handle);
+	int ret;
+
+	ret = MPI_Type_contiguous(elemsize, MPI_BYTE, datatype);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_contiguous failed");
+	ret = MPI_Type_commit(datatype);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_commit failed");
+}
+
+/*
+ * 	Void
+ */
+
+/* Build an (empty) MPI datatype for the void interface: zero bytes are
+ * actually transferred, but a committed datatype is still needed so the
+ * generic send/receive path can treat all interfaces uniformly. */
+static void handle_to_datatype_void(starpu_data_handle_t data_handle STARPU_ATTRIBUTE_UNUSED, MPI_Datatype *datatype)
+{
+	int ret;
+
+	ret = MPI_Type_contiguous(0, MPI_BYTE, datatype);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_contiguous failed");
+
+	ret = MPI_Type_commit(datatype);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_commit failed");
+}
+
+/*
+ *	Generic
+ */
+
+/* Datatype builder per StarPU interface id.  NULL entries mean no
+ * StarPU-provided MPI datatype exists for that interface; hitting one
+ * triggers the assertion in _starpu_mpi_handle_allocate_datatype. */
+static handle_to_datatype_func handle_to_datatype_funcs[STARPU_MAX_INTERFACE_ID] =
+{
+	[STARPU_MATRIX_INTERFACE_ID]	= handle_to_datatype_matrix,
+	[STARPU_BLOCK_INTERFACE_ID]	= handle_to_datatype_block,
+	[STARPU_VECTOR_INTERFACE_ID]	= handle_to_datatype_vector,
+	[STARPU_CSR_INTERFACE_ID]	= NULL,
+	[STARPU_BCSR_INTERFACE_ID]	= NULL,
+	[STARPU_VARIABLE_INTERFACE_ID]	= handle_to_datatype_variable,
+	[STARPU_VOID_INTERFACE_ID]	= handle_to_datatype_void,
+	[STARPU_MULTIFORMAT_INTERFACE_ID] = NULL,
+};
+
+/* Build the MPI datatype matching data_handle's interface.  For interfaces
+ * StarPU knows about, a committed derived datatype is returned and
+ * *user_datatype is set to 0; for application-defined interfaces, MPI_BYTE
+ * is returned and *user_datatype is set to 1. */
+void _starpu_mpi_handle_allocate_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype, int *user_datatype)
+{
+	enum starpu_data_interface_id id = starpu_data_get_interface_id(data_handle);
+
+	if (id >= STARPU_MAX_INTERFACE_ID)
+	{
+		/* The datatype is not predefined by StarPU */
+		*datatype = MPI_BYTE;
+		*user_datatype = 1;
+		return;
+	}
+
+	handle_to_datatype_func func = handle_to_datatype_funcs[id];
+	STARPU_ASSERT_MSG(func, "Handle To Datatype Function not defined for StarPU data interface %d", id);
+	func(data_handle, datatype);
+	*user_datatype = 0;
+}
+
+/* Free a non-nested datatype (vector, matrix, variable, void). */
+static void _starpu_mpi_handle_free_simple_datatype(MPI_Datatype *datatype)
+{
+	MPI_Type_free(datatype);
+}
+
+/* Recursively free a derived MPI datatype together with the component
+ * datatypes it was built from.  MPI_Type_get_contents returns fresh handles
+ * on the component types, which must themselves be freed; predefined
+ * (MPI_COMBINER_NAMED) types are left alone, which also terminates the
+ * recursion.
+ * NOTE(review): the three malloc results are unchecked — on OOM this would
+ * crash in MPI_Type_get_contents; confirm whether an assertion is wanted. */
+static void _starpu_mpi_handle_free_complex_datatype(MPI_Datatype *datatype)
+{
+	int num_ints, num_adds, num_datatypes, combiner, i;
+	int *array_of_ints;
+	MPI_Aint *array_of_adds;
+	MPI_Datatype *array_of_datatypes;
+
+	MPI_Type_get_envelope(*datatype, &num_ints, &num_adds, &num_datatypes, &combiner);
+	if (combiner != MPI_COMBINER_NAMED)
+	{
+		array_of_ints = (int *) malloc(num_ints * sizeof(int));
+		array_of_adds = (MPI_Aint *) malloc(num_adds * sizeof(MPI_Aint));
+		array_of_datatypes = (MPI_Datatype *) malloc(num_datatypes * sizeof(MPI_Datatype));
+		MPI_Type_get_contents(*datatype, num_ints, num_adds, num_datatypes, array_of_ints, array_of_adds, array_of_datatypes);
+		for(i=0 ; i<num_datatypes ; i++)
+		{
+			_starpu_mpi_handle_free_complex_datatype(&array_of_datatypes[i]);
+		}
+		MPI_Type_free(datatype);
+		free(array_of_ints);
+		free(array_of_adds);
+		free(array_of_datatypes);
+	}
+}
+
+/* Datatype destructor per StarPU interface id; must stay in sync with
+ * handle_to_datatype_funcs above (block uses the recursive variant since
+ * its datatype nests a 2D-layer type). */
+static handle_free_datatype_func handle_free_datatype_funcs[STARPU_MAX_INTERFACE_ID] =
+{
+	[STARPU_MATRIX_INTERFACE_ID]	= _starpu_mpi_handle_free_simple_datatype,
+	[STARPU_BLOCK_INTERFACE_ID]	= _starpu_mpi_handle_free_complex_datatype,
+	[STARPU_VECTOR_INTERFACE_ID]	= _starpu_mpi_handle_free_simple_datatype,
+	[STARPU_CSR_INTERFACE_ID]	= NULL,
+	[STARPU_BCSR_INTERFACE_ID]	= NULL,
+	[STARPU_VARIABLE_INTERFACE_ID]	= _starpu_mpi_handle_free_simple_datatype,
+	[STARPU_VOID_INTERFACE_ID]      = _starpu_mpi_handle_free_simple_datatype,
+	[STARPU_MULTIFORMAT_INTERFACE_ID] = NULL,
+};
+
+/* Release a datatype previously built by _starpu_mpi_handle_allocate_datatype.
+ * Application-defined interfaces (id >= STARPU_MAX_INTERFACE_ID) were given
+ * plain MPI_BYTE and need no cleanup. */
+void _starpu_mpi_handle_free_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+{
+	enum starpu_data_interface_id id = starpu_data_get_interface_id(data_handle);
+
+	if (id >= STARPU_MAX_INTERFACE_ID)
+		return;
+
+	handle_free_datatype_func func = handle_free_datatype_funcs[id];
+	STARPU_ASSERT_MSG(func, "Handle free datatype function not defined for StarPU data interface %d", id);
+	func(datatype);
+}
+
+/* Return a printable name for a predefined MPI datatype (debug/trace
+ * helper); any datatype not in the list is reported as user-defined. */
+char *_starpu_mpi_datatype(MPI_Datatype datatype)
+{
+     if (datatype == MPI_DATATYPE_NULL) return "MPI_DATATYPE_NULL";
+     if (datatype == MPI_CHAR) return "MPI_CHAR";
+     if (datatype == MPI_UNSIGNED_CHAR) return "MPI_UNSIGNED_CHAR";
+     if (datatype == MPI_BYTE) return "MPI_BYTE";
+     if (datatype == MPI_SHORT) return "MPI_SHORT";
+     if (datatype == MPI_UNSIGNED_SHORT) return "MPI_UNSIGNED_SHORT";
+     if (datatype == MPI_INT) return "MPI_INT";
+     if (datatype == MPI_UNSIGNED) return "MPI_UNSIGNED";
+     if (datatype == MPI_LONG) return "MPI_LONG";
+     if (datatype == MPI_UNSIGNED_LONG) return "MPI_UNSIGNED_LONG";
+     if (datatype == MPI_FLOAT) return "MPI_FLOAT";
+     if (datatype == MPI_DOUBLE) return "MPI_DOUBLE";
+     if (datatype == MPI_LONG_DOUBLE) return "MPI_LONG_DOUBLE";
+     if (datatype == MPI_LONG_LONG) return "MPI_LONG_LONG";
+     if (datatype == MPI_LONG_INT) return "MPI_LONG_INT";
+     if (datatype == MPI_SHORT_INT) return "MPI_SHORT_INT";
+     if (datatype == MPI_FLOAT_INT) return "MPI_FLOAT_INT";
+     if (datatype == MPI_DOUBLE_INT) return "MPI_DOUBLE_INT";
+     if (datatype == MPI_2INT) return "MPI_2INT";
+     if (datatype == MPI_2DOUBLE_PRECISION) return "MPI_2DOUBLE_PRECISION";
+     if (datatype == MPI_COMPLEX) return "MPI_COMPLEX";
+     if (datatype == MPI_DOUBLE_COMPLEX) return "MPI_DOUBLE_COMPLEX";
+     if (datatype == MPI_LOGICAL) return "MPI_LOGICAL";
+     if (datatype == MPI_REAL) return "MPI_REAL";
+     if (datatype == MPI_REAL4) return "MPI_REAL4";
+     if (datatype == MPI_REAL8) return "MPI_REAL8";
+     if (datatype == MPI_DOUBLE_PRECISION) return "MPI_DOUBLE_PRECISION";
+     if (datatype == MPI_INTEGER) return "MPI_INTEGER";
+     if (datatype == MPI_INTEGER4) return "MPI_INTEGER4";
+     if (datatype == MPI_PACKED) return "MPI_PACKED";
+     return "User defined MPI Datatype";
+}

+ 35 - 0
nmad/src/starpu_mpi_datatype.h

@@ -0,0 +1,35 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2011  Université de Bordeaux
+ * Copyright (C) 2010, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_DATATYPE_H__
+#define __STARPU_MPI_DATATYPE_H__
+
+#include <starpu_mpi.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Build the MPI datatype matching data_handle's interface; *user_datatype is
+ * set to 0 for StarPU-provided datatypes, 1 when MPI_BYTE is used. */
+void _starpu_mpi_handle_allocate_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype, int *user_datatype);
+/* Release a datatype built by _starpu_mpi_handle_allocate_datatype. */
+void _starpu_mpi_handle_free_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype);
+/* Printable name of a predefined MPI datatype (debug helper). */
+char *_starpu_mpi_datatype(MPI_Datatype datatype);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __STARPU_MPI_DATATYPE_H__

+ 116 - 0
nmad/src/starpu_mpi_fxt.h

@@ -0,0 +1,116 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_FXT_H__
+#define __STARPU_MPI_FXT_H__
+
+#include <starpu.h>
+#include <common/config.h>
+#include <common/fxt.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* FxT event codes for StarPU-MPI communication tracing.  The TRACE_MPI_*
+ * macros below emit these probes when FxT support is compiled in, and
+ * expand to no-ops otherwise. */
+#define FUT_MPI_START				0x5201
+#define FUT_MPI_STOP				0x5202
+#define FUT_MPI_BARRIER				0x5203
+#define FUT_MPI_ISEND_SUBMIT_BEGIN		0x5204
+#define FUT_MPI_ISEND_SUBMIT_END		0x5205
+#define FUT_MPI_IRECV_SUBMIT_BEGIN		0x5206
+#define FUT_MPI_IRECV_SUBMIT_END		0x5207
+#define FUT_MPI_ISEND_COMPLETE_BEGIN		0x5208
+#define FUT_MPI_ISEND_COMPLETE_END		0x5209
+#define FUT_MPI_IRECV_COMPLETE_BEGIN		0x5210
+#define FUT_MPI_IRECV_COMPLETE_END		0x5211
+#define FUT_MPI_SLEEP_BEGIN			0x5212
+#define FUT_MPI_SLEEP_END			0x5213
+#define FUT_MPI_DTESTING_BEGIN			0x5214
+#define FUT_MPI_DTESTING_END			0x5215
+#define FUT_MPI_UTESTING_BEGIN			0x5216
+#define FUT_MPI_UTESTING_END			0x5217
+#define FUT_MPI_UWAIT_BEGIN			0x5218
+#define FUT_MPI_UWAIT_END			0x5219
+
+#ifdef STARPU_USE_FXT
+#define TRACE_MPI_START(rank, worldsize)	\
+	FUT_DO_PROBE3(FUT_MPI_START, (rank), (worldsize), _starpu_gettid());
+#define TRACE_MPI_STOP(rank, worldsize)	\
+	FUT_DO_PROBE3(FUT_MPI_STOP, (rank), (worldsize), _starpu_gettid());
+#define TRACE_MPI_BARRIER(rank, worldsize, key)	\
+	FUT_DO_PROBE4(FUT_MPI_BARRIER, (rank), (worldsize), (key), _starpu_gettid());
+#define TRACE_MPI_ISEND_SUBMIT_BEGIN(dest, mpi_tag, size)	\
+	FUT_DO_PROBE4(FUT_MPI_ISEND_SUBMIT_BEGIN, (dest), (mpi_tag), (size), _starpu_gettid());
+#define TRACE_MPI_ISEND_SUBMIT_END(dest, mpi_tag, size)	\
+	FUT_DO_PROBE4(FUT_MPI_ISEND_SUBMIT_END, (dest), (mpi_tag), (size), _starpu_gettid());
+#define TRACE_MPI_IRECV_SUBMIT_BEGIN(src, mpi_tag)	\
+	FUT_DO_PROBE3(FUT_MPI_IRECV_SUBMIT_BEGIN, (src), (mpi_tag), _starpu_gettid());
+#define TRACE_MPI_IRECV_SUBMIT_END(src, mpi_tag)	\
+	FUT_DO_PROBE3(FUT_MPI_IRECV_SUBMIT_END, (src), (mpi_tag), _starpu_gettid());
+#define TRACE_MPI_ISEND_COMPLETE_BEGIN(dest, mpi_tag, size)	\
+	FUT_DO_PROBE4(FUT_MPI_ISEND_COMPLETE_BEGIN, (dest), (mpi_tag), (size), _starpu_gettid());
+#define TRACE_MPI_ISEND_COMPLETE_END(dest, mpi_tag, size)	\
+	FUT_DO_PROBE4(FUT_MPI_ISEND_COMPLETE_END, (dest), (mpi_tag), (size), _starpu_gettid());
+#define TRACE_MPI_IRECV_COMPLETE_BEGIN(src, mpi_tag)	\
+	FUT_DO_PROBE3(FUT_MPI_IRECV_COMPLETE_BEGIN, (src), (mpi_tag), _starpu_gettid());
+#define TRACE_MPI_IRECV_COMPLETE_END(src, mpi_tag)	\
+	FUT_DO_PROBE3(FUT_MPI_IRECV_COMPLETE_END, (src), (mpi_tag), _starpu_gettid());
+#define TRACE_MPI_SLEEP_BEGIN()	\
+	FUT_DO_PROBE1(FUT_MPI_SLEEP_BEGIN, _starpu_gettid());
+#define TRACE_MPI_SLEEP_END()	\
+	FUT_DO_PROBE1(FUT_MPI_SLEEP_END, _starpu_gettid());
+#define TRACE_MPI_DTESTING_BEGIN()	\
+	FUT_DO_PROBE1(FUT_MPI_DTESTING_BEGIN,  _starpu_gettid());
+#define TRACE_MPI_DTESTING_END()	\
+	FUT_DO_PROBE1(FUT_MPI_DTESTING_END, _starpu_gettid());
+#define TRACE_MPI_UTESTING_BEGIN(src, mpi_tag)	\
+	FUT_DO_PROBE3(FUT_MPI_UTESTING_BEGIN, (src), (mpi_tag),  _starpu_gettid());
+#define TRACE_MPI_UTESTING_END(src, mpi_tag)	\
+	FUT_DO_PROBE3(FUT_MPI_UTESTING_END, (src), (mpi_tag), _starpu_gettid());
+#define TRACE_MPI_UWAIT_BEGIN(src, mpi_tag)	\
+	FUT_DO_PROBE3(FUT_MPI_UWAIT_BEGIN, (src), (mpi_tag),  _starpu_gettid());
+#define TRACE_MPI_UWAIT_END(src, mpi_tag)	\
+	FUT_DO_PROBE3(FUT_MPI_UWAIT_END, (src), (mpi_tag), _starpu_gettid());
+/* NOTE(review): bare "#define TRACE" with no body below looks like a
+ * leftover — confirm it is intentional before removing. */
+#define TRACE
+#else
+#define TRACE_MPI_START(a, b)				do {} while(0);
+#define TRACE_MPI_STOP(a, b)				do {} while(0);
+#define TRACE_MPI_BARRIER(a, b, c)			do {} while(0);
+#define TRACE_MPI_ISEND_SUBMIT_BEGIN(a, b, c)		do {} while(0);
+#define TRACE_MPI_ISEND_SUBMIT_END(a, b, c)		do {} while(0);
+#define TRACE_MPI_IRECV_SUBMIT_BEGIN(a, b)		do {} while(0);
+#define TRACE_MPI_IRECV_SUBMIT_END(a, b)		do {} while(0);
+#define TRACE_MPI_ISEND_COMPLETE_BEGIN(a, b, c)		do {} while(0);
+#define TRACE_MPI_ISEND_COMPLETE_END(a, b, c)		do {} while(0);
+#define TRACE_MPI_IRECV_COMPLETE_BEGIN(a, b)		do {} while(0);
+#define TRACE_MPI_IRECV_COMPLETE_END(a, b)		do {} while(0);
+#define TRACE_MPI_SLEEP_BEGIN()				do {} while(0);
+#define TRACE_MPI_SLEEP_END()				do {} while(0);
+#define TRACE_MPI_DTESTING_BEGIN()			do {} while(0);
+#define TRACE_MPI_DTESTING_END()			do {} while(0);
+#define TRACE_MPI_UTESTING_BEGIN(a, b)			do {} while(0);
+#define TRACE_MPI_UTESTING_END(a, b)			do {} while(0);
+#define TRACE_MPI_UWAIT_BEGIN(a, b)			do {} while(0);
+#define TRACE_MPI_UWAIT_END(a, b)			do {} while(0);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif // __STARPU_MPI_FXT_H__

+ 105 - 0
nmad/src/starpu_mpi_helper.c

@@ -0,0 +1,105 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+
+/* Detached-communication callback: read the application tag out of the
+ * heap-allocated starpu_tag_t pointed to by 'arg', release the storage,
+ * then notify the tag so dependent tasks can proceed. */
+static void starpu_mpi_unlock_tag_callback(void *arg)
+{
+	starpu_tag_t tag = *(starpu_tag_t *) arg;
+
+	free(arg);
+	starpu_tag_notify_from_apps(tag);
+}
+
+/* Post a detached send of 'data_handle' to node 'dest' with MPI tag
+ * 'mpi_tag'; when the send completes, the application tag 'tag' is
+ * notified.  Returns the status of the underlying detached send. */
+int starpu_mpi_isend_detached_unlock_tag(starpu_data_handle_t data_handle,
+				int dest, int mpi_tag, MPI_Comm comm, starpu_tag_t tag)
+{
+	starpu_tag_t *tag_copy = malloc(sizeof *tag_copy);
+
+	*tag_copy = tag;
+	return starpu_mpi_isend_detached(data_handle, dest, mpi_tag, comm,
+					 starpu_mpi_unlock_tag_callback, tag_copy);
+}
+
+
+/* Post a detached receive of 'data_handle' from node 'source' with MPI
+ * tag 'mpi_tag'; when the receive completes, the application tag 'tag'
+ * is notified.  Returns the status of the underlying detached receive. */
+int starpu_mpi_irecv_detached_unlock_tag(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, starpu_tag_t tag)
+{
+	starpu_tag_t *tag_copy = malloc(sizeof *tag_copy);
+
+	*tag_copy = tag;
+	return starpu_mpi_irecv_detached(data_handle, source, mpi_tag, comm,
+					 starpu_mpi_unlock_tag_callback, tag_copy);
+}
+
+/* Shared completion state for the *_array_detached_unlock_tag helpers:
+ * 'array_size' counts the requests still in flight; 'tag' is the
+ * application tag notified when the counter reaches zero. */
+struct arg_array
+{
+	int array_size;
+	starpu_tag_t tag;
+};
+
+/* Callback shared by all requests of one array operation: atomically
+ * decrement the in-flight counter; the last completing request notifies
+ * the tag and frees the shared state.  Assumes the counter was
+ * initialized to a value > 0 (one per posted request). */
+static void starpu_mpi_array_unlock_callback(void *_arg)
+{
+	struct arg_array *arg = _arg;
+
+	int remaining = STARPU_ATOMIC_ADD(&arg->array_size, -1);
+
+	if (remaining == 0)
+	{
+		starpu_tag_notify_from_apps(arg->tag);
+		free(arg);
+	}
+}
+
+/* Post 'array_size' detached sends (element i of each array describes
+ * send i); once ALL of them have completed, notify the application tag
+ * 'tag'.  Always returns 0.
+ * Fix: with an empty array the original allocated the shared state,
+ * never posted any request, and therefore leaked the state and never
+ * notified the tag — notify immediately instead. */
+int starpu_mpi_isend_array_detached_unlock_tag(unsigned array_size,
+		starpu_data_handle_t *data_handle, int *dest, int *mpi_tag,
+		MPI_Comm *comm, starpu_tag_t tag)
+{
+	struct arg_array *arg;
+	unsigned elem;
+
+	if (array_size == 0)
+	{
+		/* Nothing to send: the unlock condition already holds. */
+		starpu_tag_notify_from_apps(tag);
+		return 0;
+	}
+
+	arg = malloc(sizeof(struct arg_array));
+	arg->array_size = array_size;
+	arg->tag = tag;
+
+	for (elem = 0; elem < array_size; elem++)
+	{
+		starpu_mpi_isend_detached(data_handle[elem], dest[elem],
+				mpi_tag[elem], comm[elem],
+				starpu_mpi_array_unlock_callback, arg);
+	}
+
+	return 0;
+}
+
+
+/* Post 'array_size' detached receives (element i of each array describes
+ * receive i); once ALL of them have completed, notify the application
+ * tag 'tag'.  Always returns 0.
+ * Fix: with an empty array the original leaked the shared state and the
+ * tag was never notified — notify immediately instead. */
+int starpu_mpi_irecv_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *source, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag)
+{
+	struct arg_array *arg;
+	unsigned elem;
+
+	if (array_size == 0)
+	{
+		/* Nothing to receive: the unlock condition already holds. */
+		starpu_tag_notify_from_apps(tag);
+		return 0;
+	}
+
+	arg = malloc(sizeof(struct arg_array));
+	arg->array_size = array_size;
+	arg->tag = tag;
+
+	for (elem = 0; elem < array_size; elem++)
+	{
+		starpu_mpi_irecv_detached(data_handle[elem], source[elem],
+				mpi_tag[elem], comm[elem],
+				starpu_mpi_array_unlock_callback, arg);
+	}
+
+	return 0;
+}

+ 25 - 0
nmad/src/starpu_mpi_private.c

@@ -0,0 +1,25 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2012, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/* Rank of the local process, cached for debug messages; -1 until first use. */
+int _starpu_debug_rank=-1;
+/* Verbosity threshold consumed by _STARPU_MPI_DEBUG. */
+int _starpu_debug_level=0;
+
+/* Set the verbosity threshold for _STARPU_MPI_DEBUG messages.
+ * NOTE(review): starpu_mpi_private.h declares this function only when
+ * STARPU_VERBOSE is defined, while it is defined here unconditionally —
+ * confirm which guard is intended. */
+void _starpu_mpi_set_debug_level(int level)
+{
+	_starpu_debug_level = level;
+}
+

+ 173 - 0
nmad/src/starpu_mpi_private.h

@@ -0,0 +1,173 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2012-2015  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_PRIVATE_H__
+#define __STARPU_MPI_PRIVATE_H__
+
+#include <starpu.h>
+#include <common/config.h>
+#include "starpu_mpi.h"
+#include "starpu_mpi_fxt.h"
+#include <common/list.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int _starpu_debug_rank;
+
+#ifdef STARPU_VERBOSE
+extern int _starpu_debug_level;
+void _starpu_mpi_set_debug_level(int level);
+#endif
+
+/* STARPU_MPI_ASSERT_MSG(cond, msg, ...): abort with a rank-prefixed
+ * message when 'cond' is false; compiled out under STARPU_NO_ASSERT.
+ * The nvcc/Windows variant crashes via a deliberate NULL store because
+ * assert() is presumably unusable there — TODO confirm. */
+#ifdef STARPU_NO_ASSERT
+#  define STARPU_MPI_ASSERT_MSG(x, msg, ...)	do { if (0) { (void) (x); }} while(0)
+#else
+#  if defined(__CUDACC__) && defined(STARPU_HAVE_WINDOWS)
+/* NOTE(review): this DEFINES (not just declares) _starpu_debug_rank in a
+ * header; if several translation units include it under this guard, that
+ * is a duplicate-symbol error — verify the intent. */
+int _starpu_debug_rank;
+#    define STARPU_MPI_ASSERT_MSG(x, msg, ...)									\
+	do													\
+	{ 													\
+		if (STARPU_UNLIKELY(!(x))) 									\
+		{												\
+			if (_starpu_debug_rank == -1) starpu_mpi_comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
+			fprintf(stderr, "\n[%d][starpu_mpi][%s][assert failure] " msg "\n\n", _starpu_debug_rank, __starpu_func__, ## __VA_ARGS__); *(int*)NULL = 0; \
+		} \
+	} while(0)
+#  else
+#    define STARPU_MPI_ASSERT_MSG(x, msg, ...)	\
+	do \
+	{ \
+		if (STARPU_UNLIKELY(!(x))) \
+		{ \
+			if (_starpu_debug_rank == -1) starpu_mpi_comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
+			fprintf(stderr, "\n[%d][starpu_mpi][%s][assert failure] " msg "\n\n", _starpu_debug_rank, __starpu_func__, ## __VA_ARGS__); \
+		} \
+		assert(x); \
+	} while(0)
+
+#  endif
+#endif
+	
+#define _STARPU_MPI_MALLOC(ptr, size) do { ptr = malloc(size); STARPU_MPI_ASSERT_MSG(ptr != NULL, "Cannot allocate %ld bytes\n", (long) size); } while (0)
+#define _STARPU_MPI_CALLOC(ptr, nmemb, size) do { ptr = calloc(nmemb, size); STARPU_MPI_ASSERT_MSG(ptr != NULL, "Cannot allocate %ld bytes\n", (long) (nmemb*size)); } while (0)
+#define _STARPU_MPI_REALLOC(ptr, size) do { ptr = realloc(ptr, size); STARPU_MPI_ASSERT_MSG(ptr != NULL, "Cannot reallocate %ld bytes\n", (long) size); } while (0)
+
+/* Rank-prefixed logging helpers.
+ * Fix: removed the stray ';' after each `while(0)` — call sites already
+ * supply their own semicolon (e.g. `_STARPU_MPI_DEBUG(1, "...\n");`), so
+ * the old form produced an extra empty statement and broke when the
+ * macro was used inside an un-braced if/else.
+ *
+ * _STARPU_MPI_DEBUG(level, fmt, ...): emitted only in STARPU_VERBOSE
+ * builds when 'level' does not exceed the runtime debug level. */
+#ifdef STARPU_VERBOSE
+#  define _STARPU_MPI_DEBUG(level, fmt, ...) \
+	do \
+	{								\
+		if (!_starpu_silent && level <= _starpu_debug_level)	\
+		{							\
+			if (_starpu_debug_rank == -1) MPI_Comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
+			fprintf(stderr, "%*s[%d][starpu_mpi][%s] " fmt , (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__ ,## __VA_ARGS__); \
+			fflush(stderr); \
+		}			\
+	} while(0)
+#else
+#  define _STARPU_MPI_DEBUG(level, fmt, ...)
+#endif
+
+/* Unconditional rank-prefixed message (muted by STARPU_SILENT). */
+#define _STARPU_MPI_DISP(fmt, ...) \
+	do \
+	{ \
+		if (!_starpu_silent) \
+		{ \
+			if (_starpu_debug_rank == -1) MPI_Comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
+			fprintf(stderr, "%*s[%d][starpu_mpi][%s] " fmt , (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__ ,## __VA_ARGS__); \
+			fflush(stderr); \
+		} \
+	} while(0)
+/* Always-on message carrying the source location, for warnings/errors. */
+#define _STARPU_MPI_MSG(fmt, ...) \
+	do \
+	{ \
+		if (_starpu_debug_rank == -1) MPI_Comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
+		fprintf(stderr, "[%d][starpu_mpi][%s:%d] " fmt , _starpu_debug_rank, __starpu_func__ , __LINE__ ,## __VA_ARGS__); \
+		fflush(stderr); \
+	} while(0)
+
+#ifdef STARPU_VERBOSE0
+#  define _STARPU_MPI_LOG_IN()             do { if (!_starpu_silent) { \
+                                               if (_starpu_debug_rank == -1) MPI_Comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank);                        \
+                                               fprintf(stderr, "%*s[%d][starpu_mpi][%s] -->\n", (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__ ); \
+                                               fflush(stderr); }} while(0)
+#  define _STARPU_MPI_LOG_OUT()            do { if (!_starpu_silent) { \
+                                               if (_starpu_debug_rank == -1) MPI_Comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank);                        \
+                                               fprintf(stderr, "%*s[%d][starpu_mpi][%s] <--\n", (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__ ); \
+                                               fflush(stderr); }} while(0)
+#else
+#  define _STARPU_MPI_LOG_IN()
+#  define _STARPU_MPI_LOG_OUT()
+#endif
+
+/* Kind of operation carried by a _starpu_mpi_req. */
+enum _starpu_mpi_request_type
+{
+	SEND_REQ=0,
+	RECV_REQ=1,
+	WAIT_REQ=2,
+	TEST_REQ=3,
+	BARRIER_REQ=4,
+	PROBE_REQ=5
+};
+
+/* One MPI request tracked by the StarPU-MPI runtime (send, recv, wait,
+ * test, barrier or probe), generated as a list element type by LIST_TYPE. */
+LIST_TYPE(_starpu_mpi_req,
+	/* description of the data at StarPU level */
+	starpu_data_handle_t data_handle;
+
+	/* description of the data to be sent/received */
+	MPI_Datatype datatype;
+	void *ptr;
+	starpu_ssize_t count;
+	int user_datatype;
+
+	/* who are we talking to ? */
+	int srcdst;
+	int mpi_tag;
+	MPI_Comm comm;
+
+	/* function that actually performs this request */
+	void (*func)(struct _starpu_mpi_req *);
+
+	MPI_Status *status;
+	MPI_Request request;
+	int *flag;
+	unsigned sync;
+
+	/* result code, guarded by req_mutex/req_cond */
+	int ret;
+	starpu_pthread_mutex_t req_mutex;
+	starpu_pthread_cond_t req_cond;
+
+	enum _starpu_mpi_request_type request_type; /* see enum _starpu_mpi_request_type */
+
+	unsigned submitted;
+	unsigned completed;
+
+	/* In the case of a Wait/Test request, we are going to post a request
+	 * to test the completion of another request */
+	struct _starpu_mpi_req *other_request;
+
+	/* in the case of detached requests */
+	unsigned detached;
+	void *callback_arg;
+	void (*callback)(void *);
+
+        /* in the case of user-defined datatypes, we need to send the size of the data */
+	MPI_Request size_req;
+);
+
+/* Per-handle MPI metadata: the communication tag, the owner rank and the
+ * communicator the handle is registered on. */
+struct _starpu_mpi_data
+{
+	int tag;
+	int rank;
+	MPI_Comm comm;
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __STARPU_MPI_PRIVATE_H__

+ 117 - 0
nmad/src/starpu_mpi_select_node.c

@@ -0,0 +1,117 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014, 2015, 2016  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdarg.h>
+#include <mpi.h>
+
+#include <starpu.h>
+#include <starpu_mpi.h>
+#include <starpu_data.h>
+#include <starpu_mpi_private.h>
+#include <starpu_mpi_select_node.h>
+#include <datawizard/coherency.h>
+
+static int _current_policy = STARPU_MPI_NODE_SELECTION_MOST_R_DATA;
+static int _last_predefined_policy = STARPU_MPI_NODE_SELECTION_MOST_R_DATA;
+static starpu_mpi_select_node_policy_func_t _policies[_STARPU_MPI_NODE_SELECTION_MAX_POLICY];
+
+int _starpu_mpi_select_node_with_most_R_data(int me, int nb_nodes, struct starpu_data_descr *descr, int nb_data);
+
+/* Install the built-in node-selection policies and clear every
+ * user-registrable slot of the policy table. */
+void _starpu_mpi_select_node_init()
+{
+	int slot;
+
+	_policies[STARPU_MPI_NODE_SELECTION_MOST_R_DATA] = _starpu_mpi_select_node_with_most_R_data;
+	for (slot = _last_predefined_policy + 1; slot < _STARPU_MPI_NODE_SELECTION_MAX_POLICY; slot++)
+	{
+		_policies[slot] = NULL;
+	}
+}
+
+/* Return the identifier of the node-selection policy currently in use. */
+int starpu_mpi_node_selection_get_current_policy()
+{
+	int policy = _current_policy;
+	return policy;
+}
+
+/* Make 'policy' the default node-selection policy.  Returns 0.
+ * Fix: validate the index before dereferencing _policies[] — an
+ * out-of-range value used to read out of bounds before being caught. */
+int starpu_mpi_node_selection_set_current_policy(int policy)
+{
+	STARPU_ASSERT_MSG(policy >= 0 && policy < _STARPU_MPI_NODE_SELECTION_MAX_POLICY, "Policy %d invalid.\n", policy);
+	STARPU_ASSERT_MSG(_policies[policy] != NULL, "Policy %d invalid.\n", policy);
+	_current_policy = policy;
+	return 0;
+}
+
+/* Register a user-provided node-selection policy in the first free slot
+ * after the predefined ones; return the slot identifier. */
+int starpu_mpi_node_selection_register_policy(starpu_mpi_select_node_policy_func_t policy_func)
+{
+	int slot;
+
+	/* Scan the user-registrable part of the table for a free entry. */
+	for (slot = _last_predefined_policy + 1; slot < _STARPU_MPI_NODE_SELECTION_MAX_POLICY; slot++)
+	{
+		if (_policies[slot] == NULL)
+			break;
+	}
+	STARPU_ASSERT_MSG(slot < _STARPU_MPI_NODE_SELECTION_MAX_POLICY, "No unused policy available. Unregister existing policies before registering a new one.");
+	_policies[slot] = policy_func;
+	return slot;
+}
+
+/* Remove a previously registered user policy.  Returns 0.
+ * Fix: also check the upper bound — an identifier >= the table size used
+ * to write out of bounds. */
+int starpu_mpi_node_selection_unregister_policy(int policy)
+{
+	STARPU_ASSERT_MSG(policy > _last_predefined_policy && policy < _STARPU_MPI_NODE_SELECTION_MAX_POLICY, "Policy %d invalid. Only user-registered policies can be unregistered\n", policy);
+	_policies[policy] = NULL;
+	return 0;
+}
+
+/* Built-in policy: pick the node owning the largest total amount of data
+ * accessed in STARPU_R mode; with no R data (or all sizes 0) node 0 wins.
+ * Fix: skip handles whose rank is unset (-1) and assert ranks fit the
+ * accumulation table — both used to index size_on_nodes out of bounds. */
+int _starpu_mpi_select_node_with_most_R_data(int me, int nb_nodes, struct starpu_data_descr *descr, int nb_data)
+{
+	size_t *size_on_nodes;
+	size_t max_size;
+	int i;
+	int xrank = 0;
+
+	(void)me;
+	_STARPU_MPI_CALLOC(size_on_nodes, nb_nodes, sizeof(size_t));
+
+	for(i= 0 ; i<nb_data ; i++)
+	{
+		starpu_data_handle_t data = descr[i].handle;
+		enum starpu_data_access_mode mode = descr[i].mode;
+		if (mode & STARPU_R)
+		{
+			int rank = starpu_data_get_rank(data);
+			if (rank == -1)
+				/* Data without a registered owner cannot vote. */
+				continue;
+			STARPU_ASSERT_MSG(rank < nb_nodes, "Invalid rank %d (only %d nodes)", rank, nb_nodes);
+			size_on_nodes[rank] += data->ops->get_size(data);
+		}
+	}
+
+	max_size = 0;
+	for(i=0 ; i<nb_nodes ; i++)
+	{
+		if (size_on_nodes[i] > max_size)
+		{
+			max_size = size_on_nodes[i];
+			xrank = i;
+		}
+	}
+
+	free(size_on_nodes);
+	return xrank;
+}
+
+/* Resolve 'policy' (STARPU_MPI_NODE_SELECTION_CURRENT_POLICY maps to the
+ * current default) and run the corresponding selection function.
+ * Fix: also reject negative identifiers, which used to index the policy
+ * table out of bounds. */
+int _starpu_mpi_select_node(int me, int nb_nodes, struct starpu_data_descr *descr, int nb_data, int policy)
+{
+	int ppolicy = policy == STARPU_MPI_NODE_SELECTION_CURRENT_POLICY ? _current_policy : policy;
+	STARPU_ASSERT_MSG(ppolicy >= 0 && ppolicy < _STARPU_MPI_NODE_SELECTION_MAX_POLICY, "Invalid policy %d\n", ppolicy);
+	STARPU_ASSERT_MSG(_policies[ppolicy], "Unregistered policy %d\n", ppolicy);
+	starpu_mpi_select_node_policy_func_t func = _policies[ppolicy];
+	return func(me, nb_nodes, descr, nb_data);
+}

+ 36 - 0
nmad/src/starpu_mpi_select_node.h

@@ -0,0 +1,36 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2014, 2015, 2017  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_SELECT_NODE_H__
+#define __STARPU_MPI_SELECT_NODE_H__
+
+#include <mpi.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#define _STARPU_MPI_NODE_SELECTION_MAX_POLICY 24
+
+void _starpu_mpi_select_node_init();
+int _starpu_mpi_select_node(int me, int nb_nodes, struct starpu_data_descr *descr, int nb_data, int policy);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __STARPU_MPI_SELECT_NODE_H__

+ 94 - 0
nmad/src/starpu_mpi_stats.c

@@ -0,0 +1,94 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi_stats.h>
+#include <common/config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <starpu_mpi_private.h>
+
+/* measure the amount of data transfers between each pair of MPI nodes */
+static size_t *comm_amount;
+static int world_size;
+static int stats_enabled=0;
+
+/* Enable communication statistics when the STARPU_COMM_STATS environment
+ * variable is set, and allocate one byte counter per node of 'comm'. */
+void _starpu_mpi_comm_amounts_init(MPI_Comm comm)
+{
+	stats_enabled = starpu_get_env_number("STARPU_COMM_STATS");
+	if (stats_enabled == -1)
+		stats_enabled = 0;
+	if (!stats_enabled)
+		return;
+
+	if (!_starpu_silent)
+		fprintf(stderr,"Warning: StarPU is executed with STARPU_COMM_STATS=1, which slows down a bit\n");
+
+	MPI_Comm_size(comm, &world_size);
+	_STARPU_MPI_DEBUG(1, "allocating for %d nodes\n", world_size);
+
+	comm_amount = (size_t *) calloc(world_size, sizeof(size_t));
+}
+
+/* Release the statistics table; a no-op when stats are disabled.
+ * Fix: reset the pointer so a second call does not double-free. */
+void _starpu_mpi_comm_amounts_free()
+{
+	if (stats_enabled == 0) return;
+	free(comm_amount);
+	comm_amount = NULL;
+}
+
+/* Account 'count' items of 'datatype' sent from the local rank to 'dst'. */
+void _starpu_mpi_comm_amounts_inc(MPI_Comm comm, unsigned dst, MPI_Datatype datatype, int count)
+{
+	int me, type_size;
+
+	if (!stats_enabled)
+		return;
+
+	MPI_Comm_rank(comm, &me);
+	MPI_Type_size(datatype, &type_size);
+
+	_STARPU_MPI_DEBUG(1, "[%d] adding %d to %d\n", me, count*type_size, dst);
+
+	comm_amount[dst] += count*type_size;
+}
+
+/* Copy the per-node byte counters into the caller-provided array, which
+ * must hold at least world_size entries.
+ * Note: when stats are disabled the output buffer is left untouched. */
+void starpu_mpi_comm_amounts_retrieve(size_t *comm_amounts)
+{
+	if (stats_enabled == 0) return;
+	memcpy(comm_amounts, comm_amount, world_size * sizeof(size_t));
+}
+
+/* Print the total and per-destination communication volume of 'node' to
+ * stderr; a no-op when stats are disabled. */
+void _starpu_mpi_comm_amounts_display(int node)
+{
+	int peer;
+	size_t total = 0;
+
+	if (!stats_enabled)
+		return;
+
+	for (peer = 0; peer < world_size; peer++)
+		total += comm_amount[peer];
+
+	fprintf(stderr, "\n[starpu_comm_stats][%d] TOTAL:\t%f B\t%f MB\n", node, (float)total, (float)total/1024/1024);
+
+	for (peer = 0; peer < world_size; peer++)
+	{
+		if (comm_amount[peer])
+			fprintf(stderr, "[starpu_comm_stats][%d->%d]\t%f B\t%f MB\n",
+				node, peer, (float)comm_amount[peer], ((float)comm_amount[peer])/(1024*1024));
+	}
+}
+

+ 36 - 0
nmad/src/starpu_mpi_stats.h

@@ -0,0 +1,36 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_STATS_H__
+#define __STARPU_MPI_STATS_H__
+
+#include <stdlib.h>
+#include <mpi.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void _starpu_mpi_comm_amounts_init(MPI_Comm comm);
+void _starpu_mpi_comm_amounts_free();
+void _starpu_mpi_comm_amounts_inc(MPI_Comm comm, unsigned dst, MPI_Datatype datatype, int count);
+void _starpu_mpi_comm_amounts_display(int node);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __STARPU_MPI_STATS_H__

+ 775 - 0
nmad/src/starpu_mpi_task_insert.c

@@ -0,0 +1,775 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
+ * Copyright (C) 2011-2017  Université de Bordeaux
+ * Copyright (C) 2014, 2016 Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdarg.h>
+#include <mpi.h>
+
+#include <starpu.h>
+#include <starpu_data.h>
+#include <common/utils.h>
+#include <util/starpu_task_insert_utils.h>
+#include <datawizard/coherency.h>
+#include <core/task.h>
+
+#include <starpu_mpi_private.h>
+#include <starpu_mpi_cache.h>
+#include <starpu_mpi_select_node.h>
+
+#define _SEND_DATA(data, mode, dest, data_tag, comm, callback, arg)     \
+	if (mode & STARPU_SSEND)					\
+		starpu_mpi_issend_detached(data, dest, data_tag, comm, callback, arg); \
+	else								\
+		starpu_mpi_isend_detached(data, dest, data_tag, comm, callback, arg);
+
+static void (*pre_submit_hook)(struct starpu_task *task) = NULL;
+
+/* Install 'f' as the hook invoked on each task before submission; warns
+ * when it silently replaces a previously installed hook.  Returns 0. */
+int starpu_mpi_pre_submit_hook_register(void (*f)(struct starpu_task *))
+{
+	if (pre_submit_hook != NULL)
+		_STARPU_MSG("Warning: a pre_submit_hook has already been registered. Please check if you really want to erase the previously registered hook.\n");
+	pre_submit_hook = f;
+	return 0;
+}
+
+/* Drop the currently installed pre-submit hook, if any.  Returns 0. */
+int starpu_mpi_pre_submit_hook_unregister()
+{
+	pre_submit_hook = NULL;
+	return 0;
+}
+
+/* Inspect one (data, mode) pair to help choose the node that will run the
+ * task: a STARPU_W handle votes for its owner rank.  Updates *xrank (the
+ * candidate, -1 while unset), *do_execute (whether 'me' is the candidate)
+ * and *inconsistent_execute (set when two W handles disagree).  Returns
+ * 0, or -EINVAL when a W handle is NULL. */
+int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int *do_execute, int *inconsistent_execute, int *xrank)
+{
+	if (mode & STARPU_W)
+	{
+		if (!data)
+		{
+			/* We don't have anything allocated for this.
+			 * The application knows we won't do anything
+			 * about this task */
+			/* Yes, the app could actually not call
+			 * task_insert at all itself, this is just a
+			 * safeguard. */
+			_STARPU_MPI_DEBUG(3, "oh oh\n");
+			_STARPU_MPI_LOG_OUT();
+			return -EINVAL;
+		}
+
+		int mpi_rank = starpu_mpi_data_get_rank(data);
+		if (mpi_rank == -1)
+		{
+			_STARPU_ERROR("Data %p with mode STARPU_W needs to have a valid rank", data);
+		}
+
+		if (*xrank == -1)
+		{
+			// No node has been selected yet
+			*xrank = mpi_rank;
+			_STARPU_MPI_DEBUG(100, "Codelet is going to be executed by node %d\n", *xrank);
+			*do_execute = (mpi_rank == me);
+		}
+		else if (mpi_rank != *xrank)
+		{
+			_STARPU_MPI_DEBUG(100, "Another node %d had already been selected to execute the codelet\n", *xrank);
+			*inconsistent_execute = 1;
+		}
+	}
+	_STARPU_MPI_DEBUG(100, "Executing: inconsistent=%d, do_execute=%d, xrank=%d\n", *inconsistent_execute, *do_execute, *xrank);
+	return 0;
+}
+
+/* Before the task runs: make sure the executing node (xrank) holds every
+ * STARPU_R input.  The executee posts a detached receive for data it does
+ * not own; the owner posts a detached send toward the executee.  The
+ * communication cache suppresses transfers already performed. */
+void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int xrank, int do_execute, MPI_Comm comm)
+{
+	if (data && mode & STARPU_R)
+	{
+		int mpi_rank = starpu_mpi_data_get_rank(data);
+		int data_tag = starpu_mpi_data_get_tag(data);
+		if (mpi_rank == -1)
+		{
+			_STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register\n");
+		}
+
+		if (do_execute && mpi_rank != me)
+		{
+			/* The node is going to execute the codelet, but it does not own the data, it needs to receive the data from the owner node */
+			int already_received = _starpu_mpi_cache_received_data_set(data);
+			if (already_received == 0)
+			{
+				if (data_tag == -1)
+					_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
+				_STARPU_MPI_DEBUG(1, "Receiving data %p from %d\n", data, mpi_rank);
+				starpu_mpi_irecv_detached(data, mpi_rank, data_tag, comm, NULL, NULL);
+			}
+			// else the node has already received the data
+		}
+
+		if (!do_execute && mpi_rank == me)
+		{
+			/* The node owns the data, but another node is going to execute the codelet, the node needs to send the data to the executee node. */
+			int already_sent = _starpu_mpi_cache_sent_data_set(data, xrank);
+			if (already_sent == 0)
+			{
+				if (data_tag == -1)
+					_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
+				_STARPU_MPI_DEBUG(1, "Sending data %p to %d\n", data, xrank);
+				_SEND_DATA(data, mode, xrank, data_tag, comm, NULL, NULL);
+			}
+			// Else the data has already been sent
+		}
+	}
+}
+
+/* After the task ran: return every STARPU_W output to its owner.  The
+ * owner posts a detached receive from the executee; the executee posts a
+ * detached send back to the owner.  No-op when the owner executed. */
+static
+void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int xrank, int do_execute, MPI_Comm comm)
+{
+	if (mode & STARPU_W)
+	{
+		int mpi_rank = starpu_mpi_data_get_rank(data);
+		int data_tag = starpu_mpi_data_get_tag(data);
+		if(mpi_rank == -1)
+		{
+			_STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register\n");
+		}
+		if (mpi_rank == me)
+		{
+			if (xrank != -1 && me != xrank)
+			{
+				_STARPU_MPI_DEBUG(1, "Receive data %p back from the task %d which executed the codelet ...\n", data, xrank);
+				if(data_tag == -1)
+					_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
+				starpu_mpi_irecv_detached(data, xrank, data_tag, comm, NULL, NULL);
+			}
+		}
+		else if (do_execute)
+		{
+			if(data_tag == -1)
+				_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
+			_STARPU_MPI_DEBUG(1, "Send data %p back to its owner %d...\n", data, mpi_rank);
+			_SEND_DATA(data, mode, mpi_rank, data_tag, comm, NULL, NULL);
+		}
+	}
+}
+
+/* Post-task cleanup: with the cache enabled, written/reduced data must be
+ * evicted from the send/receive caches; with the cache disabled, a
+ * temporary replica received for reading is invalidated locally. */
+static
+void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int do_execute)
+{
+	if (_starpu_cache_enabled)
+	{
+		if (mode & STARPU_W || mode & STARPU_REDUX)
+		{
+			/* The data has been modified, it MUST be removed from the cache */
+			_starpu_mpi_cache_sent_data_clear(data);
+			_starpu_mpi_cache_received_data_clear(data);
+		}
+	}
+	else
+	{
+		/* We allocated a temporary buffer for the received data, now drop it */
+		if ((mode & STARPU_R) && do_execute)
+		{
+			int mpi_rank = starpu_mpi_data_get_rank(data);
+			if (mpi_rank != me && mpi_rank != -1)
+			{
+				starpu_data_invalidate_submit(data);
+			}
+		}
+	}
+}
+
+/* Walk the starpu_mpi_task_insert() varargs once to (a) collect every data
+ * handle with its access mode into *descrs_p/*nb_data_p and (b) decide the
+ * executing node (*xrank) and whether 'me' runs the task (*do_execute),
+ * honouring STARPU_EXECUTE_ON_NODE/DATA and falling back to the selection
+ * policy when W owners disagree.  Returns 0, or -EINVAL for a NULL W
+ * handle.  The caller owns (frees) *descrs_p on success.
+ * Fix: the STARPU_CL_ARGS branch consumed its two arguments from
+ * 'varg_list' instead of 'varg_list_copy', desynchronizing the two va_list
+ * cursors and corrupting the decoding of every subsequent argument. */
+static
+int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nodes, int *xrank, int *do_execute, struct starpu_data_descr **descrs_p, int *nb_data_p, va_list varg_list)
+{
+	va_list varg_list_copy;
+	int inconsistent_execute = 0;
+	int arg_type;
+	int node_selected = 0;
+	int nb_allocated_data = 16;
+	struct starpu_data_descr *descrs;
+	int nb_data;
+	int select_node_policy = STARPU_MPI_NODE_SELECTION_CURRENT_POLICY;
+
+	_STARPU_TRACE_TASK_MPI_DECODE_START();
+
+	_STARPU_MPI_MALLOC(descrs, nb_allocated_data * sizeof(struct starpu_data_descr));
+	nb_data = 0;
+	*do_execute = -1;
+	*xrank = -1;
+
+	va_copy(varg_list_copy, varg_list);
+	while ((arg_type = va_arg(varg_list_copy, int)) != 0)
+	{
+		int arg_type_nocommute = arg_type & ~STARPU_COMMUTE;
+		if (arg_type==STARPU_EXECUTE_ON_NODE)
+		{
+			*xrank = va_arg(varg_list_copy, int);
+			if (node_selected == 0)
+			{
+				_STARPU_MPI_DEBUG(100, "Executing on node %d\n", *xrank);
+				*do_execute = 1;
+				node_selected = 1;
+				inconsistent_execute = 0;
+			}
+		}
+		else if (arg_type==STARPU_EXECUTE_ON_DATA)
+		{
+			starpu_data_handle_t data = va_arg(varg_list_copy, starpu_data_handle_t);
+			if (node_selected == 0)
+			{
+				*xrank = starpu_mpi_data_get_rank(data);
+				STARPU_ASSERT_MSG(*xrank != -1, "Rank of the data must be set using starpu_mpi_data_register() or starpu_data_set_rank()");
+				_STARPU_MPI_DEBUG(100, "Executing on data node %d\n", *xrank);
+				STARPU_ASSERT_MSG(*xrank <= nb_nodes, "Node %d to execute codelet is not a valid node (%d)", *xrank, nb_nodes);
+				*do_execute = 1;
+				node_selected = 1;
+				inconsistent_execute = 0;
+			}
+		}
+		else if (arg_type_nocommute & STARPU_R || arg_type_nocommute & STARPU_W || arg_type_nocommute & STARPU_RW || arg_type & STARPU_SCRATCH || arg_type & STARPU_REDUX)
+		{
+			starpu_data_handle_t data = va_arg(varg_list_copy, starpu_data_handle_t);
+			enum starpu_data_access_mode mode = (enum starpu_data_access_mode) arg_type;
+			if (node_selected == 0)
+			{
+				int ret = _starpu_mpi_find_executee_node(data, mode, me, do_execute, &inconsistent_execute, xrank);
+				if (ret == -EINVAL)
+				{
+					free(descrs);
+					va_end(varg_list_copy);
+					_STARPU_TRACE_TASK_MPI_DECODE_END();
+					return ret;
+				}
+			}
+			if (nb_data >= nb_allocated_data)
+			{
+				nb_allocated_data *= 2;
+				_STARPU_MPI_REALLOC(descrs, nb_allocated_data * sizeof(struct starpu_data_descr));
+			}
+			descrs[nb_data].handle = data;
+			descrs[nb_data].mode = mode;
+			nb_data ++;
+		}
+		else if (arg_type == STARPU_DATA_ARRAY)
+		{
+			starpu_data_handle_t *datas = va_arg(varg_list_copy, starpu_data_handle_t *);
+			int nb_handles = va_arg(varg_list_copy, int);
+			int i;
+
+			for(i=0 ; i<nb_handles ; i++)
+			{
+				STARPU_ASSERT_MSG(codelet->nbuffers == STARPU_VARIABLE_NBUFFERS || nb_data < codelet->nbuffers, "Too many data passed to starpu_mpi_task_insert");
+				enum starpu_data_access_mode mode = STARPU_CODELET_GET_MODE(codelet, nb_data);
+				if (node_selected == 0)
+				{
+					int ret = _starpu_mpi_find_executee_node(datas[i], mode, me, do_execute, &inconsistent_execute, xrank);
+					if (ret == -EINVAL)
+					{
+						free(descrs);
+						va_end(varg_list_copy);
+						_STARPU_TRACE_TASK_MPI_DECODE_END();
+						return ret;
+					}
+				}
+				if (nb_data >= nb_allocated_data)
+				{
+					nb_allocated_data *= 2;
+					_STARPU_MPI_REALLOC(descrs, nb_allocated_data * sizeof(struct starpu_data_descr));
+				}
+				descrs[nb_data].handle = datas[i];
+				descrs[nb_data].mode = mode;
+				nb_data ++;
+			}
+		}
+		else if (arg_type == STARPU_DATA_MODE_ARRAY)
+		{
+			struct starpu_data_descr *_descrs = va_arg(varg_list_copy, struct starpu_data_descr*);
+			int nb_handles = va_arg(varg_list_copy, int);
+			int i;
+
+			for(i=0 ; i<nb_handles ; i++)
+			{
+				enum starpu_data_access_mode mode = _descrs[i].mode;
+				if (node_selected == 0)
+				{
+					int ret = _starpu_mpi_find_executee_node(_descrs[i].handle, mode, me, do_execute, &inconsistent_execute, xrank);
+					if (ret == -EINVAL)
+					{
+						free(descrs);
+						va_end(varg_list_copy);
+						_STARPU_TRACE_TASK_MPI_DECODE_END();
+						return ret;
+					}
+				}
+				if (nb_data >= nb_allocated_data)
+				{
+					nb_allocated_data *= 2;
+					_STARPU_MPI_REALLOC(descrs, nb_allocated_data * sizeof(struct starpu_data_descr));
+				}
+				descrs[nb_data].handle = _descrs[i].handle;
+				descrs[nb_data].mode = mode;
+				nb_data ++;
+			}
+		}
+		else if (arg_type==STARPU_VALUE)
+		{
+			(void)va_arg(varg_list_copy, void *);
+			(void)va_arg(varg_list_copy, size_t);
+		}
+		else if (arg_type==STARPU_CL_ARGS)
+		{
+			/* Fixed: consume from varg_list_copy like every other
+			 * branch; reading varg_list here desynchronized the
+			 * two cursors. */
+			(void)va_arg(varg_list_copy, void *);
+			(void)va_arg(varg_list_copy, size_t);
+		}
+		else if (arg_type==STARPU_CALLBACK)
+		{
+			(void)va_arg(varg_list_copy, _starpu_callback_func_t);
+		}
+		else if (arg_type==STARPU_CALLBACK_WITH_ARG)
+		{
+			(void)va_arg(varg_list_copy, _starpu_callback_func_t);
+			(void)va_arg(varg_list_copy, void *);
+		}
+		else if (arg_type==STARPU_CALLBACK_ARG)
+		{
+			(void)va_arg(varg_list_copy, void *);
+		}
+		else if (arg_type==STARPU_PRIORITY)
+		{
+			(void)va_arg(varg_list_copy, int);
+		}
+		/* STARPU_EXECUTE_ON_NODE handled above */
+		/* STARPU_EXECUTE_ON_DATA handled above */
+		/* STARPU_DATA_ARRAY handled above */
+		/* STARPU_DATA_MODE_ARRAY handled above */
+		else if (arg_type==STARPU_TAG)
+		{
+			(void)va_arg(varg_list_copy, starpu_tag_t);
+		}
+		else if (arg_type==STARPU_HYPERVISOR_TAG)
+		{
+			(void)va_arg(varg_list_copy, int);
+		}
+		else if (arg_type==STARPU_FLOPS)
+		{
+			(void)va_arg(varg_list_copy, double);
+		}
+		else if (arg_type==STARPU_SCHED_CTX)
+		{
+			(void)va_arg(varg_list_copy, unsigned);
+		}
+		else if (arg_type==STARPU_PROLOGUE_CALLBACK)
+		{
+			(void)va_arg(varg_list_copy, _starpu_callback_func_t);
+		}
+		else if (arg_type==STARPU_PROLOGUE_CALLBACK_ARG)
+		{
+			(void)va_arg(varg_list_copy, void *);
+		}
+		else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP)
+		{
+			(void)va_arg(varg_list_copy, _starpu_callback_func_t);
+		}
+		else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP_ARG)
+		{
+			(void)va_arg(varg_list_copy, void *);
+		}
+		else if (arg_type==STARPU_EXECUTE_ON_WORKER)
+		{
+			// the flag is decoded and set later when
+			// calling function _starpu_task_insert_create()
+			(void)va_arg(varg_list_copy, int);
+		}
+		else if (arg_type==STARPU_TAG_ONLY)
+		{
+			(void)va_arg(varg_list_copy, starpu_tag_t);
+		}
+		else if (arg_type==STARPU_NAME)
+		{
+			(void)va_arg(varg_list_copy, const char *);
+		}
+		else if (arg_type==STARPU_POSSIBLY_PARALLEL)
+		{
+			(void)va_arg(varg_list_copy, unsigned);
+		}
+		else if (arg_type==STARPU_WORKER_ORDER)
+		{
+			// the flag is decoded and set later when
+			// calling function _starpu_task_insert_create()
+			(void)va_arg(varg_list_copy, unsigned);
+		}
+		else if (arg_type==STARPU_NODE_SELECTION_POLICY)
+		{
+			select_node_policy = va_arg(varg_list_copy, int);
+		}
+		else
+		{
+			STARPU_ABORT_MSG("Unrecognized argument %d, did you perhaps forget to end arguments with 0?\n", arg_type);
+		}
+
+	}
+	va_end(varg_list_copy);
+
+	if (inconsistent_execute == 1 || *xrank == -1)
+	{
+		// We need to find out which node is going to execute the codelet.
+		_STARPU_MPI_DISP("Different nodes are owning W data. The node to execute the codelet is going to be selected with the current selection node policy. See starpu_mpi_node_selection_set_current_policy() to change the policy, or use STARPU_EXECUTE_ON_NODE or STARPU_EXECUTE_ON_DATA to specify the node\n");
+		*xrank = _starpu_mpi_select_node(me, nb_nodes, descrs, nb_data, select_node_policy);
+		*do_execute = (me == *xrank);
+	}
+	else
+	{
+		_STARPU_MPI_DEBUG(100, "Inconsistent=%d - xrank=%d\n", inconsistent_execute, *xrank);
+		*do_execute = (me == *xrank);
+	}
+	_STARPU_MPI_DEBUG(100, "do_execute=%d\n", *do_execute);
+
+	*descrs_p = descrs;
+	*nb_data_p = nb_data;
+
+	_STARPU_TRACE_TASK_MPI_DECODE_END();
+	return 0;
+}
+
+/* Decode the varargs describing the task, perform the MPI transfers
+ * needed before execution, and, when the local node is the selected
+ * executing node, create (without submitting) the StarPU task.
+ *
+ * Returns <0 on decoding error, 1 when this node does not execute the
+ * task (*task is left untouched), 0 when *task has been created and is
+ * ready for submission. */
+static
+int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, struct starpu_task **task, int *xrank_p, struct starpu_data_descr **descrs_p, int *nb_data_p, va_list varg_list)
+{
+	int me, do_execute, xrank, nb_nodes;
+	int ret;
+	int i;
+	struct starpu_data_descr *descrs;
+	int nb_data;
+
+	_STARPU_MPI_LOG_IN();
+
+	starpu_mpi_comm_rank(comm, &me);
+	starpu_mpi_comm_size(comm, &nb_nodes);
+
+	/* Find out whether we are to execute the data because we own the data to be written to. */
+	ret = _starpu_mpi_task_decode_v(codelet, me, nb_nodes, &xrank, &do_execute, &descrs, &nb_data, varg_list);
+	if (ret < 0) return ret;
+
+	_STARPU_TRACE_TASK_MPI_PRE_START();
+	/* Send and receive data as requested */
+	for(i=0 ; i<nb_data ; i++)
+	{
+		_starpu_mpi_exchange_data_before_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, comm);
+	}
+
+	/* The descriptor array was allocated by the decode step: hand it to
+	 * the caller when requested, otherwise release it here. */
+	if (xrank_p) *xrank_p = xrank;
+	if (nb_data_p) *nb_data_p = nb_data;
+	if (descrs_p)
+		*descrs_p = descrs;
+	else
+		free(descrs);
+	_STARPU_TRACE_TASK_MPI_PRE_END();
+
+	if (do_execute == 0) return 1;
+	else
+	{
+		va_list varg_list_copy;
+		_STARPU_MPI_DEBUG(100, "Execution of the codelet %p (%s)\n", codelet, codelet?codelet->name:NULL);
+
+		*task = starpu_task_create();
+		/* cl_arg is built by _starpu_task_insert_create and must be
+		 * freed by StarPU together with the task */
+		(*task)->cl_arg_free = 1;
+
+		/* varg_list has already been consumed by the decode step, so
+		 * work on a fresh copy to fill in the task */
+		va_copy(varg_list_copy, varg_list);
+		_starpu_task_insert_create(codelet, task, varg_list_copy);
+		va_end(varg_list_copy);
+
+		return 0;
+	}
+}
+
+/* Post-execution step shared by task_insert and task_post_build: send
+ * written data back to their owners, invalidate/flush local copies as
+ * required, and release the descriptor array allocated while decoding
+ * the argument list. Always returns 0. */
+int _starpu_mpi_task_postbuild_v(MPI_Comm comm, int xrank, int do_execute, struct starpu_data_descr *descrs, int nb_data)
+{
+	int me, i;
+
+	_STARPU_TRACE_TASK_MPI_POST_START();
+	starpu_mpi_comm_rank(comm, &me);
+
+	for(i=0 ; i<nb_data ; i++)
+	{
+		_starpu_mpi_exchange_data_after_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, comm);
+		_starpu_mpi_clear_data_after_execution(descrs[i].handle, descrs[i].mode, me, do_execute);
+	}
+
+	/* descrs was allocated by _starpu_mpi_task_decode_v; ownership ends here */
+	free(descrs);
+
+	_STARPU_TRACE_TASK_MPI_POST_END();
+	_STARPU_MPI_LOG_OUT();
+	return 0;
+}
+
+/* Common implementation behind starpu_mpi_task_insert() and the
+ * deprecated starpu_mpi_insert_task(): build the task, submit it when
+ * this node is the executing one, then run the post-build data
+ * exchanges. Returns the post-build status, or a negative value when
+ * decoding the arguments failed. */
+static
+int _starpu_mpi_task_insert_v(MPI_Comm comm, struct starpu_codelet *codelet, va_list varg_list)
+{
+	struct starpu_task *task;
+	int ret;
+	int xrank;
+	int do_execute = 0;
+	struct starpu_data_descr *descrs;
+	int nb_data;
+
+	ret = _starpu_mpi_task_build_v(comm, codelet, &task, &xrank, &descrs, &nb_data, varg_list);
+	if (ret < 0) return ret;
+
+	/* ret == 0 means the local node executes the task */
+	if (ret == 0)
+	{
+		do_execute = 1;
+		ret = starpu_task_submit(task);
+
+		if (STARPU_UNLIKELY(ret == -ENODEV))
+		{
+			/* No worker can execute this codelet: report it and
+			 * reclaim the task ourselves */
+			_STARPU_MSG("submission of task %p with codelet %p failed (symbol `%s') (err: ENODEV)\n",
+				    task, task->cl,
+				    (codelet == NULL) ? "none" :
+				    task->cl->name ? task->cl->name :
+				    (task->cl->model && task->cl->model->symbol)?task->cl->model->symbol:"none");
+
+			task->destroy = 0;
+			starpu_task_destroy(task);
+		}
+	}
+
+	int val = _starpu_mpi_task_postbuild_v(comm, xrank, do_execute, descrs, nb_data);
+
+	if (ret == 0 && pre_submit_hook)
+		pre_submit_hook(task);
+
+	return val;
+}
+
+/* Public entry point: insert a task whose data may live on remote MPI
+ * nodes; the actual work is done in _starpu_mpi_task_insert_v(). */
+int starpu_mpi_task_insert(MPI_Comm comm, struct starpu_codelet *codelet, ...)
+{
+	int ret;
+	va_list args;
+
+	va_start(args, codelet);
+	ret = _starpu_mpi_task_insert_v(comm, codelet, args);
+	va_end(args);
+
+	return ret;
+}
+
+/* Old-style name for starpu_mpi_task_insert(); behaves identically.
+ * NOTE(review): presumably kept for backward compatibility with
+ * earlier StarPU-MPI releases -- confirm against the public headers. */
+int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
+{
+	int ret;
+	va_list args;
+
+	va_start(args, codelet);
+	ret = _starpu_mpi_task_insert_v(comm, codelet, args);
+	va_end(args);
+
+	return ret;
+}
+
+/* Build (without submitting) the task described by the argument list.
+ * Returns NULL when the calling node is not the executing node. */
+struct starpu_task *starpu_mpi_task_build(MPI_Comm comm, struct starpu_codelet *codelet, ...)
+{
+	struct starpu_task *task;
+	int status;
+	va_list args;
+
+	va_start(args, codelet);
+	status = _starpu_mpi_task_build_v(comm, codelet, &task, NULL, NULL, NULL, args);
+	va_end(args);
+
+	STARPU_ASSERT(status >= 0);
+	return (status > 0) ? NULL : task;
+}
+
+/* Companion of starpu_mpi_task_build(): re-decode the same argument
+ * list to recover the data descriptors and the executing node, then run
+ * the post-execution data exchanges. Must be called with the same
+ * arguments as the matching starpu_mpi_task_build() call. */
+int starpu_mpi_task_post_build(MPI_Comm comm, struct starpu_codelet *codelet, ...)
+{
+	int xrank, do_execute;
+	int ret, me, nb_nodes;
+	va_list varg_list;
+	struct starpu_data_descr *descrs;
+	int nb_data;
+
+	starpu_mpi_comm_rank(comm, &me);
+	starpu_mpi_comm_size(comm, &nb_nodes);
+
+	va_start(varg_list, codelet);
+	/* Find out whether we are to execute the data because we own the data to be written to. */
+	ret = _starpu_mpi_task_decode_v(codelet, me, nb_nodes, &xrank, &do_execute, &descrs, &nb_data, varg_list);
+	va_end(varg_list);
+	if (ret < 0) return ret;
+
+	return _starpu_mpi_task_postbuild_v(comm, xrank, do_execute, descrs, nb_data);
+}
+
+/* Per-contributor state threaded through the reduction callbacks of
+ * starpu_mpi_redux_data(). */
+struct _starpu_mpi_redux_data_args
+{
+	starpu_data_handle_t data_handle;	/* data being reduced */
+	starpu_data_handle_t new_handle;	/* temporary handle receiving the remote contribution */
+	int tag;				/* MPI tag of data_handle */
+	int node;				/* rank of the contributing node */
+	MPI_Comm comm;
+	struct starpu_task *taskB;		/* reduction task, submitted once new_handle is received */
+};
+
+/* Intentionally empty kernel: used by the codelets below, which only
+ * exist to create dependencies on a data handle. */
+void _starpu_mpi_redux_data_dummy_func(STARPU_ATTRIBUTE_UNUSED void *buffers[], STARPU_ATTRIBUTE_UNUSED void *cl_arg)
+{
+}
+
+/* Dummy cost function for simgrid */
+static double cost_function(struct starpu_task *task STARPU_ATTRIBUTE_UNUSED, unsigned nimpl STARPU_ATTRIBUTE_UNUSED)
+{
+	/* small non-zero constant so simulated execution takes some time */
+	return 0.000001;
+}
+/* Performance model shared by the dummy codelets below */
+static struct starpu_perfmodel dumb_model =
+{
+	.type		= STARPU_COMMON,
+	.cost_function	= cost_function
+};
+
+/* Codelet reading a handle: submitted as "task A" in
+ * starpu_mpi_redux_data() so its callback runs only after the handle's
+ * last read access. */
+static
+struct starpu_codelet _starpu_mpi_redux_data_read_cl =
+{
+	.cpu_funcs = {_starpu_mpi_redux_data_dummy_func},
+	.cuda_funcs = {_starpu_mpi_redux_data_dummy_func},
+	.opencl_funcs = {_starpu_mpi_redux_data_dummy_func},
+	.nbuffers = 1,
+	.modes = {STARPU_R},
+	.model = &dumb_model,
+	.name = "_starpu_mpi_redux_data_read_cl"
+};
+
+/* Codelet reading and writing a handle: submitted as "task C" in
+ * starpu_mpi_redux_data() to replug the reduction into the implicit
+ * dependency chain of the handle. */
+struct starpu_codelet _starpu_mpi_redux_data_readwrite_cl =
+{
+	.cpu_funcs = {_starpu_mpi_redux_data_dummy_func},
+	.cuda_funcs = {_starpu_mpi_redux_data_dummy_func},
+	.opencl_funcs = {_starpu_mpi_redux_data_dummy_func},
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
+	.model = &dumb_model,
+	/* fixed: debug name previously said "write_cl", inconsistent with
+	 * the variable name */
+	.name = "_starpu_mpi_redux_data_readwrite_cl"
+};
+
+/* Called once the remote contribution has landed in new_handle: plug
+ * it into the reduction task B, submit B, and schedule the temporary
+ * handle for unregistration. Frees the args structure. */
+static
+void _starpu_mpi_redux_data_detached_callback(void *arg)
+{
+	struct _starpu_mpi_redux_data_args *args = (struct _starpu_mpi_redux_data_args *) arg;
+
+	/* buffer 1 of taskB is the freshly received contribution */
+	STARPU_TASK_SET_HANDLE(args->taskB, args->new_handle, 1);
+	int ret = starpu_task_submit(args->taskB);
+	STARPU_ASSERT(ret == 0);
+
+	starpu_data_unregister_submit(args->new_handle);
+	free(args);
+}
+
+/* Callback of "task A": allocate a temporary handle shaped like the
+ * reduced data and post a detached receive (with sequential consistency
+ * disabled) for the remote contribution. */
+static
+void _starpu_mpi_redux_data_recv_callback(void *callback_arg)
+{
+	struct _starpu_mpi_redux_data_args *args = (struct _starpu_mpi_redux_data_args *) callback_arg;
+	starpu_data_register_same(&args->new_handle, args->data_handle);
+
+	starpu_mpi_irecv_detached_sequential_consistency(args->new_handle, args->node, args->tag, args->comm, _starpu_mpi_redux_data_detached_callback, args, 0);
+}
+
+/* TODO: this should rather be implicitly called by starpu_mpi_task_insert when
+ * a data previously accessed in REDUX mode gets accessed in R mode. */
+/* Reduce the per-node contributions of data_handle onto its owner:
+ * the owner receives one contribution per other node and combines them
+ * with the handle's redux codelet; every other node sends its
+ * contribution and reinitializes its local copy with the init codelet.
+ * Requires the handle's rank and tag to have been set with
+ * starpu_mpi_data_register(). Blocks until all submitted work is done
+ * (see FIXME at the end). */
+void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
+{
+	int me, rank, tag, nb_nodes;
+
+	rank = starpu_mpi_data_get_rank(data_handle);
+	tag = starpu_mpi_data_get_tag(data_handle);
+	if (rank == -1)
+	{
+		_STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register\n");
+	}
+	if (tag == -1)
+	{
+		_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
+	}
+
+	starpu_mpi_comm_rank(comm, &me);
+	starpu_mpi_comm_size(comm, &nb_nodes);
+
+	_STARPU_MPI_DEBUG(1, "Doing reduction for data %p on node %d with %d nodes ...\n", data_handle, rank, nb_nodes);
+
+	// need to count how many nodes have the data in redux mode
+	if (me == rank)
+	{
+		/* Owner side: collect one contribution from every other node */
+		int i, j=0;
+		struct starpu_task *taskBs[nb_nodes];
+
+		for(i=0 ; i<nb_nodes ; i++)
+		{
+			if (i != rank)
+			{
+				/* We need to make sure all is
+				 * executed after data_handle finished
+				 * its last read access, we hence do
+				 * the following:
+				 * - submit an empty task A reading
+				 * data_handle whose callback submits
+				 * the mpi comm with sequential
+				 * consistency set to 0, whose
+				 * callback submits the redux_cl task
+				 * B with sequential consistency set
+				 * to 0,
+				 * - submit an empty task C reading
+				 * and writing data_handle and
+				 * depending on task B, just to replug
+				 * with implicit data dependencies
+				 * with tasks inserted after this
+				 * reduction.
+				 */
+
+				struct _starpu_mpi_redux_data_args *args;
+				_STARPU_MPI_MALLOC(args, sizeof(struct _starpu_mpi_redux_data_args));
+				args->data_handle = data_handle;
+				args->tag = tag;
+				args->node = i;
+				args->comm = comm;
+
+				// We need to create taskB early as
+				// taskC declares a dependancy on it
+				args->taskB = starpu_task_create();
+				args->taskB->cl = args->data_handle->redux_cl;
+				args->taskB->sequential_consistency = 0;
+				STARPU_TASK_SET_HANDLE(args->taskB, args->data_handle, 0);
+				taskBs[j] = args->taskB; j++;
+
+				// Submit taskA
+				starpu_task_insert(&_starpu_mpi_redux_data_read_cl,
+						   STARPU_R, data_handle,
+						   STARPU_CALLBACK_WITH_ARG, _starpu_mpi_redux_data_recv_callback, args,
+						   0);
+			}
+		}
+
+		// Submit taskC which depends on all taskBs created
+		struct starpu_task *taskC = starpu_task_create();
+		taskC->cl = &_starpu_mpi_redux_data_readwrite_cl;
+		STARPU_TASK_SET_HANDLE(taskC, data_handle, 0);
+		starpu_task_declare_deps_array(taskC, j, taskBs);
+		int ret = starpu_task_submit(taskC);
+		STARPU_ASSERT(ret == 0);
+	}
+	else
+	{
+		/* Contributor side: ship the local contribution to the owner,
+		 * then reset the local copy to the reduction's neutral element */
+		_STARPU_MPI_DEBUG(1, "Sending redux handle to %d ...\n", rank);
+		starpu_mpi_isend_detached(data_handle, rank, tag, comm, NULL, NULL);
+		starpu_task_insert(data_handle->init_cl, STARPU_W, data_handle, 0);
+	}
+	/* FIXME: In order to prevent simultaneous receive submissions
+	 * on the same handle, we need to wait that all the starpu_mpi
+	 * tasks are done before submitting next tasks. The current
+	 * version of the implementation does not support multiple
+	 * simultaneous receive requests on the same handle.*/
+	starpu_task_wait_for_all();
+
+}

+ 32 - 0
nmad/src/starpu_mpi_task_insert.h

@@ -0,0 +1,32 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2016, 2017  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_TASK_INSERT_H__
+#define __STARPU_MPI_TASK_INSERT_H__
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/* Decide which node executes a task touching 'data' in 'mode'; updates
+ * do_execute/inconsistent_execute/xrank accordingly */
+int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int *do_execute, int *inconsistent_execute, int *xrank);
+/* Transfer 'data' to/from the executing node before the task runs */
+void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int xrank, int do_execute, MPI_Comm comm);
+/* Post-execution exchanges and cleanup; frees the 'descrs' array */
+int _starpu_mpi_task_postbuild_v(MPI_Comm comm, int xrank, int do_execute, struct starpu_data_descr *descrs, int nb_data);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* __STARPU_MPI_TASK_INSERT_H__ */

+ 29 - 0
nmad/starpumpi-1.0.pc.in

@@ -0,0 +1,29 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2011, 2016  Université de Bordeaux
+# Copyright (C) 2010, 2011, 2012  CNRS
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: starpumpi
+Description: offers MPI support for heterogeneous multicore architecture
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@
+Libs: -L${libdir} -lstarpumpi-@STARPU_EFFECTIVE_VERSION@
+Libs.private: @LDFLAGS@ @LIBS@ @STARPU_EXPORTED_LIBS@
+Requires: starpu-1.0
+Requires.private:

+ 29 - 0
nmad/starpumpi-1.1.pc.in

@@ -0,0 +1,29 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2011, 2013, 2016  Université de Bordeaux
+# Copyright (C) 2010, 2011, 2012  CNRS
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: starpumpi
+Description: offers MPI support for heterogeneous multicore architecture
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@
+Libs: -L${libdir} -lstarpumpi-@STARPU_EFFECTIVE_VERSION@
+Libs.private: @LDFLAGS@ @LIBS@ @STARPU_EXPORTED_LIBS@
+Requires: starpu-1.1
+Requires.private:

+ 29 - 0
nmad/starpumpi-1.2.pc.in

@@ -0,0 +1,29 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2011, 2013, 2016  Université de Bordeaux
+# Copyright (C) 2010, 2011, 2012  CNRS
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: starpumpi
+Description: offers MPI support for heterogeneous multicore architecture
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@
+Libs: -L${libdir} -lstarpumpi-@STARPU_EFFECTIVE_VERSION@
+Libs.private: @LDFLAGS@ @LIBS@ @STARPU_EXPORTED_LIBS@
+Requires: starpu-1.2
+Requires.private:

+ 29 - 0
nmad/starpumpi-1.3.pc.in

@@ -0,0 +1,29 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2011, 2013, 2015-2016  Université de Bordeaux
+# Copyright (C) 2010, 2011, 2012  CNRS
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: starpumpi
+Description: offers MPI support for heterogeneous multicore architecture
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@
+Libs: -L${libdir} -lstarpumpi-@STARPU_EFFECTIVE_VERSION@
+Libs.private: @LDFLAGS@ @LIBS@ @STARPU_EXPORTED_LIBS@
+Requires: starpu-1.3
+Requires.private:

+ 1 - 0
nmad/tests/.gitignore

@@ -0,0 +1 @@
+/.deps

+ 246 - 0
nmad/tests/Makefile.am

@@ -0,0 +1,246 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2012, 2016  Université de Bordeaux
+# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+include $(top_srcdir)/starpu.mk
+
+CC=$(MPICC)
+CCLD=$(MPICC)
+
+if STARPU_HAVE_WINDOWS
+LOADER_BIN		=
+else
+loader_CPPFLAGS 	= 	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
+LOADER			=	loader
+# fixed: the loader is built in nmad/tests, not mpi/tests (leftover of
+# the port of the mpi/ test Makefile into nmad/)
+LOADER_BIN		=	$(abs_top_builddir)/nmad/tests/$(LOADER)
+loader_SOURCES		=	../../tests/loader.c
+endif
+
+if STARPU_QUICK_CHECK
+MPI			=	$(MPIEXEC) -np 2
+else
+MPI			=	$(MPIEXEC) -np 4
+endif
+
+if STARPU_HAVE_AM111
+TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
+LOG_COMPILER	 	=	$(MPI) $(LOADER_BIN)
+else
+TESTS_ENVIRONMENT 	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
+endif
+
+if !STARPU_SIMGRID
+if STARPU_MPI_CHECK
+TESTS			=	$(starpu_mpi_TESTS)
+endif
+endif
+
+check_PROGRAMS = $(LOADER) $(starpu_mpi_TESTS)
+
+BUILT_SOURCES =
+
+CLEANFILES = *.gcno *.gcda *.linkinfo
+
+EXTRA_DIST = 					\
+	user_defined_datatype_value.h
+
+examplebindir = $(libdir)/starpu/examples/mpi
+
+examplebin_PROGRAMS =
+
+if STARPU_USE_CUDA
+NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -I$(top_builddir)/include/ $(HWLOC_CFLAGS)
+
+.cu.cubin:
+	$(MKDIR_P) `dirname $@`
+	$(NVCC) -cubin $< -o $@ $(NVCCFLAGS)
+
+.cu.o:
+	$(NVCC) $< -c -o $@ $(NVCCFLAGS)
+endif
+
+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
+LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
+# fixed: point at the nmad/ copy of the headers/sources rather than the
+# original mpi/ tree, so the tests build against the library they link
+# (../src/libstarpumpi from nmad/src)
+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include -I$(top_srcdir)/nmad/include -I$(top_srcdir)/nmad/src -I$(top_srcdir)/src -I$(top_builddir)/src -I$(top_srcdir)/examples/
+AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS)
+
+########################
+# Unit testcases       #
+########################
+
+starpu_mpi_TESTS =				\
+	datatypes				\
+	pingpong				\
+	mpi_test				\
+	mpi_isend				\
+	mpi_irecv				\
+	mpi_isend_detached			\
+	mpi_irecv_detached			\
+	mpi_detached_tag			\
+	mpi_redux				\
+	ring					\
+	ring_sync				\
+	ring_sync_detached			\
+	ring_async				\
+	ring_async_implicit			\
+	block_interface				\
+	block_interface_pinned			\
+	cache					\
+	cache_disable				\
+	matrix					\
+	matrix2					\
+	insert_task				\
+	insert_task_cache			\
+	insert_task_compute			\
+	insert_task_sent_cache			\
+	insert_task_recv_cache			\
+	insert_task_block			\
+	insert_task_owner			\
+	insert_task_owner2			\
+	insert_task_owner_data			\
+	insert_task_count			\
+	multiple_send				\
+	mpi_scatter_gather			\
+	mpi_reduction				\
+	user_defined_datatype			\
+	comm
+
+noinst_PROGRAMS =				\
+	datatypes				\
+	pingpong				\
+	mpi_test				\
+	mpi_isend				\
+	mpi_irecv				\
+	mpi_isend_detached			\
+	mpi_irecv_detached			\
+	mpi_detached_tag			\
+	mpi_redux				\
+	ring					\
+	ring_sync				\
+	ring_sync_detached			\
+	ring_async				\
+	ring_async_implicit			\
+	block_interface				\
+	block_interface_pinned			\
+	cache					\
+	cache_disable				\
+	matrix					\
+	matrix2					\
+	insert_task				\
+	insert_task_cache			\
+	insert_task_compute			\
+	insert_task_sent_cache			\
+	insert_task_recv_cache			\
+	insert_task_block			\
+	insert_task_owner			\
+	insert_task_owner2			\
+	insert_task_owner_data			\
+	insert_task_count			\
+	multiple_send				\
+	mpi_scatter_gather			\
+	mpi_reduction				\
+	user_defined_datatype			\
+	comm
+
+mpi_isend_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+mpi_irecv_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+mpi_isend_detached_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+mpi_irecv_detached_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+mpi_detached_tag_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+mpi_redux_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+datatypes_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+pingpong_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+mpi_test_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+ring_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+ring_sync_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+ring_sync_detached_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+ring_async_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+ring_async_implicit_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+block_interface_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+block_interface_pinned_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+cache_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+cache_disable_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+matrix_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+matrix2_LDADD =					\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+insert_task_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+insert_task_cache_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+insert_task_compute_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+insert_task_sent_cache_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+insert_task_recv_cache_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+insert_task_block_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+insert_task_owner_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+insert_task_owner2_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+insert_task_owner_data_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+insert_task_count_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+multiple_send_LDADD =				\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+mpi_scatter_gather_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+mpi_reduction_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+user_defined_datatype_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+comm_LDADD =			\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+
+ring_SOURCES = ring.c
+ring_sync_SOURCES = ring_sync.c
+ring_sync_detached_SOURCES = ring_sync_detached.c
+ring_async_SOURCES = ring_async.c
+ring_async_implicit_SOURCES = ring_async_implicit.c
+insert_task_count_SOURCES = insert_task_count.c
+if STARPU_USE_CUDA
+ring_SOURCES += ring_kernel.cu
+ring_sync_SOURCES += ring_kernel.cu
+ring_sync_detached_SOURCES += ring_kernel.cu
+ring_async_SOURCES += ring_kernel.cu
+ring_async_implicit_SOURCES += ring_kernel.cu
+insert_task_count_SOURCES += ring_kernel.cu
+endif
+mpi_reduction_SOURCES = mpi_reduction.c
+mpi_reduction_SOURCES += mpi_reduction_kernels.c
+user_defined_datatype_SOURCES = user_defined_datatype.c
+user_defined_datatype_SOURCES += $(top_srcdir)/examples/interface/complex_interface.c
+

+ 146 - 0
nmad/tests/block_interface.c

@@ -0,0 +1,146 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2014  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <stdlib.h>
+#include "helper.h"
+
+#define NITER	2048
+
+#define BIGSIZE	128
+#define SIZE	64
+
+/* Test that StarPU-MPI correctly transfers a block interface between a
+ * padded registration (inner SIZE^3 cube of a BIGSIZE^3 allocation on
+ * node 0) and a contiguous one (node 1): node 0 sends its block, node 1
+ * checks/modifies it and sends it back, node 0 verifies the result. */
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size < 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need at least 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	/* Node 0 will allocate a big block and only register an inner part of
+	 * it as the block data, Node 1 will allocate a block of small size and
+	 * register it directly. Node 0 and 1 will then exchange the content of
+	 * their blocks. */
+
+	float *block = NULL;
+	starpu_data_handle_t block_handle;
+
+	if (rank == 0)
+	{
+		block = calloc(BIGSIZE*BIGSIZE*BIGSIZE, sizeof(float));
+		assert(block);
+
+		/* fill the inner block */
+		unsigned i, j, k;
+		for (k = 0; k < SIZE; k++)
+		for (j = 0; j < SIZE; j++)
+		for (i = 0; i < SIZE; i++)
+		{
+			block[i + j*BIGSIZE + k*BIGSIZE*BIGSIZE] = 1.0f;
+		}
+
+		/* register with ld=BIGSIZE strides: only the SIZE^3 corner is the data */
+		starpu_block_data_register(&block_handle, 0,
+			(uintptr_t)block, BIGSIZE, BIGSIZE*BIGSIZE,
+			SIZE, SIZE, SIZE, sizeof(float));
+	}
+	else if (rank == 1)
+	{
+		block = calloc(SIZE*SIZE*SIZE, sizeof(float));
+		assert(block);
+
+		/* contiguous registration: leading dimensions match the block size */
+		starpu_block_data_register(&block_handle, 0,
+			(uintptr_t)block, SIZE, SIZE*SIZE,
+			SIZE, SIZE, SIZE, sizeof(float));
+	}
+
+	/* ranks >= 2 take no part in the exchange below */
+	if (rank == 0)
+	{
+		ret = starpu_mpi_send(block_handle, 1, 0x42, MPI_COMM_WORLD);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_send");
+
+		MPI_Status status;
+		ret = starpu_mpi_recv(block_handle, 1, 0x1337, MPI_COMM_WORLD, &status);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_recv");
+
+		/* check the content of the block */
+		ret = starpu_data_acquire(block_handle, STARPU_R);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+
+		unsigned i, j, k;
+		for (k = 0; k < SIZE; k++)
+		for (j = 0; j < SIZE; j++)
+		for (i = 0; i < SIZE; i++)
+		{
+			/* node 1 overwrote every element with 33.0f */
+			assert(block[i + j*BIGSIZE + k*BIGSIZE*BIGSIZE] == 33.0f);
+		}
+		starpu_data_release(block_handle);
+
+	}
+	else if (rank == 1)
+	{
+		MPI_Status status;
+		ret = starpu_mpi_recv(block_handle, 0, 0x42, MPI_COMM_WORLD, &status);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_recv");
+
+		/* check the content of the block and modify it */
+		ret = starpu_data_acquire(block_handle, STARPU_RW);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+
+		unsigned i, j, k;
+		for (k = 0; k < SIZE; k++)
+		for (j = 0; j < SIZE; j++)
+		for (i = 0; i < SIZE; i++)
+		{
+			assert(block[i + j*SIZE + k*SIZE*SIZE] == 1.0f);
+			block[i + j*SIZE + k*SIZE*SIZE] = 33.0f;
+		}
+		starpu_data_release(block_handle);
+
+		ret = starpu_mpi_send(block_handle, 0, 0x1337, MPI_COMM_WORLD);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_send");
+	}
+
+	FPRINTF(stdout, "Rank %d is done\n", rank);
+	fflush(stdout);
+
+	if (rank == 0 || rank == 1)
+	{
+		starpu_data_unregister(block_handle);
+		free(block);
+	}
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 150 - 0
nmad/tests/block_interface_pinned.c

@@ -0,0 +1,150 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <stdlib.h>
+#include "helper.h"
+
+#define NITER	2048
+
+#define BIGSIZE	64
+#define SIZE	64
+
+/* Exchange a StarPU block between ranks 0 and 1 using pinned (starpu_malloc)
+ * buffers.  Rank 0 registers an inner SIZE^3 sub-block of a BIGSIZE^3 buffer
+ * (non-contiguous data), rank 1 registers a contiguous SIZE^3 buffer; the
+ * block is sent 0 -> 1, modified, and sent back 1 -> 0. */
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size < 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need at least 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	/* Node 0 will allocate a big block and only register an inner part of
+	 * it as the block data, Node 1 will allocate a block of small size and
+	 * register it directly. Node 0 and 1 will then exchange the content of
+	 * their blocks. */
+
+	float *block;
+	starpu_data_handle_t block_handle;
+
+	if (rank == 0)
+	{
+		starpu_malloc((void **)&block,
+				BIGSIZE*BIGSIZE*BIGSIZE*sizeof(float));
+		memset(block, 0, BIGSIZE*BIGSIZE*BIGSIZE*sizeof(float));
+
+		/* fill the inner block */
+		unsigned i, j, k;
+		for (k = 0; k < SIZE; k++)
+		for (j = 0; j < SIZE; j++)
+		for (i = 0; i < SIZE; i++)
+		{
+			block[i + j*BIGSIZE + k*BIGSIZE*BIGSIZE] = 1.0f;
+		}
+
+		/* ld = BIGSIZE: the registered SIZE^3 block is strided inside
+		 * the BIGSIZE^3 allocation */
+		starpu_block_data_register(&block_handle, 0,
+			(uintptr_t)block, BIGSIZE, BIGSIZE*BIGSIZE,
+			SIZE, SIZE, SIZE, sizeof(float));
+	}
+	else if (rank == 1)
+	{
+		starpu_malloc((void **)&block,
+			SIZE*SIZE*SIZE*sizeof(float));
+		memset(block, 0, SIZE*SIZE*SIZE*sizeof(float));
+
+		starpu_block_data_register(&block_handle, 0,
+			(uintptr_t)block, SIZE, SIZE*SIZE,
+			SIZE, SIZE, SIZE, sizeof(float));
+	}
+
+	if (rank == 0)
+	{
+		MPI_Status status;
+
+		/* round-trip: send the inner block to rank 1 (tag 0x42), then
+		 * receive the modified copy back (tag 0x1337) */
+		ret = starpu_mpi_send(block_handle, 1, 0x42, MPI_COMM_WORLD);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_send");
+
+		ret = starpu_mpi_recv(block_handle, 1, 0x1337, MPI_COMM_WORLD, &status);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_recv");
+
+		/* check the content of the block */
+		ret = starpu_data_acquire(block_handle, STARPU_R);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+
+		unsigned i, j, k;
+		for (k = 0; k < SIZE; k++)
+		for (j = 0; j < SIZE; j++)
+		for (i = 0; i < SIZE; i++)
+		{
+			assert(block[i + j*BIGSIZE + k*BIGSIZE*BIGSIZE] == 33.0f);
+		}
+		starpu_data_release(block_handle);
+
+	}
+	else if (rank == 1)
+	{
+		MPI_Status status;
+
+		ret = starpu_mpi_recv(block_handle, 0, 0x42, MPI_COMM_WORLD, &status);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_recv");
+
+		/* check the content of the block and modify it */
+		ret = starpu_data_acquire(block_handle, STARPU_RW);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+
+		unsigned i, j, k;
+		for (k = 0; k < SIZE; k++)
+		for (j = 0; j < SIZE; j++)
+		for (i = 0; i < SIZE; i++)
+		{
+			assert(block[i + j*SIZE + k*SIZE*SIZE] == 1.0f);
+			block[i + j*SIZE + k*SIZE*SIZE] = 33.0f;
+		}
+		starpu_data_release(block_handle);
+
+		ret = starpu_mpi_send(block_handle, 0, 0x1337, MPI_COMM_WORLD);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_send");
+
+	}
+
+	/* ranks >= 2 never registered anything, so only 0 and 1 clean up */
+	if (rank == 0 || rank == 1)
+	{
+		starpu_data_unregister(block_handle);
+		starpu_free(block);
+	}
+
+	FPRINTF(stdout, "Rank %d is done\n", rank);
+	fflush(stdout);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 110 - 0
nmad/tests/cache.c

@@ -0,0 +1,110 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+#include <starpu_mpi_cache.h>
+
+/* Empty CPU kernel: this test only exercises data transfers and the
+ * communication cache, not computation. */
+void func_cpu(STARPU_ATTRIBUTE_UNUSED void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+}
+
+/* One single-buffer codelet per access mode, so the cache can be driven
+ * with R (cacheable), W and RW (both invalidate remote copies) accesses. */
+struct starpu_codelet mycodelet_r =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 1,
+	.modes = {STARPU_R}
+};
+
+struct starpu_codelet mycodelet_w =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 1,
+	.modes = {STARPU_W}
+};
+
+struct starpu_codelet mycodelet_rw =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 1,
+	.modes = {STARPU_RW}
+};
+
+/* Run one task on node 1 that accesses 'data' with 'mode', then check whether
+ * a received copy of 'data' sits in the communication cache.  'in_cache' is
+ * the expected state; the assertion is only meaningful on the receiving
+ * rank (1), other ranks ignore the lookup result. */
+void test(struct starpu_codelet *codelet, enum starpu_data_access_mode mode, starpu_data_handle_t data, int rank, int in_cache)
+{
+	void *ptr;
+	int ret;
+
+	ret = starpu_mpi_insert_task(MPI_COMM_WORLD, codelet, mode, data, STARPU_EXECUTE_ON_NODE, 1, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+
+	/* internal API: returns NULL when no received copy is cached locally */
+	ptr = _starpu_mpi_cache_received_data_get(data);
+
+	if (rank == 1)
+	{
+	     if (in_cache)
+	     {
+		     STARPU_ASSERT_MSG(ptr != NULL, "Data should be in cache\n");
+	     }
+	     else
+	     {
+		     STARPU_ASSERT_MSG(ptr == NULL, "Data should NOT be in cache\n");
+	     }
+	}
+}
+
+/* Drive the StarPU-MPI transfer cache: access the same remote variable with
+ * different modes and verify whether the receiving node still holds a cached
+ * copy afterwards.  Skipped when the cache is disabled. */
+int main(int argc, char **argv)
+{
+	int rank;
+	int ret;
+	unsigned val;
+	starpu_data_handle_t data;
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+	if (starpu_mpi_cache_is_enabled() == 0) goto skip;
+
+	/* rank 0 owns the variable; other ranks register a placeholder */
+	if (rank == 0)
+		starpu_variable_data_register(&data, 0, (uintptr_t)&val, sizeof(unsigned));
+	else
+		starpu_variable_data_register(&data, -1, (uintptr_t)NULL, sizeof(unsigned));
+	starpu_mpi_data_register(data, 42, 0);
+	FPRINTF_MPI(stderr, "Registering data %p with tag %d and node %d\n", data, 42, 0);
+
+	// We use the same data with different access modes and we check if it is
+	// available or not in the cache
+	test(&mycodelet_r, STARPU_R, data, rank, 1);
+	test(&mycodelet_rw, STARPU_RW, data, rank, 0); /* write invalidates cached copies */
+	test(&mycodelet_r, STARPU_R, data, rank, 1);
+	test(&mycodelet_r, STARPU_R, data, rank, 1);
+	test(&mycodelet_w, STARPU_W, data, rank, 0);
+
+	FPRINTF(stderr, "Waiting ...\n");
+	starpu_task_wait_for_all();
+
+	starpu_data_unregister(data);
+
+skip:
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	return starpu_mpi_cache_is_enabled() == 0 ? STARPU_TEST_SKIPPED : 0;
+}

+ 93 - 0
nmad/tests/cache_disable.c

@@ -0,0 +1,93 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+#include <starpu_mpi_cache.h>
+
+/* Empty CPU kernel: the test only observes cache state, not computation. */
+void func_cpu(STARPU_ATTRIBUTE_UNUSED void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+}
+
+/* Read-only single-buffer codelet: STARPU_R accesses populate the cache. */
+struct starpu_codelet mycodelet_r =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 1,
+	.modes = {STARPU_R}
+};
+
+/* Check that disabling the StarPU-MPI cache at runtime flushes it and that
+ * subsequent receptions are no longer cached.  Skipped when the cache is
+ * disabled from the start. */
+int main(int argc, char **argv)
+{
+	int rank;
+	int ret;
+	unsigned val;
+	starpu_data_handle_t data;
+	void *ptr;
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+	if (starpu_mpi_cache_is_enabled() == 0) goto skip;
+
+	/* rank 0 owns the variable; other ranks register a placeholder */
+	if (rank == 0)
+		starpu_variable_data_register(&data, 0, (uintptr_t)&val, sizeof(unsigned));
+	else
+		starpu_variable_data_register(&data, -1, (uintptr_t)NULL, sizeof(unsigned));
+	starpu_mpi_data_register(data, 42, 0);
+	FPRINTF_MPI(stderr, "Registering data %p with tag %d and node %d\n", data, 42, 0);
+
+	ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r, STARPU_R, data, STARPU_EXECUTE_ON_NODE, 1, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+
+	/* the read on node 1 must have populated its cache */
+	ptr = _starpu_mpi_cache_received_data_get(data);
+	if (rank == 1)
+	{
+	     STARPU_ASSERT_MSG(ptr != NULL, "Data should be in cache\n");
+	}
+
+	// We clean the cache
+	starpu_mpi_cache_set(0);
+
+	// We check the data is no longer in the cache
+	ptr = _starpu_mpi_cache_received_data_get(data);
+	if (rank == 1)
+	{
+	     STARPU_ASSERT_MSG(ptr == NULL, "Data should NOT be in cache\n");
+	}
+
+	/* with the cache disabled, a new reception must not be cached either */
+	ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r, STARPU_R, data, STARPU_EXECUTE_ON_NODE, 1, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+	ptr = _starpu_mpi_cache_received_data_get(data);
+	if (rank == 1)
+	{
+	     STARPU_ASSERT_MSG(ptr == NULL, "Data should NOT be in cache\n");
+	}
+
+	FPRINTF(stderr, "Waiting ...\n");
+	starpu_task_wait_for_all();
+
+	starpu_data_unregister(data);
+
+skip:
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	return starpu_mpi_cache_is_enabled() == 0 ? STARPU_TEST_SKIPPED : 0;
+}

+ 112 - 0
nmad/tests/comm.c

@@ -0,0 +1,112 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+#include <starpu_mpi_cache.h>
+
+/* CPU kernel: doubles the integer variable passed as the (RW) buffer. */
+void func_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	int *value = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	FPRINTF_MPI(stderr, "Executing codelet with value %d\n", *value);
+	*value = *value * 2;
+}
+
+struct starpu_codelet mycodelet =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 1,
+	.modes = {STARPU_RW}
+};
+
+/* Split MPI_COMM_WORLD into two sub-communicators (even/odd world ranks) and
+ * run a send/recv plus an insert_task on the sub-communicator, checking that
+ * StarPU-MPI works on communicators other than MPI_COMM_WORLD. */
+int main(int argc, char **argv)
+{
+	int size;
+	int color;
+	MPI_Comm newcomm;
+	int rank, newrank;
+	int ret;
+	unsigned val = 42;
+	starpu_data_handle_t data;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size < 4)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need at least 4 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	color = rank%2;
+	MPI_Comm_split(MPI_COMM_WORLD, color, rank, &newcomm);
+	MPI_Comm_rank(newcomm, &newrank);
+	FPRINTF_MPI(stderr, "[%d] color %d\n", newrank, color);
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	/* the root of each sub-communicator owns the variable */
+	if (newrank == 0)
+	{
+		val = rank+1;
+		starpu_variable_data_register(&data, 0, (uintptr_t)&val, sizeof(val));
+	}
+	else
+		starpu_variable_data_register(&data, -1, (uintptr_t)NULL, sizeof(unsigned));
+	starpu_mpi_data_register_comm(data, 42, 0, newcomm);
+	FPRINTF_MPI(stderr, "[%d] Registering data %p with tag %d and node %d\n", newrank, data, 42, 0);
+
+	/* NOTE(review): every newrank != 0 enters the receive branch, but the
+	 * root only sends to newrank 1 — with more than 2 ranks per
+	 * sub-communicator the extra ranks would block; the test assumes
+	 * exactly 4 processes. */
+	if (newrank == 0)
+	{
+		FPRINTF_MPI(stderr, "[%d] sending %d\n", newrank, rank);
+		MPI_Send(&rank, 1, MPI_INT, 1, 10, newcomm);
+		starpu_mpi_send(data, 1, 42, newcomm);
+	}
+	else
+	{
+		int x;
+		/* MPI mandates MPI_STATUS_IGNORE (not NULL) when the status is unwanted */
+		MPI_Recv(&x, 1, MPI_INT, 0, 10, newcomm, MPI_STATUS_IGNORE);
+		FPRINTF_MPI(stderr, "[%d] received %d\n", newrank, x);
+		starpu_mpi_recv(data, 0, 42, newcomm, MPI_STATUS_IGNORE);
+	}
+
+	starpu_mpi_insert_task(newcomm, &mycodelet,
+			       STARPU_RW, data,
+			       STARPU_EXECUTE_ON_NODE, 1,
+			       0);
+
+	FPRINTF_MPI(stderr, "Waiting ...\n");
+	starpu_task_wait_for_all();
+
+	starpu_data_unregister(data);
+	if (newrank == 0)
+	{
+		FPRINTF_MPI(stderr, "[%d] new value %u\n", newrank, val);
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+	MPI_Finalize();
+	return 0;
+}

+ 333 - 0
nmad/tests/datatypes.c

@@ -0,0 +1,333 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2013, 2014, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <stdlib.h>
+#include "helper.h"
+
+typedef void (*check_func)(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error);
+
+/* The void interface carries no payload: completing the exchange at all is
+ * the success criterion, so nothing is compared and *error is never set. */
+void check_void(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
+{
+	FPRINTF_MPI(stderr, "Success with void value\n");
+}
+
+/* Compare the float stored behind the sent and received variable handles;
+ * set *error to 1 on mismatch. */
+void check_variable(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
+{
+	float *v_s, *v_r;
+
+	STARPU_ASSERT(starpu_variable_get_elemsize(handle_s) == starpu_variable_get_elemsize(handle_r));
+
+	v_s = (float *)starpu_variable_get_local_ptr(handle_s);
+	v_r = (float *)starpu_variable_get_local_ptr(handle_r);
+
+	if (*v_s == *v_r)
+	{
+		FPRINTF_MPI(stderr, "Success with variable value: %f == %f\n", *v_s, *v_r);
+	}
+	else
+	{
+		*error = 1;
+		FPRINTF_MPI(stderr, "Error with variable value: %f != %f\n", *v_s, *v_r);
+	}
+}
+
+void check_vector(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
+{
+	int ret, i;
+	int nx;
+	int *v_r, *v_s;
+
+	STARPU_ASSERT(starpu_vector_get_elemsize(handle_s) == starpu_vector_get_elemsize(handle_r));
+	STARPU_ASSERT(starpu_vector_get_nx(handle_s) == starpu_vector_get_nx(handle_r));
+
+	nx = starpu_vector_get_nx(handle_r);
+	v_r = (int *)starpu_vector_get_local_ptr(handle_r);
+	v_s = (int *)starpu_vector_get_local_ptr(handle_s);
+
+	for(i=0 ; i<nx ; i++)
+	{
+		if (v_s[i] == v_r[i])
+		{
+			FPRINTF_MPI(stderr, "Success with vector[%d] value: %d == %d\n", i, v_s[i], v_r[i]);
+		}
+		else
+		{
+			*error = 1;
+			FPRINTF_MPI(stderr, "Error with vector[%d] value: %d != %d\n", i, v_s[i], v_r[i]);
+		}
+	}
+}
+
+/* Element-wise compare the sent and received char matrices (honouring the
+ * local leading dimension); set *error to 1 on any mismatch. */
+void check_matrix(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
+{
+	STARPU_ASSERT(starpu_matrix_get_elemsize(handle_s) == starpu_matrix_get_elemsize(handle_r));
+	STARPU_ASSERT(starpu_matrix_get_nx(handle_s) == starpu_matrix_get_nx(handle_r));
+	STARPU_ASSERT(starpu_matrix_get_ny(handle_s) == starpu_matrix_get_ny(handle_r));
+	STARPU_ASSERT(starpu_matrix_get_local_ld(handle_s) == starpu_matrix_get_local_ld(handle_r));
+
+	char *matrix_s = (char *)starpu_matrix_get_local_ptr(handle_s);
+	char *matrix_r = (char *)starpu_matrix_get_local_ptr(handle_r);
+
+	int nx = starpu_matrix_get_nx(handle_s);
+	int ny = starpu_matrix_get_ny(handle_s);
+	int ldy = starpu_matrix_get_local_ld(handle_s);
+
+	int x, y;
+
+	for(y=0 ; y<ny ; y++)
+		for(x=0 ; x<nx ; x++)
+		{
+			int index=(y*ldy)+x;
+			if (matrix_s[index] == matrix_r[index])
+			{
+				FPRINTF_MPI(stderr, "Success with matrix[%d,%d --> %d] value: %c == %c\n", x, y, index, matrix_s[index], matrix_r[index]);
+			}
+			else
+			{
+				*error = 1;
+				FPRINTF_MPI(stderr, "Error with matrix[%d,%d --> %d] value: %c != %c\n", x, y, index, matrix_s[index], matrix_r[index]);
+			}
+		}
+}
+
+/* Element-wise compare the sent and received float blocks (honouring both
+ * local leading dimensions ldy/ldz); set *error to 1 on any mismatch. */
+void check_block(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
+{
+	STARPU_ASSERT(starpu_block_get_elemsize(handle_s) == starpu_block_get_elemsize(handle_r));
+	STARPU_ASSERT(starpu_block_get_nx(handle_s) == starpu_block_get_nx(handle_r));
+	STARPU_ASSERT(starpu_block_get_ny(handle_s) == starpu_block_get_ny(handle_r));
+	STARPU_ASSERT(starpu_block_get_nz(handle_s) == starpu_block_get_nz(handle_r));
+	STARPU_ASSERT(starpu_block_get_local_ldy(handle_s) == starpu_block_get_local_ldy(handle_r));
+	STARPU_ASSERT(starpu_block_get_local_ldz(handle_s) == starpu_block_get_local_ldz(handle_r));
+
+	float *block_s = (float *)starpu_block_get_local_ptr(handle_s);
+	float *block_r = (float *)starpu_block_get_local_ptr(handle_r);
+
+	int nx = starpu_block_get_nx(handle_s);
+	int ny = starpu_block_get_ny(handle_s);
+	int nz = starpu_block_get_nz(handle_s);
+
+	int ldy = starpu_block_get_local_ldy(handle_s);
+	int ldz = starpu_block_get_local_ldz(handle_s);
+
+	int x, y, z;
+
+	for(z=0 ; z<nz ; z++)
+		for(y=0 ; y<ny ; y++)
+			for(x=0 ; x<nx ; x++)
+			{
+				int index=(z*ldz)+(y*ldy)+x;
+				if (block_s[index] == block_r[index])
+				{
+					FPRINTF_MPI(stderr, "Success with block[%d,%d,%d --> %d] value: %f == %f\n", x, y, z, index, block_s[index], block_r[index]);
+				}
+				else
+				{
+					*error = 1;
+					FPRINTF_MPI(stderr, "Error with block[%d,%d,%d --> %d] value: %f != %f\n", x, y, z, index, block_s[index], block_r[index]);
+				}
+			}
+}
+
+/* Ping-pong protocol for one data handle.
+ * Rank 0: send handle_s to 'node' (tag_s), receive the echo into handle_r
+ * (tag_r) and compare both with 'func'.
+ * Peer rank: receive into handle_s and send the SAME handle straight back;
+ * 'handle_r', 'error' and 'func' are unused on that side and may be NULL. */
+void send_recv_and_check(int rank, int node, starpu_data_handle_t handle_s, int tag_s, starpu_data_handle_t handle_r, int tag_r, int *error, check_func func)
+{
+	int ret;
+	MPI_Status status;
+
+	if (rank == 0)
+	{
+		ret = starpu_mpi_send(handle_s, node, tag_s, MPI_COMM_WORLD);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_send");
+		ret = starpu_mpi_recv(handle_r, node, tag_r, MPI_COMM_WORLD, &status);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_recv");
+
+		func(handle_s, handle_r, error);
+	}
+	else
+	{
+		/* echo back through the handle we received into */
+		ret = starpu_mpi_recv(handle_s, node, tag_s, MPI_COMM_WORLD, &status);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_recv");
+		ret = starpu_mpi_send(handle_s, node, tag_r, MPI_COMM_WORLD);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_send");
+	}
+}
+
+/* Round-trip every StarPU data interface (void, variable, vector, matrix,
+ * block) between ranks 0 and 1 and verify the received content matches what
+ * was sent.  Returns non-zero on rank 0 if any comparison failed. */
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+	int error=0;
+
+	int nx=3;
+	int ny=2;
+	int nz=4;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size < 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need at least 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	if (rank == 0)
+	{
+		/* handle [0] is the data sent, handle [1] receives the echo */
+		{
+			starpu_data_handle_t void_handle[2];
+			starpu_void_data_register(&void_handle[0]);
+			starpu_void_data_register(&void_handle[1]);
+
+			send_recv_and_check(rank, 1, void_handle[0], 0x42, void_handle[1], 0x1337, &error, check_void);
+
+			starpu_data_unregister(void_handle[0]);
+			starpu_data_unregister(void_handle[1]);
+		}
+		{
+			float v = 42.12;
+			starpu_data_handle_t variable_handle[2];
+			starpu_variable_data_register(&variable_handle[0], 0, (uintptr_t)&v, sizeof(v));
+			starpu_variable_data_register(&variable_handle[1], -1, (uintptr_t)NULL, sizeof(v));
+
+			send_recv_and_check(rank, 1, variable_handle[0], 0x42, variable_handle[1], 0x1337, &error, check_variable);
+
+			starpu_data_unregister(variable_handle[0]);
+			starpu_data_unregister(variable_handle[1]);
+		}
+
+		{
+			int vector[4] = {1, 2, 3, 4};
+			starpu_data_handle_t vector_handle[2];
+
+			starpu_vector_data_register(&vector_handle[0], 0, (uintptr_t)vector, 4, sizeof(vector[0]));
+			starpu_vector_data_register(&vector_handle[1], -1, (uintptr_t)NULL, 4, sizeof(vector[0]));
+
+			send_recv_and_check(rank, 1, vector_handle[0], 0x43, vector_handle[1], 0x2337, &error, check_vector);
+
+			starpu_data_unregister(vector_handle[0]);
+			starpu_data_unregister(vector_handle[1]);
+		}
+
+		{
+			char *matrix, n='a';
+			int x, y;
+			starpu_data_handle_t matrix_handle[2];
+
+			/* 2D matrix: nx*ny elements (nz plays no role here) */
+			matrix = (char*)malloc(nx*ny*sizeof(char));
+			assert(matrix);
+			for(y=0 ; y<ny ; y++)
+			{
+				for(x=0 ; x<nx ; x++)
+				{
+					matrix[(y*nx)+x] = n++;
+				}
+			}
+
+			starpu_matrix_data_register(&matrix_handle[0], 0, (uintptr_t)matrix, nx, nx, ny, sizeof(char));
+			starpu_matrix_data_register(&matrix_handle[1], -1, (uintptr_t)NULL, nx, nx, ny, sizeof(char));
+
+			send_recv_and_check(rank, 1, matrix_handle[0], 0x75, matrix_handle[1], 0x8555, &error, check_matrix);
+
+			starpu_data_unregister(matrix_handle[0]);
+			starpu_data_unregister(matrix_handle[1]);
+			free(matrix);
+		}
+
+		{
+			float *block, n=1.0;
+			int x, y, z;
+			starpu_data_handle_t block_handle[2];
+
+			block = (float*)malloc(nx*ny*nz*sizeof(float));
+			assert(block);
+			for(z=0 ; z<nz ; z++)
+			{
+				for(y=0 ; y<ny ; y++)
+				{
+					for(x=0 ; x<nx ; x++)
+					{
+						block[(z*nx*ny)+(y*nx)+x] = n++;
+					}
+				}
+			}
+
+			starpu_block_data_register(&block_handle[0], 0, (uintptr_t)block, nx, nx*ny, nx, ny, nz, sizeof(float));
+			starpu_block_data_register(&block_handle[1], -1, (uintptr_t)NULL, nx, nx*ny, nx, ny, nz, sizeof(float));
+
+			send_recv_and_check(rank, 1, block_handle[0], 0x73, block_handle[1], 0x8337, &error, check_block);
+
+			starpu_data_unregister(block_handle[0]);
+			starpu_data_unregister(block_handle[1]);
+			free(block);
+		}
+	}
+	else if (rank == 1)
+	{
+		/* mirror side: one placeholder handle per interface, echoing back */
+		{
+			starpu_data_handle_t void_handle;
+			starpu_void_data_register(&void_handle);
+			send_recv_and_check(rank, 0, void_handle, 0x42, NULL, 0x1337, NULL, NULL);
+			starpu_data_unregister(void_handle);
+		}
+		{
+			starpu_data_handle_t variable_handle;
+			starpu_variable_data_register(&variable_handle, -1, (uintptr_t)NULL, sizeof(float));
+			send_recv_and_check(rank, 0, variable_handle, 0x42, NULL, 0x1337, NULL, NULL);
+			starpu_data_unregister(variable_handle);
+		}
+
+		{
+			starpu_data_handle_t vector_handle;
+			starpu_vector_data_register(&vector_handle, -1, (uintptr_t)NULL, 4, sizeof(int));
+			send_recv_and_check(rank, 0, vector_handle, 0x43, NULL, 0x2337, NULL, NULL);
+			starpu_data_unregister(vector_handle);
+		}
+
+		{
+			starpu_data_handle_t matrix_handle;
+			starpu_matrix_data_register(&matrix_handle, -1, (uintptr_t)NULL, nx, nx, ny, sizeof(char));
+			send_recv_and_check(rank, 0, matrix_handle, 0x75, NULL, 0x8555, NULL, NULL);
+			starpu_data_unregister(matrix_handle);
+		}
+
+		{
+			starpu_data_handle_t block_handle;
+			starpu_block_data_register(&block_handle, -1, (uintptr_t)NULL, nx, nx*ny, nx, ny, nz, sizeof(float));
+			send_recv_and_check(rank, 0, block_handle, 0x73, NULL, 0x8337, NULL, NULL);
+			starpu_data_unregister(block_handle);
+		}
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return rank == 0 ? error : 0;
+}

+ 26 - 0
nmad/tests/helper.h

@@ -0,0 +1,26 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012, 2013, 2014, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <errno.h>
+
+/* Exit code recognised by the test harness as "skipped". */
+#define STARPU_TEST_SKIPPED 77
+
+/* Print unless STARPU_SSILENT is set. */
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+/* Like FPRINTF, but prefixes the message with the MPI rank and the calling
+ * function.  Note: no semicolon after while(0) — the caller supplies it, so
+ * the macro stays safe inside if/else. */
+#define FPRINTF_MPI(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) { \
+						int _disp_rank; MPI_Comm_rank(MPI_COMM_WORLD, &_disp_rank);       \
+						fprintf(ofile, "[%d][starpu_mpi][%s] " fmt , _disp_rank, __starpu_func__ ,## __VA_ARGS__); \
+						fflush(ofile); }} while(0)
+

+ 140 - 0
nmad/tests/insert_task.c

@@ -0,0 +1,140 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+/* CPU kernel: replaces the first (RW) variable with the average of both
+ * buffers; the second buffer is read-only. */
+void func_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	unsigned *x = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	unsigned *y = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
+
+	FPRINTF(stdout, "VALUES: %u %u\n", *x, *y);
+	*x = (*x + *y) / 2;
+}
+
+struct starpu_codelet mycodelet =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_R}
+};
+
+#define X     4
+#define Y     5
+
+/* Returns the MPI node number where data indexes index is */
+/* Ownership is determined by the row only; 'y' is deliberately unused. */
+int my_distrib(int x, int y, int nb_nodes)
+{
+	return x % nb_nodes;
+}
+
+
+/* Distribute an X x Y matrix of variables over the MPI nodes (row-cyclic via
+ * my_distrib) and run a few cross-node averaging tasks through
+ * starpu_mpi_insert_task, which moves data between owners automatically. */
+int main(int argc, char **argv)
+{
+	int rank, size, x, y;
+	int value=0, ret;
+	unsigned matrix[X][Y];
+	starpu_data_handle_t data_handles[X][Y];
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	/* each rank fills its local copy with distinct values */
+	for(x = 0; x < X; x++)
+	{
+		for (y = 0; y < Y; y++)
+		{
+			matrix[x][y] = (rank+1)*10 + value;
+			value++;
+		}
+	}
+#if 0
+	for(x = 0; x < X; x++)
+	{
+		FPRINTF(stdout, "[%d] ", rank);
+		for (y = 0; y < Y; y++)
+		{
+			FPRINTF(stdout, "%3d ", matrix[x][y]);
+		}
+		FPRINTF(stdout, "\n");
+	}
+#endif
+
+	for(x = 0; x < X; x++)
+	{
+		for (y = 0; y < Y; y++)
+		{
+			int mpi_rank = my_distrib(x, y, size);
+			if (mpi_rank == rank)
+			{
+				//FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
+				starpu_variable_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
+			}
+			else
+			{
+				/* I don't own that index, but will need it for my computations */
+				//FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
+				starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(unsigned));
+			}
+			if (data_handles[x][y])
+			{
+				/* tag (y*X)+x is unique because x < X */
+				starpu_mpi_data_register(data_handles[x][y], (y*X)+x, mpi_rank);
+			}
+		}
+	}
+
+	/* a few tasks mixing data owned by different nodes */
+	ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1][1], STARPU_R, data_handles[0][1], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+	ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+	ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0][1], STARPU_R, data_handles[0][0], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+	ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+
+	FPRINTF(stderr, "Waiting ...\n");
+	starpu_task_wait_for_all();
+
+	for(x = 0; x < X; x++)
+	{
+		for (y = 0; y < Y; y++)
+		{
+			if (data_handles[x][y])
+				starpu_data_unregister(data_handles[x][y]);
+		}
+	}
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+#if 0
+	for(x = 0; x < X; x++)
+	{
+		FPRINTF(stdout, "[%d] ", rank);
+		for (y = 0; y < Y; y++)
+		{
+			FPRINTF(stdout, "%3d ", matrix[x][y]);
+		}
+		FPRINTF(stdout, "\n");
+	}
+#endif
+
+	return 0;
+}

+ 162 - 0
nmad/tests/insert_task_block.c

@@ -0,0 +1,162 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+/* CPU kernel: sums every element of the matrix tile, then overwrites each
+ * element with that sum (the commented division would make it an average). */
+void func_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	unsigned *matrix = (unsigned *)STARPU_MATRIX_GET_PTR(descr[0]);
+	int nx = (int)STARPU_MATRIX_GET_NX(descr[0]);
+	int ny = (int)STARPU_MATRIX_GET_NY(descr[0]);
+	int ld = (int)STARPU_MATRIX_GET_LD(descr[0]);
+
+	int i, j;
+	unsigned sum=0;
+
+	for (i = 0; i < nx; i++)
+	{
+		for (j = 0; j < ny; j++)
+		{
+			sum += matrix[i+j*ld];
+		}
+	}
+	for (i = 0; i < nx; i++)
+	{
+		for (j = 0; j < ny; j++)
+		{
+			matrix[i+j*ld] = sum;///(nx*ny);
+		}
+	}
+}
+
+struct starpu_codelet mycodelet =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 1,
+	.modes = {STARPU_RW}
+};
+
+#define SIZE 6
+#define BLOCKS 3
+
+/* Returns the MPI node number where data indexes index is */
+/* Block ownership by block-row only; 'y' is deliberately unused. */
+int my_distrib(int x, int y, int nb_nodes)
+{
+	return x % nb_nodes;
+}
+
+
+/* Split a SIZE x SIZE matrix into BLOCKS x BLOCKS tiles distributed over the
+ * MPI nodes and run one summing task per tile through
+ * starpu_mpi_insert_task. */
+int main(int argc, char **argv)
+{
+	int rank, size, x, y;
+	int ret, value=0;
+	unsigned matrix[SIZE*SIZE];
+	starpu_data_handle_t data_handles[SIZE][SIZE];
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	/* each rank fills its local copy with distinct values */
+	for(x = 0; x < SIZE; x++)
+	{
+		for (y = 0; y < SIZE; y++)
+		{
+			matrix[x+y*SIZE] = rank*100 + value;
+			value++;
+		}
+	}
+#if 1
+	for(x = 0; x < SIZE; x++)
+	{
+		FPRINTF(stdout, "[%d] ", rank);
+		for (y = 0; y < SIZE; y++)
+		{
+			FPRINTF(stdout, "%3u ", matrix[x+y*SIZE]);
+		}
+		FPRINTF(stdout, "\n");
+	}
+#endif
+
+	/* register each (SIZE/BLOCKS)^2 tile, with ld = SIZE into the full matrix */
+	for(x = 0; x < BLOCKS ; x++)
+	{
+		for (y = 0; y < BLOCKS; y++)
+		{
+			int mpi_rank = my_distrib(x, y, size);
+			if (mpi_rank == rank)
+			{
+				//FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
+				starpu_matrix_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[((SIZE/BLOCKS)*x) + ((SIZE/BLOCKS)*y) * SIZE]),
+							    SIZE, SIZE/BLOCKS, SIZE/BLOCKS, sizeof(unsigned));
+			}
+			else
+			{
+				/* I don't own that index, but will need it for my computations */
+				//FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
+				starpu_matrix_data_register(&data_handles[x][y], -1, (uintptr_t)&(matrix[((SIZE/BLOCKS)*x) + ((SIZE/BLOCKS)*y) * SIZE]),
+							    SIZE, SIZE/BLOCKS, SIZE/BLOCKS, sizeof(unsigned));
+			}
+			if (data_handles[x][y])
+			{
+				starpu_mpi_data_register(data_handles[x][y], (y*BLOCKS)+x, mpi_rank);
+			}
+		}
+	}
+
+	for(x = 0; x < BLOCKS; x++)
+	{
+		for (y = 0; y < BLOCKS; y++)
+		{
+			ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
+						     STARPU_RW, data_handles[x][y],
+						     0);
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+		}
+	}
+
+	FPRINTF(stderr, "Waiting ...\n");
+	starpu_task_wait_for_all();
+
+	for(x = 0; x < BLOCKS; x++)
+	{
+		for (y = 0; y < BLOCKS; y++)
+		{
+			if (data_handles[x][y])
+				starpu_data_unregister(data_handles[x][y]);
+		}
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+#if 1
+	for(x = 0; x < SIZE; x++)
+	{
+		FPRINTF(stdout, "[%d] ", rank);
+		for (y = 0; y < SIZE; y++)
+		{
+			FPRINTF(stdout, "%3u ", matrix[x+y*SIZE]);
+		}
+		FPRINTF(stdout, "\n");
+	}
+#endif
+
+	return 0;
+}

+ 150 - 0
nmad/tests/insert_task_cache.c

@@ -0,0 +1,150 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012, 2013, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <common/config.h>
+#include <starpu.h>
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+#if !defined(STARPU_HAVE_SETENV)
+#warning setenv is not defined. Skipping test
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+
+/* No-op CPU kernel: this test only measures communication, not computation */
+void func_cpu(STARPU_ATTRIBUTE_UNUSED void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+}
+
+/* Two-buffer codelet: first handle read-write, second read-only */
+struct starpu_codelet mycodelet =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_R}
+};
+
+#define N     1000
+
+/* Returns the MPI node number where data indexes index is */
+int my_distrib(int x)
+{
+	/* Data i is owned by rank i (test is meant for >= 2 ranks) */
+	return x;
+}
+
+/* Runs one full StarPU session with STARPU_MPI_CACHE set to `enabled`
+ * ("0" or "1"): inserts 3 batches of 5 identical tasks (with a cache
+ * flush of data_handles[0] between the last two batches) and stores the
+ * per-destination communication volumes into `comm_amount` (one entry
+ * per rank, filled by starpu_mpi_comm_amounts_retrieve).
+ * `size` is currently unused inside the function. */
+void test_cache(int rank, int size, char *enabled, size_t *comm_amount)
+{
+	int i;
+	int ret;
+	unsigned v[2][N];
+	starpu_data_handle_t data_handles[2];
+
+	/* Must be set before starpu_mpi_init() so the runtime picks it up */
+	setenv("STARPU_MPI_CACHE", enabled, 1);
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	/* Register the two vectors; v's contents are never read, only moved */
+	for(i = 0; i < 2; i++)
+	{
+		int mpi_rank = my_distrib(i);
+		if (mpi_rank == rank)
+		{
+			//FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
+			starpu_vector_data_register(&data_handles[i], 0, (uintptr_t)&(v[i]), N, sizeof(unsigned));
+		}
+		else
+		{
+			/* I don't own that index, but will need it for my computations */
+			//FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
+			starpu_vector_data_register(&data_handles[i], -1, (uintptr_t)NULL, N, sizeof(unsigned));
+		}
+		starpu_mpi_data_register(data_handles[i], i, mpi_rank);
+	}
+
+	/* Batch 1: handle[1] must travel to handle[0]'s owner (cached after 1st) */
+	for(i = 0; i < 5; i++)
+	{
+		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0], STARPU_R, data_handles[1], 0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+	}
+
+	/* Batch 2: roles reversed, handle[0] travels the other way */
+	for(i = 0; i < 5; i++)
+	{
+		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1], STARPU_R, data_handles[0], 0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+	}
+
+	/* Flushing repeatedly is harmless; forces a re-send in the next batch */
+	for(i = 0; i < 5; i++)
+	{
+		starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[0]);
+	}
+
+	/* Batch 3: same as batch 2, after the flush */
+	for(i = 0; i < 5; i++)
+	{
+		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1], STARPU_R, data_handles[0], 0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+	}
+
+	starpu_task_wait_for_all();
+
+	for(i = 0; i < 2; i++)
+	{
+		starpu_data_unregister(data_handles[i]);
+	}
+
+	/* Snapshot the volumes before shutdown resets the counters */
+	starpu_mpi_comm_amounts_retrieve(comm_amount);
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+}
+
+/* Runs the cache scenario with the cache disabled then enabled, and
+ * checks on ranks 0 and 1 that disabling the cache multiplies the
+ * communication volume towards the peer by 5 (the batch repetition
+ * count in test_cache). Returns 0 on success. */
+int main(int argc, char **argv)
+{
+	int dst, rank, size;
+	int result=0;
+	size_t *comm_amount_with_cache;
+	size_t *comm_amount_without_cache;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	/* Needed for starpu_mpi_comm_amounts_retrieve() to report real figures */
+	setenv("STARPU_COMM_STATS", "1", 1);
+
+	comm_amount_with_cache = malloc(size * sizeof(size_t));
+	comm_amount_without_cache = malloc(size * sizeof(size_t));
+
+	/* STARPU_MPI_CACHE="0" disables the cache, so its figures belong in
+	 * the "without cache" buffer (the previous code had the two calls
+	 * storing into the opposite buffers). */
+	test_cache(rank, size, "0", comm_amount_without_cache);
+	test_cache(rank, size, "1", comm_amount_with_cache);
+
+	if (rank == 0 || rank == 1)
+	{
+		dst = (rank == 0) ? 1 : 0;
+		/* Without the cache every one of the 5 repetitions re-sends */
+		result = (comm_amount_without_cache[dst] == comm_amount_with_cache[dst] * 5);
+		fprintf(stderr, "Communication cache mechanism is %sworking\n", result?"":"NOT ");
+	}
+	else
+		result = 1;
+
+	free(comm_amount_without_cache);
+	free(comm_amount_with_cache);
+
+	MPI_Finalize();
+	return !result;
+}
+#endif

+ 142 - 0
nmad/tests/insert_task_compute.c

@@ -0,0 +1,142 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2013, 2014, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+/* CPU kernel: multiplies the first variable by the second, in place.
+ * _args packs the caller's MPI rank, used here only for tracing. */
+void func_cpu(void *descr[], void *_args)
+{
+	int rank;
+	int *x = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	int *y = (int *)STARPU_VARIABLE_GET_PTR(descr[1]);
+
+	starpu_codelet_unpack_args(_args, &rank);
+
+	/* NOTE(review): "%u" is used for int arguments; "%d" would match — confirm */
+	FPRINTF(stdout, "[%d] VALUES: %u %u\n", rank, *x, *y);
+	*x = *x * *y;
+}
+
+/* Two-buffer codelet: descr[0] read-write, descr[1] read-only */
+struct starpu_codelet mycodelet =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_R}
+};
+
+/* Runs mycodelet once through starpu_mpi_insert_task, either with explicit
+ * per-handle arguments (data_array == 0) or via STARPU_DATA_ARRAY
+ * (data_array == 1), forcing execution on `node`. Ranks 0 and 1 seed their
+ * variable from before[] and verify it against after[]; other ranks just
+ * participate. Returns 0 on success, non-zero on mismatch, -ENODEV when no
+ * CPU worker is available. */
+int test(int rank, int node, int *before, int *after, int data_array)
+{
+	int ok, ret, i, x[2];
+	starpu_data_handle_t data_handles[2];
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	if (starpu_cpu_worker_get_count() <= 0)
+	{
+		// If there is no cpu to execute the codelet, mpi will block trying to do the post-execution communication
+		ret = -ENODEV;
+		goto nodata;
+	}
+
+	FPRINTF_MPI(stderr, "Testing with data_array=%d and node=%d\n", data_array, node);
+
+	/* Variable i is owned by rank i; ranks > 1 register placeholder data */
+	for(i=0 ; i<2 ; i++)
+	{
+		if (rank <= 1)
+		{
+			x[i] = before[rank*2+i];
+			FPRINTF_MPI(stderr, "before computation x[%d] = %d\n", i, x[i]);
+		}
+		else
+			x[i] = rank*2+i;
+		if (rank == i)
+			starpu_variable_data_register(&data_handles[i], 0, (uintptr_t)&x[i], sizeof(int));
+		else
+			starpu_variable_data_register(&data_handles[i], -1, (uintptr_t)NULL, sizeof(int));
+		starpu_mpi_data_register(data_handles[i], i, i);
+	}
+
+	/* Both branches submit the same task; only the argument style differs */
+	switch(data_array)
+	{
+		case 0:
+		{
+			ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
+						     STARPU_RW, data_handles[0], STARPU_R, data_handles[1],
+						     STARPU_VALUE, &rank, sizeof(rank),
+						     STARPU_EXECUTE_ON_NODE, node, 0);
+			break;
+		}
+		case 1:
+		{
+			ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
+						     STARPU_DATA_ARRAY, data_handles, 2,
+						     STARPU_VALUE, &rank, sizeof(rank),
+						     STARPU_EXECUTE_ON_NODE, node, 0);
+			break;
+		}
+	}
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_task_insert");
+	starpu_task_wait_for_all();
+
+	/* Unregistering flushes the computed value back into x[] */
+	for(i=0; i<2; i++)
+	{
+		starpu_data_unregister(data_handles[i]);
+	}
+
+	ok = 1;
+	if (rank <= 1)
+	{
+		for(i=0; i<2; i++)
+		{
+			ok = ok && (x[i] == after[rank*2+i]);
+			FPRINTF_MPI(stderr, "after computation x[%d] = %d, should be %d\n", i, x[i], after[rank*2+i]);
+		}
+		FPRINTF_MPI(stderr, "result is %s\n", ok?"CORRECT":"NOT CORRECT");
+	}
+
+nodata:
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	/* ok is never read on the -ENODEV path, so it is safe uninitialized there */
+	return ret == -ENODEV ? ret : !ok;
+}
+
+/* Exercises test() for every combination of executing node (0/1) and
+ * argument style (explicit handles / STARPU_DATA_ARRAY). Stops at the
+ * first failure; -ENODEV (no CPU worker) maps to a skipped test. */
+int main(int argc, char **argv)
+{
+	int rank;
+	int ret;
+	int before[4] = {10, 20, 11, 22};
+	/* Expected values per executing node; identical since the kernel is
+	 * deterministic regardless of where it runs */
+	int after_node[2][4] = {{220, 20, 11, 22}, {220, 20, 11, 22}};
+	int node, data_array;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+	for(node=0 ; node<=1 ; node++)
+	{
+		for(data_array=0 ; data_array<=1 ; data_array++)
+		{
+			ret = test(rank, node, before, after_node[node], data_array);
+			/* Any non-zero result (including -ENODEV) aborts the sweep */
+			if (ret != 0) goto end;
+		}
+	}
+
+end:
+	MPI_Finalize();
+	return ret==-ENODEV?STARPU_TEST_SKIPPED:ret;
+}

+ 116 - 0
nmad/tests/insert_task_count.c

@@ -0,0 +1,116 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	32
+#else
+#  define NITER	2048
+#endif
+
+#ifdef STARPU_USE_CUDA
+extern void increment_cuda(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args);
+#endif
+
+/* CPU kernel: increments the integer token held in the 1-element vector */
+void increment_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	int *tokenptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	(*tokenptr)++;
+}
+
+/* Single read-write buffer; CUDA variant is used when built with CUDA */
+static struct starpu_codelet increment_cl =
+{
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {increment_cuda},
+#endif
+	.cpu_funcs = {increment_cpu},
+	.nbuffers = 1,
+	.modes = {STARPU_RW}
+};
+
+/* Increments a token owned by rank 1 NITER times, always executing on
+ * node 0: each task therefore transfers the token there and back.
+ * Odd iterations use a synchronous send (STARPU_SSEND) to exercise that
+ * path. After shutdown, rank 1 must see token == NITER and every other
+ * rank must still see its local 0. */
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+	int token = 0;
+	starpu_data_handle_t token_handle;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size < 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need at least 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	/* Rank 1 owns the token; everyone else registers a placeholder */
+	if (rank == 1)
+		starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(token));
+	else
+		starpu_vector_data_register(&token_handle, -1, (uintptr_t)NULL, 1, sizeof(token));
+	starpu_mpi_data_register(token_handle, 12, 1);
+
+	int nloops = NITER;
+	int loop;
+
+	FPRINTF_MPI(stderr, "Start with token value %d\n", token);
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		if (loop % 2)
+			starpu_mpi_insert_task(MPI_COMM_WORLD, &increment_cl,
+					       STARPU_RW|STARPU_SSEND, token_handle,
+					       STARPU_EXECUTE_ON_NODE, 0,
+					       0);
+		else
+			starpu_mpi_insert_task(MPI_COMM_WORLD, &increment_cl,
+					       STARPU_RW, token_handle,
+					       STARPU_EXECUTE_ON_NODE, 0,
+					       0);
+	}
+
+	starpu_task_wait_for_all();
+	/* Unregistering writes the final value back into token on rank 1 */
+	starpu_data_unregister(token_handle);
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	FPRINTF_MPI(stderr, "Final value for token %d\n", token);
+
+	MPI_Finalize();
+
+	if (rank == 1)
+	{
+		STARPU_ASSERT_MSG(token == nloops, "token==%d != expected_value==%d\n", token, nloops);
+	}
+	else
+	{
+		STARPU_ASSERT_MSG(token == 0, "token==%d != expected_value==0\n", token);
+
+	}
+
+	return 0;
+}

+ 169 - 0
nmad/tests/insert_task_owner.c

@@ -0,0 +1,169 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+/* CPU kernel: asserts it is running on the node the caller expected.
+ * _args packs the expected executing rank. */
+void func_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	int node;
+	int rank;
+
+	starpu_codelet_unpack_args(_args, &node);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	FPRINTF(stderr, "Expected node: %d - Actual node: %d\n", node, rank);
+
+	assert(node == rank);
+}
+
+/* One codelet per access-mode combination being tested below */
+struct starpu_codelet mycodelet_r_w =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_W}
+};
+
+struct starpu_codelet mycodelet_rw_r =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_R}
+};
+
+struct starpu_codelet mycodelet_rw_rw =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_RW}
+};
+
+struct starpu_codelet mycodelet_w_r =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 2,
+	.modes = {STARPU_W, STARPU_R}
+};
+
+struct starpu_codelet mycodelet_r_r =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_R}
+};
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size, err, node;
+	int x0=32, x1=23;
+	starpu_data_handle_t data_handlesx0;
+	starpu_data_handle_t data_handlesx1;
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (rank != 0 && rank != 1) goto end;
+
+	if (rank == 0)
+	{
+		starpu_variable_data_register(&data_handlesx0, 0, (uintptr_t)&x0, sizeof(x0));
+		starpu_variable_data_register(&data_handlesx1, -1, (uintptr_t)NULL, sizeof(int));
+	}
+	else if (rank == 1)
+	{
+		starpu_variable_data_register(&data_handlesx1, 0, (uintptr_t)&x1, sizeof(x1));
+		starpu_variable_data_register(&data_handlesx0, -1, (uintptr_t)NULL, sizeof(int));
+	}
+	starpu_mpi_data_register(data_handlesx0, 0, 0);
+	starpu_mpi_data_register(data_handlesx1, 1, 1);
+
+	node = starpu_mpi_data_get_rank(data_handlesx1);
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_w,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_R, data_handlesx0, STARPU_W, data_handlesx1,
+				     0);
+	assert(err == 0);
+
+	node = starpu_mpi_data_get_rank(data_handlesx0);
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_r,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_RW, data_handlesx0, STARPU_R, data_handlesx1,
+				     0);
+	assert(err == 0);
+
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_rw,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1,
+				     0);
+	assert(err == -EINVAL);
+
+	node = 1;
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_rw,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
+				     0);
+	assert(err == 0);
+
+	node = 0;
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_rw,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
+				     0);
+	assert(err == 0);
+
+	node = 0;
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_r,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_R, data_handlesx0, STARPU_R, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
+				     0);
+	assert(err == 0);
+
+	/* Here the value specified by the property STARPU_EXECUTE_ON_NODE is
+	   going to overwrite the node even though the data model clearly specifies
+	   which node is going to execute the codelet */
+	node = 0;
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_w,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_R, data_handlesx0, STARPU_W, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
+				     0);
+	assert(err == 0);
+
+	/* Here the value specified by the property STARPU_EXECUTE_ON_NODE is
+	   going to overwrite the node even though the data model clearly specifies
+	   which node is going to execute the codelet */
+	node = 0;
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_w_r,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_W, data_handlesx0, STARPU_R, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
+				     0);
+	assert(err == 0);
+
+	fprintf(stderr, "Waiting ...\n");
+	starpu_task_wait_for_all();
+	starpu_data_unregister(data_handlesx0);
+	starpu_data_unregister(data_handlesx1);
+
+end:
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	return 0;
+}
+

+ 126 - 0
nmad/tests/insert_task_owner2.c

@@ -0,0 +1,126 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012, 2013, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+/* CPU kernel over 4 variables: reads x0/x1, writes y = (x0+x1)*100,
+ * then overwrites x1, x2 and even the R-mode x0 (the x0 write is local
+ * to the executing node since descr[0] is only declared STARPU_R). */
+void func_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	int *x0 = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	int *x1 = (int *)STARPU_VARIABLE_GET_PTR(descr[1]);
+	int *x2 = (int *)STARPU_VARIABLE_GET_PTR(descr[2]);
+	int *y = (int *)STARPU_VARIABLE_GET_PTR(descr[3]);
+
+	//FPRINTF(stderr, "-------> CODELET VALUES: %d %d %d %d\n", *x0, *x1, *x2, *y);
+	//*x2 = 45;
+	//*y = 144;
+
+	/* x2 is write-only, so its incoming value is unspecified and not printed */
+	FPRINTF(stderr, "-------> CODELET VALUES: %d %d (x2) %d\n", *x0, *x1, *y);
+	*y = (*x0 + *x1) * 100;
+	*x1 = 12;
+	*x2 = 24;
+	*x0 = 36;
+	FPRINTF(stderr, "-------> CODELET VALUES: %d %d %d %d\n", *x0, *x1, *x2, *y);
+}
+
+/* Four buffers: x0 read-only, x1 read-write, x2 and y write-only */
+struct starpu_codelet mycodelet =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 4,
+	.modes = {STARPU_R, STARPU_RW, STARPU_W, STARPU_W}
+};
+
+/* Distributes x[0..2] on rank 0 and y on rank 1, forces the codelet to
+ * run on node 1, then gathers all four values back to node 0 and prints
+ * them there. No automated check: the output is inspected manually. */
+int main(int argc, char **argv)
+{
+	int rank, size, err;
+	int x[3], y=0;
+	int i, ret;
+	starpu_data_handle_t data_handles[4];
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (rank == 0)
+	{
+		/* Owner of x[0..2]; y lives elsewhere */
+		for(i=0 ; i<3 ; i++)
+		{
+			x[i] = 10*(i+1);
+			starpu_variable_data_register(&data_handles[i], 0, (uintptr_t)&x[i], sizeof(x[i]));
+		}
+		y = -1;
+		starpu_variable_data_register(&data_handles[3], -1, (uintptr_t)NULL, sizeof(int));
+	}
+	else if (rank == 1)
+	{
+		/* Owner of y; the x's live elsewhere */
+		for(i=0 ; i<3 ; i++)
+		{
+			x[i] = -1;
+			starpu_variable_data_register(&data_handles[i], -1, (uintptr_t)NULL, sizeof(int));
+		}
+		y=200;
+		starpu_variable_data_register(&data_handles[3], 0, (uintptr_t)&y, sizeof(int));
+	}
+	else
+	{
+		/* Other ranks own nothing but still need all four handles */
+		for(i=0 ; i<4 ; i++)
+			starpu_variable_data_register(&data_handles[i], -1, (uintptr_t)NULL, sizeof(int));
+	}
+	FPRINTF(stderr, "[%d][init] VALUES: %d %d %d %d\n", rank, x[0], x[1], x[2], y);
+
+	for(i=0 ; i<3 ; i++)
+	{
+		starpu_mpi_data_register(data_handles[i], i, 0);
+	}
+	starpu_mpi_data_register(data_handles[3], 3, 1);
+
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
+				     STARPU_R, data_handles[0], STARPU_RW, data_handles[1],
+				     STARPU_W, data_handles[2],
+				     STARPU_W, data_handles[3],
+				     STARPU_EXECUTE_ON_NODE, 1, 0);
+	STARPU_CHECK_RETURN_VALUE(err, "starpu_mpi_insert_task");
+	starpu_task_wait_for_all();
+
+	/* Pull every value onto node 0 and read it there via a local acquire */
+	int *values = malloc(4 * sizeof(int));
+	for(i=0 ; i<4 ; i++)
+	{
+		starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, data_handles[i], 0, NULL, NULL);
+		if (rank == 0)
+		{
+			starpu_data_acquire(data_handles[i], STARPU_R);
+			values[i] = *((int *)starpu_data_get_local_ptr(data_handles[i]));
+			starpu_data_release(data_handles[i]);
+		}
+		starpu_data_unregister(data_handles[i]);
+	}
+	if (rank == 0)
+	{
+		FPRINTF(stderr, "[%d][local ptr] VALUES: %d %d %d %d\n", rank, values[0], values[1], values[2], values[3]);
+	}
+        FPRINTF(stderr, "[%d][end] VALUES: %d %d %d %d\n", rank, x[0], x[1], x[2], y);
+
+	free(values);
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	return 0;
+}
+

+ 107 - 0
nmad/tests/insert_task_owner_data.c

@@ -0,0 +1,107 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+/* CPU kernel: increments the first variable and squares the second */
+void func_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	int *x0 = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	int *x1 = (int *)STARPU_VARIABLE_GET_PTR(descr[1]);
+
+	*x0 += 1;
+	*x1 *= *x1;
+}
+
+/* Both buffers read-write */
+struct starpu_codelet mycodelet =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_RW}
+};
+
+/* Checks STARPU_EXECUTE_ON_DATA: the task must execute on the owner of
+ * data_handles[1] (rank 1). x[0]=11 on rank 0 and x[1]=12 on rank 1;
+ * after the kernel, rank 0 gathers both values and expects 12 and 144.
+ * Returns EXIT_FAILURE on mismatch (checked on rank 0 only). */
+int main(int argc, char **argv)
+{
+	int rank, size, err;
+	int x[2];
+	int ret, i;
+	starpu_data_handle_t data_handles[2];
+	int values[2];
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	/* x[i] is owned by rank i; other ranks register placeholders */
+	if (rank == 0)
+	{
+		x[0] = 11;
+		starpu_variable_data_register(&data_handles[0], 0, (uintptr_t)&x[0], sizeof(x[0]));
+		starpu_variable_data_register(&data_handles[1], -1, (uintptr_t)NULL, sizeof(x[1]));
+	}
+	else if (rank == 1)
+	{
+		x[1] = 12;
+		starpu_variable_data_register(&data_handles[0], -1, (uintptr_t)NULL, sizeof(x[0]));
+		starpu_variable_data_register(&data_handles[1], 0, (uintptr_t)&x[1], sizeof(x[1]));
+	}
+	else
+	{
+		starpu_variable_data_register(&data_handles[0], -1, (uintptr_t)NULL, sizeof(x[0]));
+		starpu_variable_data_register(&data_handles[1], -1, (uintptr_t)NULL, sizeof(x[1]));
+	}
+
+	starpu_mpi_data_register(data_handles[0], 0, 0);
+	starpu_mpi_data_register(data_handles[1], 1, 1);
+
+	/* STARPU_EXECUTE_ON_DATA forces execution on data_handles[1]'s owner */
+	err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
+				     STARPU_RW, data_handles[0], STARPU_RW, data_handles[1],
+				     STARPU_EXECUTE_ON_DATA, data_handles[1],
+				     0);
+	assert(err == 0);
+	starpu_task_wait_for_all();
+
+	/* Bring both values to node 0 and read them through a local acquire */
+	for(i=0 ; i<2 ; i++)
+	{
+		starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, data_handles[i], 0, NULL, NULL);
+		if (rank == 0)
+		{
+			starpu_data_acquire(data_handles[i], STARPU_R);
+			values[i] = *((int *)starpu_data_get_local_ptr(data_handles[i]));
+			starpu_data_release(data_handles[i]);
+		}
+	}
+	ret = 0;
+	if (rank == 0)
+	{
+		FPRINTF(stderr, "[%d][local ptr] VALUES: %d %d\n", rank, values[0], values[1]);
+		if (values[0] != 12 || values[1] != 144)
+		{
+			ret = EXIT_FAILURE;
+		}
+	}
+
+	starpu_data_unregister(data_handles[0]);
+	starpu_data_unregister(data_handles[1]);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	return ret;
+}
+

+ 144 - 0
nmad/tests/insert_task_recv_cache.c

@@ -0,0 +1,144 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012, 2013, 2014, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <common/config.h>
+#include <starpu.h>
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+#if !defined(STARPU_HAVE_SETENV)
+#warning setenv is not defined. Skipping test
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+
+/* No-op CPU kernel: only the data transfers matter for this cache test */
+void func_cpu(STARPU_ATTRIBUTE_UNUSED void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+}
+
+/* Two-buffer codelet: first handle read-write, second read-only */
+struct starpu_codelet mycodelet =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_R}
+};
+
+#define N     1000
+
+/* Returns the MPI node number where data indexes index is */
+int my_distrib(int x)
+{
+	/* Data i is owned by rank i */
+	return x;
+}
+
+/* Runs one StarPU session with STARPU_MPI_CACHE set to `enabled`
+ * ("0"/"1"): two identical task insertions (second hits the cache when
+ * enabled), a flush of data_handles[1], then two more insertions.
+ * Communication volumes per destination are stored into `comm_amount`.
+ * `size` is currently unused inside the function. */
+void test_cache(int rank, int size, char *enabled, size_t *comm_amount)
+{
+	int i;
+	int ret;
+	unsigned v[2][N];
+	starpu_data_handle_t data_handles[2];
+
+	FPRINTF_MPI(stderr, "Testing with STARPU_MPI_CACHE=%s\n", enabled);
+	/* Must be set before starpu_mpi_init() to take effect */
+	setenv("STARPU_MPI_CACHE", enabled, 1);
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	for(i = 0; i < 2; i++)
+	{
+		int mpi_rank = my_distrib(i);
+		if (mpi_rank == rank)
+		{
+			starpu_vector_data_register(&data_handles[i], 0, (uintptr_t)&(v[i]), N, sizeof(unsigned));
+		}
+		else
+		{
+			/* I don't own that index, but will need it for my computations */
+			starpu_vector_data_register(&data_handles[i], -1, (uintptr_t)NULL, N, sizeof(unsigned));
+		}
+		starpu_mpi_data_register(data_handles[i], i, mpi_rank);
+	}
+
+	// We call starpu_mpi_insert_task twice, when the cache is enabled, the 1st time puts the
+	// data in the cache, the 2nd time allows to check the data is not sent again
+	for(i = 0; i < 2; i++)
+	{
+		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0], STARPU_R, data_handles[1], 0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+	}
+
+	// Flush the cache for data_handles[1] which has been sent from node1 to node0
+	starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[1]);
+
+	// Check again
+	for(i = 0; i < 2; i++)
+	{
+		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0], STARPU_R, data_handles[1], 0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+	}
+
+	starpu_task_wait_for_all();
+
+	for(i = 0; i < 2; i++)
+	{
+		starpu_data_unregister(data_handles[i]);
+	}
+
+	/* Snapshot the volumes before shutdown resets the counters */
+	starpu_mpi_comm_amounts_retrieve(comm_amount);
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+}
+
+/* Runs the receive-side cache scenario with the cache disabled then
+ * enabled, and checks on rank 1 that disabling the cache doubles the
+ * volume received from rank 0. Returns 0 on success. */
+int main(int argc, char **argv)
+{
+	int rank, size;
+	int result=0;
+	size_t *comm_amount_with_cache;
+	size_t *comm_amount_without_cache;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	/* Enable the statistics starpu_mpi_comm_amounts_retrieve relies on */
+	setenv("STARPU_COMM_STATS", "1", 1);
+	setenv("STARPU_MPI_CACHE_STATS", "1", 1);
+
+	comm_amount_with_cache = malloc(size * sizeof(size_t));
+	comm_amount_without_cache = malloc(size * sizeof(size_t));
+
+	/* STARPU_MPI_CACHE="0" disables the cache, so its figures belong in
+	 * the "without cache" buffer (the previous code stored them into the
+	 * opposite, misleadingly-named, buffers). */
+	test_cache(rank, size, "0", comm_amount_without_cache);
+	test_cache(rank, size, "1", comm_amount_with_cache);
+
+	if (rank == 1)
+	{
+		/* 4 insertions with one flush: 4 sends uncached vs 2 cached */
+		result = (comm_amount_without_cache[0] == comm_amount_with_cache[0] * 2);
+		/* %zu: the amounts are size_t (the previous %ld was mismatched) */
+		FPRINTF_MPI(stderr, "Communication cache mechanism is %sworking (with cache: %zu) (without cache: %zu)\n", result?"":"NOT ", comm_amount_with_cache[0], comm_amount_without_cache[0]);
+	}
+	else
+		result = 1;
+
+	free(comm_amount_without_cache);
+	free(comm_amount_with_cache);
+
+	MPI_Finalize();
+	return !result;
+}
+#endif

+ 150 - 0
nmad/tests/insert_task_sent_cache.c

@@ -0,0 +1,150 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012, 2013, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <common/config.h>
+#include <starpu.h>
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+#if !defined(STARPU_HAVE_SETENV)
+#warning setenv is not defined. Skipping test
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+
+/* No-op CPU kernel: only the data transfers matter for this cache test */
+void func_cpu(STARPU_ATTRIBUTE_UNUSED void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+}
+
+/* Two-buffer codelet: first handle read-write, second read-only */
+struct starpu_codelet mycodelet =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_R}
+};
+
+#define N     1000
+
+/* Returns the MPI node number where data indexes index is */
+int my_distrib(int x)
+{
+	/* Data i is owned by rank i */
+	return x;
+}
+
+/* Runs one StarPU session with STARPU_MPI_CACHE set to `enabled`
+ * ("0"/"1"): 3 batches of 5 identical task insertions with a flush of
+ * data_handles[0] between the last two, then stores the per-destination
+ * communication volumes into `comm_amount`.
+ * `size` is currently unused inside the function. */
+void test_cache(int rank, int size, char *enabled, size_t *comm_amount)
+{
+	int i;
+	int ret;
+	unsigned v[2][N];
+	starpu_data_handle_t data_handles[2];
+
+	/* Must be set before starpu_mpi_init() to take effect */
+	setenv("STARPU_MPI_CACHE", enabled, 1);
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	for(i = 0; i < 2; i++)
+	{
+		int mpi_rank = my_distrib(i);
+		if (mpi_rank == rank)
+		{
+			//FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
+			starpu_vector_data_register(&data_handles[i], 0, (uintptr_t)&(v[i]), N, sizeof(unsigned));
+		}
+		else
+		{
+			/* I don't own that index, but will need it for my computations */
+			//FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
+			starpu_vector_data_register(&data_handles[i], -1, (uintptr_t)NULL, N, sizeof(unsigned));
+		}
+		starpu_mpi_data_register(data_handles[i], i, mpi_rank);
+	}
+
+	/* Batch 1: handle[1] travels to handle[0]'s owner (cached after 1st) */
+	for(i = 0; i < 5; i++)
+	{
+		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0], STARPU_R, data_handles[1], 0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+	}
+
+	/* Batch 2: roles reversed */
+	for(i = 0; i < 5; i++)
+	{
+		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1], STARPU_R, data_handles[0], 0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+	}
+
+	/* Repeated flush is harmless; forces a re-send in batch 3 */
+	for(i = 0; i < 5; i++)
+	{
+		starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[0]);
+	}
+
+	/* Batch 3: same as batch 2, after the flush */
+	for(i = 0; i < 5; i++)
+	{
+		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1], STARPU_R, data_handles[0], 0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+	}
+
+	starpu_task_wait_for_all();
+
+	for(i = 0; i < 2; i++)
+	{
+		starpu_data_unregister(data_handles[i]);
+	}
+
+	/* Snapshot the volumes before shutdown resets the counters */
+	starpu_mpi_comm_amounts_retrieve(comm_amount);
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+}
+
+/* Runs the send-side cache scenario with the cache disabled then
+ * enabled, and checks on ranks 0 and 1 that disabling the cache
+ * multiplies the volume sent to the peer by 5 (the batch repetition
+ * count in test_cache). Returns 0 on success. */
+int main(int argc, char **argv)
+{
+	int dst, rank, size;
+	int result=0;
+	size_t *comm_amount_with_cache;
+	size_t *comm_amount_without_cache;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	/* Enable the statistics starpu_mpi_comm_amounts_retrieve relies on */
+	setenv("STARPU_COMM_STATS", "1", 1);
+
+	comm_amount_with_cache = malloc(size * sizeof(size_t));
+	comm_amount_without_cache = malloc(size * sizeof(size_t));
+
+	/* STARPU_MPI_CACHE="0" disables the cache, so its figures belong in
+	 * the "without cache" buffer (the previous code stored them into the
+	 * opposite, misleadingly-named, buffers). */
+	test_cache(rank, size, "0", comm_amount_without_cache);
+	test_cache(rank, size, "1", comm_amount_with_cache);
+
+	if (rank == 0 || rank == 1)
+	{
+		dst = (rank == 0) ? 1 : 0;
+		/* Without the cache every one of the 5 repetitions re-sends */
+		result = (comm_amount_without_cache[dst] == comm_amount_with_cache[dst] * 5);
+		FPRINTF_MPI(stderr, "Communication cache mechanism is %sworking\n", result?"":"NOT ");
+	}
+	else
+		result = 1;
+
+	free(comm_amount_without_cache);
+	free(comm_amount_with_cache);
+
+	MPI_Finalize();
+	return !result;
+}
+#endif

+ 132 - 0
nmad/tests/matrix.c

@@ -0,0 +1,132 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+/* Scalar multiply-accumulate: Y += A * X. _args is genuinely unused. */
+void func_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	unsigned *A = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	unsigned *X = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
+	unsigned *Y = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[2]);
+
+	FPRINTF_MPI(stderr, "VALUES: Y=%3u A=%3u X=%3u\n", *Y, *A, *X);
+	*Y = *Y + *A * *X;
+}
+
+/* Three buffers: A (read), X (read), Y (accumulated in place). */
+struct starpu_codelet mycodelet =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_R, STARPU_RW}
+};
+
+#define N 4
+
+/* Distributed dot-product test: Y += sum(A[n]*X[n]) with A[n]/X[n] owned
+ * alternately by ranks 0 and 1 and Y owned by rank 0. Tasks are placed on
+ * the owner of A[n] via STARPU_EXECUTE_ON_DATA.
+ * Expected final value on rank 0: 10*1 + 20*2 + 30*3 + 40*4 = 300. */
+int main(int argc, char **argv)
+{
+	int rank, n;
+	int ret;
+	unsigned A[N];
+	unsigned X[N];
+	unsigned Y;
+	starpu_data_handle_t data_A[N];
+	starpu_data_handle_t data_X[N];
+	starpu_data_handle_t data_Y;
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	/* Third argument 1: let StarPU call MPI_Init/MPI_Finalize itself. */
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+	for(n = 0; n < N; n++)
+	{
+		A[n] = (n+1)*10;
+		X[n] = n+1;
+	}
+	Y = 0;
+
+	FPRINTF_MPI(stderr, "A = ");
+	for(n = 0; n < N; n++)
+	{
+		FPRINTF(stderr, "%u ", A[n]);
+	}
+	FPRINTF(stderr, "\n");
+	FPRINTF_MPI(stderr, "X = ");
+	for(n = 0; n < N; n++)
+	{
+		FPRINTF(stderr, "%u ", X[n]);
+	}
+	FPRINTF(stderr, "\n");
+
+	/* Register each A[n]/X[n] with a real buffer only on its owner
+	 * (rank n%2); other ranks register a placeholder (home node -1). */
+	for(n = 0; n < N; n++)
+	{
+		if (rank == n%2)
+			starpu_variable_data_register(&data_A[n], 0, (uintptr_t)&A[n], sizeof(unsigned));
+		else
+			starpu_variable_data_register(&data_A[n], -1, (uintptr_t)NULL, sizeof(unsigned));
+		starpu_mpi_data_register_comm(data_A[n], n+100, n%2, MPI_COMM_WORLD);
+		FPRINTF_MPI(stderr, "Registering A[%d] to %p with tag %d and node %d\n", n, data_A[n], n+100, n%2);
+
+		if (rank == n%2)
+			starpu_variable_data_register(&data_X[n], 0, (uintptr_t)&X[n], sizeof(unsigned));
+		else
+			starpu_variable_data_register(&data_X[n], -1, (uintptr_t)NULL, sizeof(unsigned));
+		starpu_mpi_data_register_comm(data_X[n], n+200, n%2, MPI_COMM_WORLD);
+		FPRINTF_MPI(stderr, "Registering X[%d] to %p with tag %d and node %d\n", n, data_X[n], n+200, n%2);
+	}
+	/* The accumulator Y lives on rank 0. */
+	if (rank == 0)
+		starpu_variable_data_register(&data_Y, 0, (uintptr_t)&Y, sizeof(unsigned));
+	else
+		starpu_variable_data_register(&data_Y, -1, (uintptr_t)NULL, sizeof(unsigned));
+	starpu_mpi_data_register_comm(data_Y, 10, 0, MPI_COMM_WORLD);
+	FPRINTF_MPI(stderr, "Registering Y to %p with tag %d and node %d\n", data_Y, 10, 0);
+
+	for(n = 0; n < N; n++)
+	{
+		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
+					     STARPU_R, data_A[n],
+					     STARPU_R, data_X[n],
+					     STARPU_RW, data_Y,
+					     STARPU_EXECUTE_ON_DATA, data_A[n],
+					     0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+	}
+
+	FPRINTF(stderr, "Waiting ...\n");
+	starpu_task_wait_for_all();
+
+	/* Unregistering writes the final values back to the owner's buffer. */
+	for(n = 0; n < N; n++)
+	{
+		starpu_data_unregister(data_A[n]);
+		starpu_data_unregister(data_X[n]);
+	}
+	starpu_data_unregister(data_Y);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	FPRINTF(stdout, "[%d] Y=%u\n", rank, Y);
+
+	if (rank == 0)
+	{
+		STARPU_ASSERT_MSG(Y==300, "Error when calculating Y=%u\n", Y);
+	}
+
+	return 0;
+}

+ 140 - 0
nmad/tests/matrix2.c

@@ -0,0 +1,140 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+/* Scalar multiply-accumulate: Y += A * X. _args is genuinely unused. */
+void func_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	unsigned *A = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	unsigned *X = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
+	unsigned *Y = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[2]);
+
+	FPRINTF_MPI(stderr, "VALUES: Y=%3u A=%3u X=%3u\n", *Y, *A, *X);
+	*Y = *Y + *A * *X;
+}
+
+/* Three buffers: A (read), X (read), Y (accumulated in place). */
+struct starpu_codelet mycodelet =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_R, STARPU_RW}
+};
+
+#define N 4
+
+/* Three-rank variant: A[n] is owned alternately by ranks 0/1, all X[n]
+ * by rank 2; tasks accumulate into X[N-1] on the owner of A[n].
+ * Expected on rank 2: X[3] = 4 + 10*1 + 20*2 + 30*3 = 144. */
+int main(int argc, char **argv)
+{
+	int rank, size;
+	int n;
+	int ret;
+	unsigned A[N];
+	unsigned X[N];
+	starpu_data_handle_t data_A[N];
+	starpu_data_handle_t data_X[N];
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size < 3)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need at least 3 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	for(n = 0; n < N; n++)
+	{
+		A[n] = (n+1)*10;
+		X[n] = n+1;
+	}
+
+	FPRINTF_MPI(stderr, "A = ");
+	for(n = 0; n < N; n++)
+	{
+		FPRINTF(stderr, "%u ", A[n]);
+	}
+	FPRINTF(stderr, "\n");
+	FPRINTF_MPI(stderr, "X = ");
+	for(n = 0; n < N; n++)
+	{
+		FPRINTF(stderr, "%u ", X[n]);
+	}
+	FPRINTF(stderr, "\n");
+
+	/* A[n] owned by rank n%2; non-owners register a placeholder. */
+	for(n = 0; n < N; n++)
+	{
+		if (rank == n%2)
+			starpu_variable_data_register(&data_A[n], 0, (uintptr_t)&A[n], sizeof(unsigned));
+		else
+			starpu_variable_data_register(&data_A[n], -1, (uintptr_t)NULL, sizeof(unsigned));
+		starpu_mpi_data_register(data_A[n], n+100, n%2);
+		FPRINTF_MPI(stderr, "Registering A[%d] to %p with tag %d and node %d\n", n,data_A[n], n+100, n%2);
+	}
+
+	/* All X[n] owned by rank 2. */
+	for(n = 0; n < N; n++)
+	{
+		if (rank == 2)
+			starpu_variable_data_register(&data_X[n], 0, (uintptr_t)&X[n], sizeof(unsigned));
+		else
+			starpu_variable_data_register(&data_X[n], -1, (uintptr_t)NULL, sizeof(unsigned));
+		starpu_mpi_data_register(data_X[n], n+200, 2);
+		FPRINTF_MPI(stderr, "Registering X[%d] to %p with tag %d and node %d\n", n, data_X[n], n+200, 2);
+	}
+
+	for(n = 0; n < N-1; n++)
+	{
+		fprintf(stderr, "loop %d\n", n);
+		ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
+					     STARPU_R, data_A[n],
+					     STARPU_R, data_X[n],
+					     STARPU_RW, data_X[N-1],
+					     STARPU_EXECUTE_ON_DATA, data_A[n],
+					     0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+	}
+
+	FPRINTF(stderr, "Waiting ...\n");
+	starpu_task_wait_for_all();
+
+	/* Unregistering writes the final values back to the owner's buffer. */
+	for(n = 0; n < N; n++)
+	{
+		starpu_data_unregister(data_A[n]);
+		starpu_data_unregister(data_X[n]);
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	FPRINTF(stdout, "[%d] X[%d]=%u\n", rank, N-1, X[N-1]);
+
+	if (rank == 2)
+	{
+		STARPU_ASSERT_MSG(X[N-1]==144, "Error when calculating X[N-1]=%u\n", X[N-1]);
+	}
+
+	MPI_Finalize();
+	return 0;
+}

+ 86 - 0
nmad/tests/mpi_detached_tag.c

@@ -0,0 +1,86 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	16
+#else
+#  define NITER	2048
+#endif
+#define SIZE	16
+
+float *tab;
+starpu_data_handle_t tab_handle;
+
+/* Ping-pong NITER blocks of SIZE floats between rank pairs (2k, 2k+1),
+ * using detached send/recv operations that unlock a StarPU tag on
+ * completion; the main thread synchronizes with starpu_tag_wait(). */
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size%2 != 0)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need a even number of processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	tab = malloc(SIZE*sizeof(float));
+
+	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
+
+	int nloops = NITER;
+	int loop;
+	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		/* Both the MPI tag and the StarPU tag derive from the loop counter. */
+		starpu_tag_t tag = (starpu_tag_t)loop;
+
+		/* Roles alternate each iteration: sender on one parity, receiver on the other. */
+		if ((loop % 2) == (rank%2))
+		{
+			starpu_mpi_isend_detached_unlock_tag(tab_handle, other_rank, loop, MPI_COMM_WORLD, tag);
+		}
+		else
+		{
+			starpu_mpi_irecv_detached_unlock_tag(tab_handle, other_rank, loop, MPI_COMM_WORLD, tag);
+		}
+
+		/* Block until the detached operation has unlocked the tag. */
+		starpu_tag_wait(tag);
+	}
+
+	starpu_data_unregister(tab_handle);
+	free(tab);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 85 - 0
nmad/tests/mpi_irecv.c

@@ -0,0 +1,85 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	16
+#else
+#  define NITER	2048
+#endif
+#define SIZE	16
+
+float *tab;
+starpu_data_handle_t tab_handle;
+
+/* Ping-pong NITER blocks of SIZE floats between rank pairs (2k, 2k+1).
+ * Sends are blocking; receptions exercise the non-blocking path
+ * (starpu_mpi_irecv followed by starpu_mpi_wait). */
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size%2 != 0)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need a even number of processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	tab = malloc(SIZE*sizeof(float));
+
+	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
+
+	int nloops = NITER;
+	int loop;
+	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		/* Roles alternate each iteration between the two ranks of a pair. */
+		if ((loop % 2) == (rank%2))
+		{
+			starpu_mpi_send(tab_handle, other_rank, loop, MPI_COMM_WORLD);
+		}
+		else
+		{
+			MPI_Status status;
+			starpu_mpi_req req;
+			starpu_mpi_irecv(tab_handle, &req, other_rank, loop, MPI_COMM_WORLD);
+			starpu_mpi_wait(&req, &status);
+		}
+	}
+
+	starpu_data_unregister(tab_handle);
+	free(tab);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 103 - 0
nmad/tests/mpi_irecv_detached.c

@@ -0,0 +1,103 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2012, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <common/thread.h>
+#include "helper.h"
+
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	16
+#else
+#  define NITER	2048
+#endif
+#define SIZE	16
+
+float *tab;
+starpu_data_handle_t tab_handle;
+
+static starpu_pthread_mutex_t mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
+static starpu_pthread_cond_t cond = STARPU_PTHREAD_COND_INITIALIZER;
+
+/* Detached-reception callback: flag completion and wake the main thread.
+ * arg points to the caller's completion flag (the original declaration
+ * wrongly marked it STARPU_ATTRIBUTE_UNUSED while dereferencing it). */
+void callback(void *arg)
+{
+	unsigned *received = arg;
+
+	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	*received = 1;
+	STARPU_PTHREAD_COND_SIGNAL(&cond);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+}
+
+
+/* Ping-pong NITER blocks of SIZE floats between rank pairs (2k, 2k+1).
+ * Sends are blocking; receptions are detached, with completion signalled
+ * to the main thread through a mutex/condvar pair. */
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size%2 != 0)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need a even number of processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	tab = malloc(SIZE*sizeof(float));
+
+	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
+
+	int nloops = NITER;
+	int loop;
+	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		if ((loop % 2) == (rank%2))
+		{
+			starpu_mpi_send(tab_handle, other_rank, loop, MPI_COMM_WORLD);
+		}
+		else
+		{
+			int received = 0;
+			starpu_mpi_irecv_detached(tab_handle, other_rank, loop, MPI_COMM_WORLD, callback, &received);
+
+			/* Wait for the detached reception's callback to fire. */
+			STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+			while (!received)
+				STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
+			STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+		}
+	}
+
+	starpu_data_unregister(tab_handle);
+	free(tab);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 86 - 0
nmad/tests/mpi_isend.c

@@ -0,0 +1,86 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	16
+#else
+#  define NITER	2048
+#endif
+#define SIZE	16
+
+float *tab;
+starpu_data_handle_t tab_handle;
+
+/* Ping-pong NITER blocks of SIZE floats between rank pairs (2k, 2k+1).
+ * Sends exercise the non-blocking path (starpu_mpi_isend followed by
+ * starpu_mpi_wait); receptions are blocking. */
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size%2 != 0)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need a even number of processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	tab = malloc(SIZE*sizeof(float));
+
+	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
+
+	int nloops = NITER;
+	int loop;
+	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		/* Roles alternate each iteration between the two ranks of a pair. */
+		if ((loop % 2) == (rank%2))
+		{
+			MPI_Status status;
+			starpu_mpi_req req;
+			starpu_mpi_isend(tab_handle, &req, other_rank, loop, MPI_COMM_WORLD);
+			starpu_mpi_wait(&req, &status);
+		}
+		else
+		{
+			MPI_Status status;
+			starpu_mpi_recv(tab_handle, other_rank, loop, MPI_COMM_WORLD, &status);
+		}
+	}
+
+	starpu_data_unregister(tab_handle);
+	free(tab);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 108 - 0
nmad/tests/mpi_isend_detached.c

@@ -0,0 +1,108 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2012, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <common/thread.h>
+
+#include "helper.h"
+
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	16
+#else
+#  define NITER	2048
+#endif
+#define SIZE	16
+
+static starpu_pthread_mutex_t mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
+static starpu_pthread_cond_t cond = STARPU_PTHREAD_COND_INITIALIZER;
+
+void callback(void *arg)
+{
+	unsigned *completed = arg;
+
+	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	*completed = 1;
+	STARPU_PTHREAD_COND_SIGNAL(&cond);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+}
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+	float *tab;
+	starpu_data_handle_t tab_handle;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size%2 != 0)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need a even number of processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	tab = malloc(SIZE*sizeof(float));
+
+	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
+
+	int nloops = NITER;
+	int loop;
+	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		if ((loop % 2) == (rank%2))
+		{
+			int sent = 0;
+			starpu_mpi_isend_detached(tab_handle, other_rank, loop, MPI_COMM_WORLD, callback, &sent);
+
+			STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+			while (!sent)
+				STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
+			STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+		}
+		else
+		{
+			int received = 0;
+			starpu_mpi_irecv_detached(tab_handle, other_rank, loop, MPI_COMM_WORLD, callback, &received);
+
+			STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+			while (!received)
+				STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
+			STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+		}
+	}
+
+	starpu_data_unregister(tab_handle);
+	free(tab);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 173 - 0
nmad/tests/mpi_reduction.c

@@ -0,0 +1,173 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2013  Université de Bordeaux
+ * Copyright (C) 2012, 2013, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+
+extern void init_cpu_func(void *descr[], void *cl_arg);
+extern void redux_cpu_func(void *descr[], void *cl_arg);
+extern void dot_cpu_func(void *descr[], void *cl_arg);
+extern void display_cpu_func(void *descr[], void *cl_arg);
+
+/* Produces the neutral element (0) for the reduction. */
+static struct starpu_codelet init_codelet =
+{
+	.cpu_funcs = {init_cpu_func},
+	.nbuffers = 1,
+	.modes = {STARPU_W},
+	.name = "init_codelet"
+};
+
+/* Merges two partial accumulators: arg0 += arg1. */
+static struct starpu_codelet redux_codelet =
+{
+	.cpu_funcs = {redux_cpu_func},
+	.modes = {STARPU_RW, STARPU_R},
+	.nbuffers = 2,
+	.name = "redux_codelet"
+};
+
+/* Sums a vector chunk into the STARPU_REDUX accumulator. */
+static struct starpu_codelet dot_codelet =
+{
+	.cpu_funcs = {dot_cpu_func},
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_REDUX},
+	.name = "dot_codelet"
+};
+
+/* Prints the current accumulator value. */
+static struct starpu_codelet display_codelet =
+{
+	.cpu_funcs = {display_cpu_func},
+	.nbuffers = 1,
+	.modes = {STARPU_R},
+	.name = "display_codelet"
+};
+
+/* Returns the MPI rank owning block index x (round-robin distribution). */
+int my_distrib(int x, int nb_nodes)
+{
+	return x % nb_nodes;
+}
+
+/* Distributed reduction test: each rank owns every size-th chunk of a
+ * vector; dot_codelet tasks accumulate chunk sums into dot_handle via
+ * STARPU_REDUX, and starpu_mpi_redux_data() merges the per-node partial
+ * results back onto rank 0 after each of the `loops` rounds. */
+int main(int argc, char **argv)
+{
+	int my_rank, size, x, y, i;
+	long int *vector;
+	long int dot, sum=0;
+	starpu_data_handle_t *handles;
+	starpu_data_handle_t dot_handle;
+
+	int nb_elements, step, loops;
+
+	int ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	nb_elements = size*8000;
+	step = 4;
+	loops = 5;
+
+	/* Only the owning rank fills its chunks; vector[k] = k+1. */
+	vector = (long int *) malloc(nb_elements*sizeof(vector[0]));
+	for(x = 0; x < nb_elements; x+=step)
+	{
+		int mpi_rank = my_distrib(x/step, size);
+		if (mpi_rank == my_rank)
+		{
+			for(y=0 ; y<step ; y++)
+			{
+				vector[x+y] = x+y+1;
+			}
+		}
+	}
+	if (my_rank == 0)
+	{
+		/* Seed the accumulator with 14; the expected result is
+		 * loops * sum(1..nb_elements) plus that seed. */
+		dot = 14;
+		sum = (nb_elements * (nb_elements + 1)) / 2;
+		sum *= loops;
+		sum += dot;
+		starpu_variable_data_register(&dot_handle, 0, (uintptr_t)&dot, sizeof(dot));
+	}
+	else
+	{
+		starpu_variable_data_register(&dot_handle, -1, (uintptr_t)NULL, sizeof(dot));
+	}
+
+
+	/* One handle per chunk of `step` elements; real buffer on the owner,
+	 * placeholder (home node -1) elsewhere. */
+	handles = (starpu_data_handle_t *) malloc(nb_elements*sizeof(handles[0]));
+	for(x = 0; x < nb_elements; x+=step)
+	{
+		int mpi_rank = my_distrib(x/step, size);
+		if (mpi_rank == my_rank)
+		{
+			/* Owning data */
+			starpu_vector_data_register(&handles[x], 0, (uintptr_t)&(vector[x]), step, sizeof(vector[0]));
+		}
+		else
+		{
+			starpu_vector_data_register(&handles[x], -1, (uintptr_t)NULL, step, sizeof(vector[0]));
+		}
+		if (handles[x])
+		{
+			starpu_mpi_data_register(handles[x], x, mpi_rank);
+		}
+	}
+
+	starpu_mpi_data_register(dot_handle, nb_elements+1, 0);
+	/* Declare how STARPU_REDUX contributions are initialized and merged. */
+	starpu_data_set_reduction_methods(dot_handle, &redux_codelet, &init_codelet);
+
+	for (i = 0; i < loops; i++)
+	{
+		for (x = 0; x < nb_elements; x+=step)
+		{
+			starpu_mpi_insert_task(MPI_COMM_WORLD,
+					       &dot_codelet,
+					       STARPU_R, handles[x],
+					       STARPU_REDUX, dot_handle,
+					       0);
+		}
+		/* Merge every node's partial accumulator onto the owner (rank 0). */
+		starpu_mpi_redux_data(MPI_COMM_WORLD, dot_handle);
+		starpu_mpi_insert_task(MPI_COMM_WORLD, &display_codelet, STARPU_R, dot_handle, 0);
+	}
+
+	fprintf(stderr, "Waiting ...\n");
+	starpu_task_wait_for_all();
+
+	for(x = 0; x < nb_elements; x+=step)
+	{
+		if (handles[x]) starpu_data_unregister(handles[x]);
+	}
+	if (dot_handle)
+	{
+		starpu_data_unregister(dot_handle);
+	}
+	free(vector);
+	free(handles);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	if (my_rank == 0)
+	{
+		fprintf(stderr, "[%d] sum=%ld\n", my_rank, sum);
+		fprintf(stderr, "[%d] dot=%ld\n", my_rank, dot);
+		fprintf(stderr, "%s when computing reduction\n", (sum == dot) ? "Success" : "Error");
+	}
+
+	return 0;
+}
+

+ 76 - 0
nmad/tests/mpi_reduction_kernels.c

@@ -0,0 +1,76 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <mpi.h>
+
+#define _DISPLAY(fmt, ...) do { \
+		int _display_rank; MPI_Comm_rank(MPI_COMM_WORLD, &_display_rank);	\
+		fprintf(stderr, "[%d][%s] " fmt , _display_rank, __starpu_func__ ,## __VA_ARGS__); 	\
+		fflush(stderr); } while(0)
+
+/*
+ *	Codelet to create a neutral element (0 for a sum reduction)
+ */
+void init_cpu_func(void *descr[], void *cl_arg)
+{
+	long int *dot = (long int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	*dot = 0;
+	_DISPLAY("Init dot\n");
+}
+
+/*
+ *	Codelet to perform the reduction of two elements: dota += dotb
+ */
+void redux_cpu_func(void *descr[], void *cl_arg)
+{
+	long int *dota = (long int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	long int *dotb = (long int *)STARPU_VARIABLE_GET_PTR(descr[1]);
+
+	*dota = *dota + *dotb;
+	_DISPLAY("Calling redux %ld=%ld+%ld\n", *dota, *dota-*dotb, *dotb);
+}
+
+/*
+ *	Dot product codelet: sums the vector chunk into the accumulator
+ */
+void dot_cpu_func(void *descr[], void *cl_arg)
+{
+	long int *local_x = (long int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
+
+	long int *dot = (long int *)STARPU_VARIABLE_GET_PTR(descr[1]);
+
+	unsigned i;
+	for (i = 0; i < n; i++)
+	{
+		*dot += local_x[i];
+	}
+}
+
+/*
+ *	Display codelet: print the accumulator. The handle is registered
+ *	with the *variable* interface, so it must be read through
+ *	STARPU_VARIABLE_GET_PTR (the original used the vector accessor,
+ *	which only worked because both interfaces start with the ptr field).
+ */
+void display_cpu_func(void *descr[], void *cl_arg)
+{
+	long int *dot = (long int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+
+	_DISPLAY("Local=%ld\n", *dot);
+}
+
+

+ 94 - 0
nmad/tests/mpi_redux.c

@@ -0,0 +1,94 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+static starpu_pthread_mutex_t mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
+static starpu_pthread_cond_t cond = STARPU_PTHREAD_COND_INITIALIZER;
+
+/* Reception callback: count one more completed irecv and wake main().
+ * arg points to the shared counter of completed receptions. */
+void callback(void *arg)
+{
+	unsigned *received = arg;
+
+	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	*received = *received + 1;
+	/* %u: *received is unsigned (the original used %d, a format mismatch) */
+	fprintf(stderr, "received = %u\n", *received);
+	STARPU_PTHREAD_COND_SIGNAL(&cond);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+}
+
+/* Rank 0 gathers one integer from every other rank through detached
+ * receptions and checks their sum against (size-1)*size/2; every other
+ * rank sends its own rank number. */
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+	int value=0;
+	starpu_data_handle_t *handles;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	if (rank == 0)
+	{
+		int src, sum;
+		/* Starts at 1 to count rank 0 itself; each callback adds one,
+		 * so the wait below ends once all size-1 receptions landed. */
+		int received = 1;
+
+		handles = malloc(size * sizeof(starpu_data_handle_t));
+
+		for(src=1 ; src<size ; src++)
+		{
+			starpu_variable_data_register(&handles[src], -1, (uintptr_t)NULL, sizeof(int));
+			starpu_mpi_irecv_detached(handles[src], src, 12, MPI_COMM_WORLD, callback, &received);
+		}
+
+		STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+		while (received != size)
+			STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+
+		for(src=1 ; src<size ; src++)
+		{
+			void *ptr = starpu_data_get_local_ptr(handles[src]);
+			value += *((int *)ptr);
+			starpu_data_unregister(handles[src]);
+		}
+		sum = ((size-1) * (size) / 2);
+		STARPU_ASSERT_MSG(sum == value, "Sum of first %d integers is %d, not %d\n", size-1, sum, value);
+	}
+	else
+	{
+		value = rank;
+		handles = malloc(sizeof(starpu_data_handle_t));
+		starpu_variable_data_register(&handles[0], 0, (uintptr_t)&value, sizeof(int));
+		starpu_mpi_send(handles[0], 0, 12, MPI_COMM_WORLD);
+		/* Asynchronous unregister; completion is covered by the wait below. */
+		starpu_data_unregister_submit(handles[0]);
+	}
+
+	starpu_task_wait_for_all();
+	free(handles);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 191 - 0
nmad/tests/mpi_scatter_gather.c

@@ -0,0 +1,191 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+/* Returns the MPI node number where the data with index x is stored */
+int my_distrib(int x, int nb_nodes)
+{
+	return x % nb_nodes;
+}
+
+void cpu_codelet(void *descr[], void *_args)
+{
+	int *vector = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
+	unsigned i;
+	int rank;
+
+	starpu_codelet_unpack_args(_args, &rank);
+	for (i = 0; i < nx; i++)
+	{
+		//fprintf(stderr,"rank %d v[%d] = %d\n", rank, i, vector[i]);
+		vector[i] *= rank+2;
+	}
+}
+
+static struct starpu_codelet cl =
+{
+	.cpu_funcs = {cpu_codelet},
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
+};
+
+void scallback(void *arg STARPU_ATTRIBUTE_UNUSED)
+{
+	char *msg = arg;
+	FPRINTF_MPI(stderr, "Sending completed for <%s>\n", msg);
+}
+
+void rcallback(void *arg STARPU_ATTRIBUTE_UNUSED)
+{
+	char *msg = arg;
+	FPRINTF_MPI(stderr, "Reception completed for <%s>\n", msg);
+}
+
+int main(int argc, char **argv)
+{
+	int rank, nodes, ret, x;
+	int *vector = NULL;
+	starpu_data_handle_t *data_handles;
+	int size=10;
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
+
+	if (rank == 0)
+	{
+		/* Allocate the vector */
+		vector = malloc(size * sizeof(int));
+		for(x=0 ; x<size ; x++)
+		{
+			vector[x] = x+10;
+		}
+
+		// Print vector
+		FPRINTF_MPI(stderr, " Input vector: ");
+		for(x=0 ; x<size ; x++)
+		{
+			FPRINTF(stderr, "%d\t", vector[x]);
+		}
+		FPRINTF(stderr,"\n");
+	}
+
+	/* Allocate data handles and register data to StarPU */
+	data_handles = (starpu_data_handle_t *) calloc(size, sizeof(starpu_data_handle_t));
+	for(x = 0; x < size ; x++)
+	{
+		int mpi_rank = my_distrib(x, nodes);
+		if (rank == 0)
+		{
+			starpu_vector_data_register(&data_handles[x], 0, (uintptr_t)&vector[x], 1, sizeof(int));
+		}
+		else if ((mpi_rank == rank))
+		{
+			/* I do not own that index but I will need it for my computations */
+			starpu_vector_data_register(&data_handles[x], -1, (uintptr_t)NULL, 1, sizeof(int));
+		}
+		else
+		{
+			/* I know it's useless to allocate anything for this */
+			data_handles[x] = NULL;
+		}
+		if (data_handles[x])
+		{
+			starpu_mpi_data_register(data_handles[x], x, 0);
+		}
+	}
+
+	/* Scatter the vector among the nodes */
+	for(x = 0; x < size ; x++)
+	{
+		if (data_handles[x])
+		{
+			int mpi_rank = my_distrib(x, nodes);
+			starpu_mpi_data_set_rank(data_handles[x], mpi_rank);
+		}
+	}
+	starpu_mpi_scatter_detached(data_handles, size, 0, MPI_COMM_WORLD, scallback, "scatter", NULL, NULL);
+
+	/* Calculation */
+	for(x = 0; x < size ; x++)
+	{
+		if (data_handles[x])
+		{
+			int owner = starpu_mpi_data_get_rank(data_handles[x]);
+			if (owner == rank)
+			{
+				FPRINTF_MPI(stderr,"Computing on data[%d]\n", x);
+				starpu_insert_task(&cl,
+						   STARPU_VALUE, &rank, sizeof(rank),
+						   STARPU_RW, data_handles[x],
+						   0);
+			}
+		}
+	}
+
+	/* Gather the vector on the main node */
+	starpu_mpi_gather_detached(data_handles, size, 0, MPI_COMM_WORLD, scallback, "gather", rcallback, "gather");
+	for(x = 0; x < size ; x++)
+	{
+		if (data_handles[x])
+		{
+			starpu_mpi_data_set_rank(data_handles[x], 0);
+		}
+	}
+
+	/* Unregister the vector from StarPU */
+	for(x=0 ; x<size ; x++)
+	{
+		if (data_handles[x])
+		{
+			starpu_data_unregister(data_handles[x]);
+		}
+	}
+
+	// Print vector
+	if (rank == 0)
+	{
+		FPRINTF_MPI(stderr, "Output vector: ");
+		for(x=0 ; x<size ; x++)
+		{
+			FPRINTF(stderr, "%d\t", vector[x]);
+		}
+		FPRINTF(stderr,"\n");
+		for(x=0 ; x<size ; x++)
+		{
+			int mpi_rank = my_distrib(x, nodes);
+			if (vector[x] != (x+10) * (mpi_rank+2))
+			{
+				FPRINTF_MPI(stderr, "Incorrect value for vector[%d]. computed %d != expected %d\n", x, vector[x], (x+10) * (mpi_rank+2));
+				ret = 1;
+			}
+		}
+		free(vector);
+	}
+
+	// Free memory
+	free(data_handles);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+	return (rank == 0) ? ret : 0;
+}

+ 93 - 0
nmad/tests/mpi_test.c

@@ -0,0 +1,93 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	16
+#else
+#  define NITER	2048
+#endif
+
+#define SIZE	16
+
+float *tab;
+starpu_data_handle_t tab_handle;
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size%2 != 0)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need a even number of processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	tab = malloc(SIZE*sizeof(float));
+
+	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
+
+	int nloops = NITER;
+	int loop;
+	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		starpu_mpi_req req;
+
+		if ((loop % 2) == (rank%2))
+		{
+			starpu_mpi_isend(tab_handle, &req, other_rank, loop, MPI_COMM_WORLD);
+		}
+		else
+		{
+			starpu_mpi_irecv(tab_handle, &req, other_rank, loop, MPI_COMM_WORLD);
+		}
+
+		int finished = 0;
+		do
+		{
+			MPI_Status status;
+			starpu_mpi_test(&req, &finished, &status);
+		}
+		while (!finished);
+	}
+
+	starpu_data_unregister(tab_handle);
+	free(tab);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	return 0;
+}

+ 97 - 0
nmad/tests/multiple_send.c

@@ -0,0 +1,97 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+	unsigned send[2] = {42, 11};
+	unsigned recv[2] = {33, 33};
+	starpu_mpi_req req[2];
+	starpu_data_handle_t send_handle[2];
+	starpu_data_handle_t recv_handle[2];
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size < 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need at least 2 processes.\n");
+
+		starpu_mpi_shutdown();
+		starpu_shutdown();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	starpu_variable_data_register(&send_handle[0], 0, (uintptr_t)&send[0], sizeof(unsigned));
+	starpu_variable_data_register(&send_handle[1], 0, (uintptr_t)&send[1], sizeof(unsigned));
+	starpu_variable_data_register(&recv_handle[0], 0, (uintptr_t)&recv[0], sizeof(unsigned));
+	starpu_variable_data_register(&recv_handle[1], 0, (uintptr_t)&recv[1], sizeof(unsigned));
+
+	if (rank == 0)
+	{
+		starpu_mpi_isend(send_handle[0], &(req[0]), 1, 12, MPI_COMM_WORLD);
+		starpu_mpi_isend(send_handle[1], &(req[1]), 1, 13, MPI_COMM_WORLD);
+	}
+	else if (rank == 1)
+	{
+		starpu_mpi_irecv(recv_handle[0], &(req[0]), 0, 12, MPI_COMM_WORLD);
+		starpu_mpi_irecv(recv_handle[1], &(req[1]), 0, 13, MPI_COMM_WORLD);
+	}
+
+	if (rank == 0 || rank == 1)
+	{
+		int nb_req=2;
+		while (nb_req)
+		{
+			int r=0;
+			for(r=0 ; r<2 ; r++)
+			{
+				if (req[r])
+				{
+					int finished = 0;
+					MPI_Status status;
+					starpu_mpi_test(&req[r], &finished, &status);
+					STARPU_ASSERT(finished != -1);
+					if (finished)
+					{
+						FPRINTF(stderr, "[%d] Request %d finished\n", rank, r);
+						req[r] = NULL;
+						nb_req--;
+					}
+				}
+			}
+		}
+	}
+	FPRINTF(stderr, "[%d] All requests finished\n", rank);
+
+	starpu_data_unregister(send_handle[0]);
+	starpu_data_unregister(send_handle[1]);
+	starpu_data_unregister(recv_handle[0]);
+	starpu_data_unregister(recv_handle[1]);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	return 0;
+}

+ 85 - 0
nmad/tests/pingpong.c

@@ -0,0 +1,85 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013, 2015  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	16
+#else
+#  define NITER	2048
+#endif
+
+#define SIZE	16
+
+float *tab;
+starpu_data_handle_t tab_handle;
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size%2 != 0)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need a even number of processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	tab = malloc(SIZE*sizeof(float));
+
+	starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, SIZE, sizeof(float));
+
+	int nloops = NITER;
+	int loop;
+	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		if ((loop % 2) == (rank%2))
+		{
+			//FPRINTF_MPI(stderr, "Sending to %d\n", other_rank);
+			starpu_mpi_send(tab_handle, other_rank, loop, MPI_COMM_WORLD);
+		}
+		else
+		{
+			MPI_Status status;
+			//FPRINTF_MPI(stderr, "Receiving from %d\n", other_rank);
+			starpu_mpi_recv(tab_handle, other_rank, loop, MPI_COMM_WORLD, &status);
+		}
+	}
+
+	starpu_data_unregister(tab_handle);
+	free(tab);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+	MPI_Finalize();
+
+	return 0;
+}

+ 133 - 0
nmad/tests/ring.c

@@ -0,0 +1,133 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	32
+#else
+#  define NITER	2048
+#endif
+
+int token = 42;
+starpu_data_handle_t token_handle;
+
+#ifdef STARPU_USE_CUDA
+extern void increment_cuda(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args);
+#endif
+
+void increment_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	int *tokenptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	(*tokenptr)++;
+}
+
+static struct starpu_codelet increment_cl =
+{
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {increment_cuda},
+#endif
+	.cpu_funcs = {increment_cpu},
+	.nbuffers = 1,
+	.modes = {STARPU_RW}
+};
+
+void increment_token(void)
+{
+	struct starpu_task *task = starpu_task_create();
+
+	task->cl = &increment_cl;
+	task->handles[0] = token_handle;
+	task->synchronous = 1;
+
+	int ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+}
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size < 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need at least 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(token));
+
+	int nloops = NITER;
+	int loop;
+
+	int last_loop = nloops - 1;
+	int last_rank = size - 1;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		int tag = loop*size + rank;
+
+		if (loop == 0 && rank == 0)
+		{
+			token = 0;
+			FPRINTF(stdout, "Start with token value %u\n", token);
+		}
+		else
+		{
+			MPI_Status status;
+			starpu_mpi_recv(token_handle, (rank+size-1)%size, tag, MPI_COMM_WORLD, &status);
+		}
+
+		increment_token();
+
+		if (loop == last_loop && rank == last_rank)
+		{
+			starpu_data_acquire(token_handle, STARPU_R);
+			FPRINTF(stdout, "Finished : token value %u\n", token);
+			starpu_data_release(token_handle);
+		}
+		else
+		{
+			starpu_mpi_send(token_handle, (rank+1)%size, tag+1, MPI_COMM_WORLD);
+		}
+	}
+
+	starpu_data_unregister(token_handle);
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	if (rank == last_rank)
+	{
+		STARPU_ASSERT(token == nloops*size);
+	}
+
+	return 0;
+}

+ 137 - 0
nmad/tests/ring_async.c

@@ -0,0 +1,137 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	32
+#else
+#  define NITER	2048
+#endif
+
+int token = 42;
+starpu_data_handle_t token_handle;
+
+#ifdef STARPU_USE_CUDA
+extern void increment_cuda(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args);
+#endif
+
+void increment_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	int *tokenptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	(*tokenptr)++;
+}
+
+static struct starpu_codelet increment_cl =
+{
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {increment_cuda},
+#endif
+	.cpu_funcs = {increment_cpu},
+	.nbuffers = 1,
+	.modes = {STARPU_RW}
+};
+
+void increment_token(void)
+{
+	struct starpu_task *task = starpu_task_create();
+
+	task->cl = &increment_cl;
+	task->handles[0] = token_handle;
+	task->synchronous = 1;
+
+	int ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+}
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size < 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need at least 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(token));
+
+	int nloops = NITER;
+	int loop;
+
+	int last_loop = nloops - 1;
+	int last_rank = size - 1;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		int tag = loop*size + rank;
+
+		if (loop == 0 && rank == 0)
+		{
+			token = 0;
+			FPRINTF(stdout, "Start with token value %u\n", token);
+		}
+		else
+		{
+			MPI_Status status;
+			starpu_mpi_req req;
+			starpu_mpi_irecv(token_handle, &req, (rank+size-1)%size, tag, MPI_COMM_WORLD);
+			starpu_mpi_wait(&req, &status);
+		}
+
+		increment_token();
+
+		if (loop == last_loop && rank == last_rank)
+		{
+			starpu_data_acquire(token_handle, STARPU_R);
+			FPRINTF(stdout, "Finished : token value %u\n", token);
+			starpu_data_release(token_handle);
+		}
+		else {
+			starpu_mpi_req req;
+			MPI_Status status;
+			starpu_mpi_isend(token_handle, &req, (rank+1)%size, tag+1, MPI_COMM_WORLD);
+			starpu_mpi_wait(&req, &status);
+		}
+	}
+
+	starpu_data_unregister(token_handle);
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	MPI_Finalize();
+
+	if (rank == last_rank)
+	{
+		STARPU_ASSERT(token == nloops*size);
+	}
+
+	return 0;
+}

+ 131 - 0
nmad/tests/ring_async_implicit.c

@@ -0,0 +1,131 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#ifdef STARPU_QUICK_CHECK
+#  define NITER	32
+#else
+#  define NITER	2048
+#endif
+
+int token = 42;
+starpu_data_handle_t token_handle;
+
+#ifdef STARPU_USE_CUDA
+extern void increment_cuda(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args);
+#endif
+
+void increment_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	int *tokenptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	(*tokenptr)++;
+}
+
+static struct starpu_codelet increment_cl =
+{
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {increment_cuda},
+#endif
+	.cpu_funcs = {increment_cpu},
+	.nbuffers = 1,
+	.modes = {STARPU_RW}
+};
+
+void increment_token(void)
+{
+	struct starpu_task *task = starpu_task_create();
+
+	task->cl = &increment_cl;
+	task->handles[0] = token_handle;
+
+	int ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+}
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size < 2)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need at least 2 processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+
+	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(token));
+
+	int nloops = NITER;
+	int loop;
+
+	int last_loop = nloops - 1;
+	int last_rank = size - 1;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		int tag = loop*size + rank;
+
+		if (loop == 0 && rank == 0)
+		{
+			token = 0;
+			FPRINTF(stdout, "Start with token value %u\n", token);
+		}
+		else
+		{
+			starpu_mpi_irecv_detached(token_handle, (rank+size-1)%size, tag, MPI_COMM_WORLD, NULL, NULL);
+		}
+
+		increment_token();
+
+		if (loop == last_loop && rank == last_rank)
+		{
+			starpu_data_acquire(token_handle, STARPU_R);
+			FPRINTF(stdout, "Finished : token value %u\n", token);
+			starpu_data_release(token_handle);
+		}
+		else
+		{
+			starpu_mpi_isend_detached(token_handle, (rank+1)%size, tag+1, MPI_COMM_WORLD, NULL, NULL);
+		}
+	}
+
+	starpu_task_wait_for_all();
+
+	starpu_data_unregister(token_handle);
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	if (rank == last_rank)
+	{
+		FPRINTF(stderr, "[%d] token = %u == %u * %d ?\n", rank, token, nloops, size);
+		STARPU_ASSERT(token == nloops*size);
+	}
+
+	return 0;
+}

+ 0 - 0
nmad/tests/ring_kernel.cu


Някои файлове не бяха показани, защото твърде много файлове са променени