7 years ago · 7dad72f05a
--- a/nmad/examples/Makefile.am
+++ b/nmad/examples/Makefile.am
@@ -1,7 +1,8 @@
 
				 # StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				 #
			
 
				-# Copyright (C) 2009-2013, 2016  Université de Bordeaux
			
 
				-# Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				+# Copyright (C) 2009-2013, 2015-2017  Université de Bordeaux
			
 
				+# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
			
 
				+# Copyright (C) 2016  Inria
			
 
				 #
			
 
				 # StarPU is free software; you can redistribute it and/or modify
			
 
				 # it under the terms of the GNU Lesser General Public License as published by
			
@@ -16,59 +17,89 @@
 
				 
			
 
				 include $(top_srcdir)/starpu.mk
			
 
				 
			
 
				+if STARPU_SIMGRID
			
 
				+STARPU_PERF_MODEL_DIR=$(abs_top_srcdir)/tools/perfmodels/sampling
			
 
				+STARPU_HOSTNAME=mirage
			
 
				+MALLOC_PERTURB_=0
			
 
				+export STARPU_PERF_MODEL_DIR
			
 
				+export STARPU_HOSTNAME
			
 
				+export MALLOC_PERTURB_
			
 
				+endif
			
 
				+
			
 
				 CC=$(MPICC)
			
 
				 CCLD=$(MPICC)
			
 
				+FC=$(MPIFORT)
			
 
				+FCLD=$(MPIFORT)
			
 
				 
			
 
				 if STARPU_HAVE_WINDOWS
			
 
				 LOADER_BIN		=
			
 
				 else
			
 
				 loader_CPPFLAGS 	= 	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
			
 
				+if !STARPU_SIMGRID
			
 
				 LOADER			=	loader
			
 
				-LOADER_BIN		=	$(abs_top_builddir)/nmad/tests/$(LOADER)
			
 
				+LOADER_BIN		=	$(abs_top_builddir)/nmad/examples/$(LOADER)
			
 
				+endif
			
 
				 loader_SOURCES		=	../../tests/loader.c
			
 
				 endif
			
 
				 
			
 
				+if STARPU_SIMGRID
			
 
				+MPI			=	$(abs_top_builddir)/tools/starpu_smpirun -np 4 -platform $(abs_top_srcdir)/tools/perfmodels/cluster.xml -hostfile $(abs_top_srcdir)/tools/perfmodels/hostfile
			
 
				+else
			
 
				+# we always test on 4 processes; the execution time is not much longer
			
 
				+if STARPU_QUICK_CHECK
			
 
				+MPI			=	$(MPIEXEC) $(MPIEXEC_ARGS) -np 4
			
 
				+else
			
 
				+MPI			=	$(MPIEXEC) $(MPIEXEC_ARGS) -np 4
			
 
				+endif
			
 
				+endif
			
 
				+
			
 
				 if STARPU_HAVE_AM111
			
 
				-TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
			
 
				-LOG_COMPILER	 	=	$(MPIEXEC) -np 2 $(LOADER_BIN)
			
 
				+TESTS_ENVIRONMENT	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
			
 
				+LOG_COMPILER	 	=	$(MPI) $(LOADER_BIN)
			
 
				 else
			
 
				-TESTS_ENVIRONMENT 	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPIEXEC) -np 4
			
 
				+TESTS_ENVIRONMENT 	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
			
 
				 endif
			
 
				 
			
 
				-if !STARPU_SIMGRID
			
 
				-if STARPU_MPI_CHECK
			
 
				+#if STARPU_MPI_CHECK
			
 
				 TESTS			=	$(starpu_mpi_EXAMPLES)
			
 
				-endif
			
 
				-endif
			
 
				+#endif
			
 
				 
			
 
				 check_PROGRAMS = $(LOADER) $(starpu_mpi_EXAMPLES)
			
 
				 starpu_mpi_EXAMPLES =
			
 
				 
			
 
				 BUILT_SOURCES =
			
 
				 
			
 
				-CLEANFILES = *.gcno *.gcda *.linkinfo
			
 
				+CLEANFILES = *.gcno *.gcda *.linkinfo starpu_idle_microsec.log *.mod
			
 
				 
			
 
				-EXTRA_DIST = 					\
			
 
				+EXTRA_DIST = 				\
			
 
				 	mpi_lu/mpi_lu-float.h		\
			
 
				 	mpi_lu/mpi_lu-double.h		\
			
 
				 	mpi_lu/plu_example.c		\
			
 
				+	mpi_lu/plu_implicit_example.c	\
			
 
				+	mpi_lu/plu_outofcore_example.c	\
			
 
				 	mpi_lu/plu_solve.c		\
			
 
				 	mpi_lu/pxlu.h			\
			
 
				 	mpi_lu/pxlu.c			\
			
 
				+	mpi_lu/pxlu_implicit.c		\
			
 
				 	mpi_lu/pxlu_kernels.h		\
			
 
				 	mpi_lu/pxlu_kernels.c		\
			
 
				+	matrix_decomposition/mpi_cholesky.h 		\
			
 
				 	matrix_decomposition/mpi_cholesky_codelets.h 	\
			
 
				 	matrix_decomposition/mpi_cholesky_kernels.h	\
			
 
				 	matrix_decomposition/mpi_cholesky_models.h 	\
			
 
				 	matrix_decomposition/mpi_decomposition_params.h	\
			
 
				 	matrix_decomposition/mpi_decomposition_matrix.h	\
			
 
				-	../tests/helper.h
			
 
				+	user_datatype/my_interface.h			\
			
 
				+	helper.h
			
 
				 
			
 
				 examplebindir = $(libdir)/starpu/mpi
			
 
				 
			
 
				 examplebin_PROGRAMS =
			
 
				 
			
 
				 if STARPU_USE_CUDA
			
 
				+if STARPU_COVERITY
			
 
				+include $(top_srcdir)/starpu-mynvcc.mk
			
 
				+else
			
 
				 NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -I$(top_builddir)/include/ $(HWLOC_CFLAGS)
			
 
				 
			
 
				 .cu.cubin:
			
@@ -78,11 +109,12 @@ NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -
 
				 .cu.o:
			
 
				 	$(NVCC) $< -c -o $@ $(NVCCFLAGS)
			
 
				 endif
			
 
				+endif
			
 
				 
			
 
				 AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
			
 
				-LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
			
 
				+LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ ../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la -lm @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
			
 
				 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include -I$(top_srcdir)/mpi/include
			
 
				-AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS)
			
 
				+AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
			
 
				 
			
 
				 ###################
			
 
				 # Stencil example #
			
@@ -91,24 +123,27 @@ if BUILD_EXAMPLES
 
				 examplebin_PROGRAMS +=				\
			
 
				 	stencil/stencil5
			
 
				 
			
 
				-stencil_stencil5_LDADD =		\
			
 
				-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la -lm
			
 
				-
			
 
				 starpu_mpi_EXAMPLES	+=	\
			
 
				 	stencil/stencil5
			
 
				 
			
 
				+endif
			
 
				+
			
 
				 ##################
			
 
				 # MPI LU example #
			
 
				 ##################
			
 
				 
			
 
				+if BUILD_EXAMPLES
			
 
				 if !NO_BLAS_LIB
			
 
				 
			
 
				 examplebin_PROGRAMS += 			\
			
 
				 	mpi_lu/plu_example_float	\
			
 
				-	mpi_lu/plu_example_double
			
 
				+	mpi_lu/plu_example_double	\
			
 
				+	mpi_lu/plu_implicit_example_float	\
			
 
				+	mpi_lu/plu_implicit_example_double	\
			
 
				+	mpi_lu/plu_outofcore_example_float	\
			
 
				+	mpi_lu/plu_outofcore_example_double
			
 
				 
			
 
				 mpi_lu_plu_example_float_LDADD =	\
			
 
				-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
			
 
				 	$(STARPU_LIBNUMA_LDFLAGS)				\
			
 
				 	$(STARPU_BLAS_LDFLAGS) -lm
			
 
				 
			
@@ -117,10 +152,9 @@ mpi_lu_plu_example_float_SOURCES =	\
 
				 	mpi_lu/plu_solve_float.c	\
			
 
				 	mpi_lu/pslu_kernels.c		\
			
 
				 	mpi_lu/pslu.c			\
			
 
				-	$(top_srcdir)/examples/common/blas.c
			
 
				+	../../examples/common/blas.c
			
 
				 
			
 
				 mpi_lu_plu_example_double_LDADD =	\
			
 
				-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
			
 
				 	$(STARPU_LIBNUMA_LDFLAGS)				\
			
 
				 	$(STARPU_BLAS_LDFLAGS) -lm
			
 
				 
			
@@ -129,13 +163,59 @@ mpi_lu_plu_example_double_SOURCES =	\
 
				 	mpi_lu/plu_solve_double.c  	\
			
 
				 	mpi_lu/pdlu_kernels.c	    	\
			
 
				 	mpi_lu/pdlu.c		    	\
			
 
				-	$(top_srcdir)/examples/common/blas.c
			
 
				+	../../examples/common/blas.c
			
 
				+
			
 
				+mpi_lu_plu_implicit_example_float_LDADD =	\
			
 
				+	$(STARPU_LIBNUMA_LDFLAGS)				\
			
 
				+	$(STARPU_BLAS_LDFLAGS) -lm
			
 
				+
			
 
				+mpi_lu_plu_implicit_example_float_SOURCES =	\
			
 
				+	mpi_lu/plu_implicit_example_float.c	\
			
 
				+	mpi_lu/plu_solve_float.c		\
			
 
				+	mpi_lu/pslu_kernels.c			\
			
 
				+	mpi_lu/pslu_implicit.c			\
			
 
				+	../../examples/common/blas.c
			
 
				+
			
 
				+mpi_lu_plu_implicit_example_double_LDADD =	\
			
 
				+	$(STARPU_LIBNUMA_LDFLAGS)				\
			
 
				+	$(STARPU_BLAS_LDFLAGS) -lm
			
 
				+
			
 
				+mpi_lu_plu_implicit_example_double_SOURCES =	\
			
 
				+	mpi_lu/plu_implicit_example_double.c	\
			
 
				+	mpi_lu/plu_solve_double.c		\
			
 
				+	mpi_lu/pdlu_kernels.c			\
			
 
				+	mpi_lu/pdlu_implicit.c			\
			
 
				+	../../examples/common/blas.c
			
 
				+
			
 
				+mpi_lu_plu_outofcore_example_float_LDADD =	\
			
 
				+	$(STARPU_LIBNUMA_LDFLAGS)				\
			
 
				+	$(STARPU_BLAS_LDFLAGS) -lm
			
 
				+
			
 
				+mpi_lu_plu_outofcore_example_float_SOURCES =	\
			
 
				+	mpi_lu/plu_outofcore_example_float.c	\
			
 
				+	mpi_lu/plu_solve_float.c		\
			
 
				+	mpi_lu/pslu_kernels.c			\
			
 
				+	mpi_lu/pslu_implicit.c			\
			
 
				+	../../examples/common/blas.c
			
 
				+
			
 
				+mpi_lu_plu_outofcore_example_double_LDADD =	\
			
 
				+	$(STARPU_LIBNUMA_LDFLAGS)				\
			
 
				+	$(STARPU_BLAS_LDFLAGS) -lm
			
 
				+
			
 
				+mpi_lu_plu_outofcore_example_double_SOURCES =	\
			
 
				+	mpi_lu/plu_outofcore_example_double.c	\
			
 
				+	mpi_lu/plu_solve_double.c		\
			
 
				+	mpi_lu/pdlu_kernels.c			\
			
 
				+	mpi_lu/pdlu_implicit.c			\
			
 
				+	../../examples/common/blas.c
			
 
				+endif
			
 
				 endif
			
 
				 
			
 
				 ########################
			
 
				 # MPI Cholesky example #
			
 
				 ########################
			
 
				 
			
 
				+if BUILD_EXAMPLES
			
 
				 if !NO_BLAS_LIB
			
 
				 examplebin_PROGRAMS +=		\
			
 
				 	matrix_decomposition/mpi_cholesky			\
			
@@ -148,10 +228,9 @@ matrix_decomposition_mpi_cholesky_SOURCES	=		\
 
				 	matrix_decomposition/mpi_cholesky_codelets.c	\
			
 
				 	matrix_decomposition/mpi_decomposition_params.c	\
			
 
				 	matrix_decomposition/mpi_decomposition_matrix.c	\
			
 
				-	$(top_srcdir)/examples/common/blas.c
			
 
				+	../../examples/common/blas.c
			
 
				 
			
 
				 matrix_decomposition_mpi_cholesky_LDADD =			\
			
 
				-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
			
 
				 	$(STARPU_BLAS_LDFLAGS) -lm
			
 
				 
			
 
				 matrix_decomposition_mpi_cholesky_distributed_SOURCES =	\
			
@@ -161,33 +240,151 @@ matrix_decomposition_mpi_cholesky_distributed_SOURCES =	\
 
				 	matrix_decomposition/mpi_cholesky_codelets.c	\
			
 
				 	matrix_decomposition/mpi_decomposition_params.c	\
			
 
				 	matrix_decomposition/mpi_decomposition_matrix.c	\
			
 
				-	$(top_srcdir)/examples/common/blas.c
			
 
				+	../../examples/common/blas.c
			
 
				 
			
 
				 matrix_decomposition_mpi_cholesky_distributed_LDADD =	\
			
 
				-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
			
 
				 	$(STARPU_BLAS_LDFLAGS) -lm
			
 
				 
			
 
				+if !STARPU_SIMGRID
			
 
				 starpu_mpi_EXAMPLES +=				\
			
 
				 	matrix_decomposition/mpi_cholesky			\
			
 
				 	matrix_decomposition/mpi_cholesky_distributed
			
 
				 endif
			
 
				+endif
			
 
				+endif
			
 
				+
			
 
				+########################
			
 
				+# MPI Matrix mult example #
			
 
				+########################
			
 
				+
			
 
				+if BUILD_EXAMPLES
			
 
				+examplebin_PROGRAMS +=		\
			
 
				+	matrix_mult/mm
			
 
				+
			
 
				+matrix_mult_mm_SOURCES	=		\
			
 
				+	matrix_mult/mm.c
			
 
				+
			
 
				+matrix_mult_mm_LDADD =			\
			
 
				+	-lm
			
 
				+
			
 
				+if !STARPU_SIMGRID
			
 
				+starpu_mpi_EXAMPLES +=				\
			
 
				+	matrix_mult/mm
			
 
				+endif
			
 
				+endif
			
 
				+
			
 
				+##########################################
			
 
				+# Native Fortran MPI Matrix mult example #
			
 
				+##########################################
			
 
				+
			
 
				+if STARPU_HAVE_MPIFORT
			
 
				+if BUILD_EXAMPLES
			
 
				+if !STARPU_SANITIZE
			
 
				+examplebin_PROGRAMS +=		\
			
 
				+	native_fortran/nf_mm	\
			
 
				+	native_fortran/nf_basic_ring
			
 
				+
			
 
				+native_fortran_nf_mm_SOURCES	=			\
			
 
				+	native_fortran/nf_mm_cl.f90			\
			
 
				+	$(top_srcdir)/mpi/include/fstarpu_mpi_mod.f90	\
			
 
				+	$(top_srcdir)/include/fstarpu_mod.f90		\
			
 
				+	native_fortran/nf_mm.f90
			
 
				+
			
 
				+native_fortran_nf_mm_LDADD =					\
			
 
				+	-lm
			
 
				+
			
 
				+native_fortran_nf_basic_ring_SOURCES	=			\
			
 
				+	$(top_srcdir)/mpi/include/fstarpu_mpi_mod.f90	\
			
 
				+	$(top_srcdir)/include/fstarpu_mod.f90		\
			
 
				+	native_fortran/nf_basic_ring.f90
			
 
				+
			
 
				+native_fortran_nf_basic_ring_LDADD =					\
			
 
				+	-lm
			
 
				+
			
 
				+if !STARPU_SIMGRID
			
 
				+starpu_mpi_EXAMPLES +=				\
			
 
				+	native_fortran/nf_mm			\
			
 
				+	native_fortran/nf_basic_ring
			
 
				+endif
			
 
				+endif
			
 
				+endif
			
 
				+endif
			
 
				 
			
 
				 ###################
			
 
				 # complex example #
			
 
				 ###################
			
 
				 
			
 
				+if BUILD_EXAMPLES
			
 
				 examplebin_PROGRAMS +=			\
			
 
				 	complex/mpi_complex
			
 
				 
			
 
				 complex_mpi_complex_SOURCES =		\
			
 
				 	complex/mpi_complex.c		\
			
 
				-	$(top_srcdir)/examples/interface/complex_interface.c
			
 
				-
			
 
				-complex_mpi_complex_LDADD =		\
			
 
				-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
			
 
				+	../../examples/interface/complex_interface.c
			
 
				 
			
 
				 starpu_mpi_EXAMPLES	+=			\
			
 
				 	complex/mpi_complex
			
 
				 endif
			
 
				 
			
 
				+#########################
			
 
				+# user_datatype example #
			
 
				+#########################
			
 
				+
			
 
				+if BUILD_EXAMPLES
			
 
				+examplebin_PROGRAMS +=				\
			
 
				+	user_datatype/user_datatype
			
 
				+
			
 
				+user_datatype_user_datatype_SOURCES =		\
			
 
				+	user_datatype/user_datatype.c		\
			
 
				+	user_datatype/my_interface.c
			
 
				 
			
 
				+if !STARPU_SIMGRID
			
 
				+starpu_mpi_EXAMPLES	+=			\
			
 
				+	user_datatype/user_datatype
			
 
				+endif
			
 
				+endif
			
 
				+
			
 
				+###################
			
 
				+# comm example #
			
 
				+###################
			
 
				+
			
 
				+if BUILD_EXAMPLES
			
 
				+examplebin_PROGRAMS +=			\
			
 
				+	comm/comm			\
			
 
				+	comm/mix_comm
			
 
				+
			
 
				+if !STARPU_SIMGRID
			
 
				+starpu_mpi_EXAMPLES	+=			\
			
 
				+	comm/comm				\
			
 
				+	comm/mix_comm
			
 
				+endif
			
 
				+endif
			
 
				+
			
 
				+if STARPU_HAVE_MPIFORT
			
 
				+if BUILD_EXAMPLES
			
 
				+if !STARPU_SANITIZE
			
 
				+# Native Fortran example
			
 
				+# - list explicit dependences to control proper module files generation
			
 
				+# - the overriding rule fully disables the corresponding default rule, thus
			
 
				+#   the default rule body must be copied entirely
			
 
				+fstarpu_mod.mod: fstarpu_mod.o
			
 
				+fstarpu_mpi_mod.mod: fstarpu_mpi_mod.o
			
 
				+nf_mm_cl.mod: nf_mm_cl.o
			
 
				+
			
 
				+fstarpu_mod.o: $(top_srcdir)/include/fstarpu_mod.f90
			
 
				+	$(AM_V_FC)$(FC) $(native_fortran_nf_mm_FCFLAGS) $(FCFLAGS) -c -o $@ '$(top_srcdir)/'include/fstarpu_mod.f90
			
 
				+
			
 
				+fstarpu_mpi_mod.o: $(top_srcdir)/mpi/include/fstarpu_mpi_mod.f90 fstarpu_mod.mod
			
 
				+	$(AM_V_FC)$(FC) $(native_fortran_nf_mm_FCFLAGS) $(FCFLAGS) -c -o $@ '$(top_srcdir)/'mpi/include/fstarpu_mpi_mod.f90
			
 
				+
			
 
				+nf_mm_cl.o: $(top_srcdir)/mpi/examples/native_fortran/nf_mm_cl.f90 fstarpu_mpi_mod.mod fstarpu_mod.mod
			
 
				+	$(AM_V_FC)$(FC) $(native_fortran_nf_mm_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_mm_cl.f90' || echo '$(srcdir)/'`native_fortran/nf_mm_cl.f90
			
 
				+
			
 
				+nf_mm.o: $(top_srcdir)/mpi/examples/native_fortran/nf_mm.f90 nf_mm_cl.mod fstarpu_mpi_mod.mod fstarpu_mod.mod
			
 
				+	$(AM_V_FC)$(FC) $(native_fortran_nf_mm_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_mm.f90' || echo '$(srcdir)/'`native_fortran/nf_mm.f90
			
 
				+
			
 
				+nf_basic_ring.o: $(top_srcdir)/mpi/examples/native_fortran/nf_basic_ring.f90 fstarpu_mpi_mod.mod fstarpu_mod.mod
			
 
				+	$(AM_V_FC)$(FC) $(native_fortran_nf_basic_ring_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_basic_ring.f90' || echo '$(srcdir)/'`native_fortran/nf_basic_ring.f90
			
 
				+endif
			
 
				+endif
			
 
				+endif
			
--- a/nmad/examples/comm/comm.c
+++ b/nmad/examples/comm/comm.c
@@ -0,0 +1,150 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2015, 2016, 2017  CNRS
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+/*
			
 
				+ * This example splits the processes of MPI_COMM_WORLD into subgroups,
			
 
				+ * all communications take place within each subgroup
			
 
				+ */
			
 
				+
			
 
				+#include <starpu_mpi.h>
			
 
				+#include "../helper.h"
			
 
				+
			
 
				+void func_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
			
 
				+{
			
 
				+	int *value = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
			
 
				+	int rank;
			
 
				+
			
 
				+	starpu_codelet_unpack_args(_args, &rank);
			
 
				+	FPRINTF_MPI(stderr, "Executing codelet with value %d and rank %d\n", *value, rank);
			
 
				+	STARPU_ASSERT_MSG(*value == rank, "Received value %d is not the expected value %d\n", *value, rank);
			
 
				+}
			
 
				+
			
 
				+struct starpu_codelet mycodelet =
			
 
				+{
			
 
				+	.cpu_funcs = {func_cpu},
			
 
				+	.nbuffers = 1,
			
 
				+	.modes = {STARPU_RW}
			
 
				+};
			
 
				+
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+	int size, x=789;
			
 
				+	int color;
			
 
				+	MPI_Comm newcomm;
			
 
				+	int rank, newrank;
			
 
				+	int ret;
			
 
				+	starpu_data_handle_t data[2];
			
 
				+	int thread_support;
			
 
				+
			
 
				+	if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS)
			
 
				+	{
			
 
				+		fprintf(stderr,"MPI_Init_thread failed\n");
			
 
				+		exit(1);
			
 
				+	}
			
 
				+	if (thread_support == MPI_THREAD_FUNNELED)
			
 
				+		fprintf(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n");
			
 
				+	if (thread_support < MPI_THREAD_FUNNELED)
			
 
				+		fprintf(stderr,"Warning: MPI does not have thread support!\n");
			
 
				+
			
 
				+        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+        MPI_Comm_size(MPI_COMM_WORLD, &size);
			
 
				+
			
 
				+        if (size < 4)
			
 
				+        {
			
 
				+		FPRINTF(stderr, "We need at least 4 processes.\n");
			
 
				+                MPI_Finalize();
			
 
				+                return STARPU_TEST_SKIPPED;
			
 
				+        }
			
 
				+
			
 
				+	color = rank%2;
			
 
				+	MPI_Comm_split(MPI_COMM_WORLD, color, rank, &newcomm);
			
 
				+	MPI_Comm_rank(newcomm, &newrank);
			
 
				+	FPRINTF(stderr, "[%d][%d] color %d\n", rank, newrank, color);
			
 
				+
			
 
				+	if (newrank == 0)
			
 
				+	{
			
 
				+		FPRINTF(stderr, "[%d][%d] sending %d\n", rank, newrank, rank);
			
 
				+		MPI_Send(&rank, 1, MPI_INT, 1, 10, newcomm);
			
 
				+	}
			
 
				+	else if (newrank == 1)
			
 
				+	{
			
 
				+		MPI_Recv(&x, 1, MPI_INT, 0, 10, newcomm, MPI_STATUS_IGNORE);
			
 
				+		FPRINTF(stderr, "[%d][%d] received %d\n", rank, newrank, x);
			
 
				+	}
			
 
				+
			
 
				+        ret = starpu_init(NULL);
			
 
				+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				+        ret = starpu_mpi_init_comm(NULL, NULL, 0, newcomm);
			
 
				+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				+
			
 
				+	if (newrank == 0)
			
 
				+	{
			
 
				+		starpu_variable_data_register(&data[0], STARPU_MAIN_RAM, (uintptr_t)&rank, sizeof(int));
			
 
				+		starpu_variable_data_register(&data[1], STARPU_MAIN_RAM, (uintptr_t)&rank, sizeof(int));
			
 
				+		starpu_mpi_data_register_comm(data[1], 22, 0, newcomm);
			
 
				+	}
			
 
				+	else
			
 
				+		starpu_variable_data_register(&data[0], -1, (uintptr_t)NULL, sizeof(int));
			
 
				+	starpu_mpi_data_register_comm(data[0], 12, 0, newcomm);
			
 
				+
			
 
				+	if (newrank == 0)
			
 
				+	{
			
 
				+		starpu_mpi_req req[2];
			
 
				+		starpu_mpi_issend(data[1], &req[0], 1, 22, newcomm);
			
 
				+		starpu_mpi_isend(data[0], &req[1], 1, 12, newcomm);
			
 
				+		starpu_mpi_wait(&req[0], MPI_STATUS_IGNORE);
			
 
				+		starpu_mpi_wait(&req[1], MPI_STATUS_IGNORE);
			
 
				+	}
			
 
				+	else if (newrank == 1)
			
 
				+	{
			
 
				+		int *xx;
			
 
				+
			
 
				+		starpu_mpi_recv(data[0], 0, 12, newcomm, MPI_STATUS_IGNORE);
			
 
				+		starpu_data_acquire(data[0], STARPU_RW);
			
 
				+		xx = (int *)starpu_variable_get_local_ptr(data[0]);
			
 
				+		starpu_data_release(data[0]);
			
 
				+		FPRINTF(stderr, "[%d][%d] received %d\n", rank, newrank, *xx);
			
 
				+		STARPU_ASSERT_MSG(x==*xx, "Received value %d is incorrect (should be %d)\n", *xx, x);
			
 
				+
			
 
				+		starpu_variable_data_register(&data[1], -1, (uintptr_t)NULL, sizeof(int));
			
 
				+		starpu_mpi_data_register_comm(data[1], 22, 0, newcomm);
			
 
				+		starpu_mpi_recv(data[0], 0, 22, newcomm, MPI_STATUS_IGNORE);
			
 
				+		starpu_data_acquire(data[0], STARPU_RW);
			
 
				+		xx = (int *)starpu_variable_get_local_ptr(data[0]);
			
 
				+		starpu_data_release(data[0]);
			
 
				+		FPRINTF(stderr, "[%d][%d] received %d\n", rank, newrank, *xx);
			
 
				+		STARPU_ASSERT_MSG(x==*xx, "Received value %d is incorrect (should be %d)\n", *xx, x);
			
 
				+	}
			
 
				+
			
 
				+	if (newrank == 0 || newrank == 1)
			
 
				+	{
			
 
				+		starpu_mpi_task_insert(newcomm, &mycodelet,
			
 
				+				       STARPU_RW, data[0],
			
 
				+				       STARPU_VALUE, &x, sizeof(x),
			
 
				+				       STARPU_EXECUTE_ON_NODE, 1,
			
 
				+				       0);
			
 
				+
			
 
				+		starpu_task_wait_for_all();
			
 
				+		starpu_data_unregister(data[0]);
			
 
				+		starpu_data_unregister(data[1]);
			
 
				+	}
			
 
				+
			
 
				+	starpu_mpi_shutdown();
			
 
				+	starpu_shutdown();
			
 
				+	MPI_Comm_free(&newcomm);
			
 
				+        MPI_Finalize();
			
 
				+	return 0;
			
 
				+}
			
--- a/nmad/examples/comm/mix_comm.c
+++ b/nmad/examples/comm/mix_comm.c
@@ -0,0 +1,185 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2015, 2017  CNRS
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+/*
			
 
				+ * This example splits the processes of MPI_COMM_WORLD into subgroups,
			
 
				+ * communications take place both within each subgroup and over MPI_COMM_WORLD.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu_mpi.h>
			
 
				+#include "../helper.h"
			
 
				+
			
 
				+void func_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
			
 
				+{
			
 
				+	int *value = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
			
 
				+	int rank;
			
 
				+
			
 
				+	starpu_codelet_unpack_args(_args, &rank);
			
 
				+	FPRINTF_MPI(stderr, "Executing codelet with value %d and rank %d\n", *value, rank);
			
 
				+	STARPU_ASSERT_MSG(*value == rank, "Received value %d is not the expected value %d\n", *value, rank);
			
 
				+}
			
 
				+
			
 
				+struct starpu_codelet mycodelet =
			
 
				+{
			
 
				+	.cpu_funcs = {func_cpu},
			
 
				+	.nbuffers = 1,
			
 
				+	.modes = {STARPU_RW}
			
 
				+};
			
 
				+
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+	int size, x;
			
 
				+	int color;
			
 
				+	MPI_Comm newcomm;
			
 
				+	int rank, newrank;
			
 
				+	int ret;
			
 
				+	starpu_data_handle_t data[3];
			
 
				+	int value = 90;
			
 
				+	int thread_support;
			
 
				+	if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS)
			
 
				+	{
			
 
				+		fprintf(stderr,"MPI_Init_thread failed\n");
			
 
				+		exit(1);
			
 
				+	}
			
 
				+	if (thread_support == MPI_THREAD_FUNNELED)
			
 
				+		fprintf(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n");
			
 
				+	if (thread_support < MPI_THREAD_FUNNELED)
			
 
				+		fprintf(stderr,"Warning: MPI does not have thread support!\n");
			
 
				+
			
 
				+        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+        MPI_Comm_size(MPI_COMM_WORLD, &size);
			
 
				+
			
 
				+        if (size < 4)
			
 
				+        {
			
 
				+		FPRINTF(stderr, "We need at least 4 processes.\n");
			
 
				+                MPI_Finalize();
			
 
				+                return STARPU_TEST_SKIPPED;
			
 
				+        }
			
 
				+
			
 
				+	color = rank%2;
			
 
				+	MPI_Comm_split(MPI_COMM_WORLD, color, rank, &newcomm);
			
 
				+	MPI_Comm_rank(newcomm, &newrank);
			
 
				+	FPRINTF(stderr, "[%d][%d] color %d\n", rank, newrank, color);
			
 
				+
			
 
				+	if (newrank == 0)
			
 
				+	{
			
 
				+		FPRINTF(stderr, "[%d][%d] sending %d\n", rank, newrank, rank);
			
 
				+		MPI_Send(&rank, 1, MPI_INT, 1, 10, newcomm);
			
 
				+	}
			
 
				+	else if (newrank == 1)
			
 
				+	{
			
 
				+		MPI_Recv(&x, 1, MPI_INT, 0, 10, newcomm, MPI_STATUS_IGNORE);
			
 
				+		FPRINTF(stderr, "[%d][%d] received %d\n", rank, newrank, x);
			
 
				+	}
			
 
				+
			
 
				+        ret = starpu_init(NULL);
			
 
				+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				+        ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				+
			
 
				+	if (rank == 0)
			
 
				+	{
			
 
				+		starpu_variable_data_register(&data[2], STARPU_MAIN_RAM, (uintptr_t)&value, sizeof(int));
			
 
				+	}
			
 
				+	else
			
 
				+		starpu_variable_data_register(&data[2], -1, (uintptr_t)NULL, sizeof(int));
			
 
				+	starpu_mpi_data_register_comm(data[2], 44, 0, MPI_COMM_WORLD);
			
 
				+
			
 
				+	if (newrank == 0)
			
 
				+	{
			
 
				+		starpu_variable_data_register(&data[0], STARPU_MAIN_RAM, (uintptr_t)&rank, sizeof(int));
			
 
				+		starpu_variable_data_register(&data[1], STARPU_MAIN_RAM, (uintptr_t)&rank, sizeof(int));
			
 
				+		starpu_mpi_data_register_comm(data[1], 22, 0, newcomm);
			
 
				+	}
			
 
				+	else
			
 
				+		starpu_variable_data_register(&data[0], -1, (uintptr_t)NULL, sizeof(int));
			
 
				+	starpu_mpi_data_register_comm(data[0], 12, 0, newcomm);
			
 
				+
			
 
				+	if (newrank == 0)
			
 
				+	{
			
 
				+		starpu_mpi_req req[2];
			
 
				+		starpu_mpi_issend(data[1], &req[0], 1, 22, newcomm);
			
 
				+		starpu_mpi_isend(data[0], &req[1], 1, 12, newcomm);
			
 
				+		starpu_mpi_wait(&req[0], MPI_STATUS_IGNORE);
			
 
				+		starpu_mpi_wait(&req[1], MPI_STATUS_IGNORE);
			
 
				+	}
			
 
				+	else if (newrank == 1)
			
 
				+	{
			
 
				+		int *xx;
			
 
				+
			
 
				+		starpu_mpi_recv(data[0], 0, 12, newcomm, MPI_STATUS_IGNORE);
			
 
				+		starpu_data_acquire(data[0], STARPU_RW);
			
 
				+		xx = (int *)starpu_variable_get_local_ptr(data[0]);
			
 
				+		starpu_data_release(data[0]);
			
 
				+		FPRINTF(stderr, "[%d][%d] received %d\n", rank, newrank, *xx);
			
 
				+		STARPU_ASSERT_MSG(x==*xx, "Received value %d is incorrect (should be %d)\n", *xx, x);
			
 
				+
			
 
				+		starpu_variable_data_register(&data[1], -1, (uintptr_t)NULL, sizeof(int));
			
 
				+		starpu_mpi_data_register_comm(data[1], 22, 0, newcomm);
			
 
				+		starpu_mpi_recv(data[0], 0, 22, newcomm, MPI_STATUS_IGNORE);
			
 
				+		starpu_data_acquire(data[0], STARPU_RW);
			
 
				+		xx = (int *)starpu_variable_get_local_ptr(data[0]);
			
 
				+		starpu_data_release(data[0]);
			
 
				+		FPRINTF(stderr, "[%d][%d] received %d\n", rank, newrank, *xx);
			
 
				+		STARPU_ASSERT_MSG(x==*xx, "Received value %d is incorrect (should be %d)\n", *xx, x);
			
 
				+	}
			
 
				+
			
 
				+	if (rank == 0)
			
 
				+	{
			
 
				+		starpu_data_acquire(data[2], STARPU_RW);
			
 
				+		int rvalue = *((int *)starpu_variable_get_local_ptr(data[2]));
			
 
				+		starpu_data_release(data[2]);
			
 
				+		FPRINTF_MPI(stderr, "sending value %d to %d and receiving from %d\n", rvalue, 1, size-1);
			
 
				+		starpu_mpi_send(data[2], 1, 44, MPI_COMM_WORLD);
			
 
				+		starpu_mpi_recv(data[2], size-1, 44, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
			
 
				+		starpu_data_acquire(data[2], STARPU_RW);
			
 
				+		int *xx = (int *)starpu_variable_get_local_ptr(data[2]);
			
 
				+		starpu_data_release(data[2]);
			
 
				+		FPRINTF_MPI(stderr, "Value back is %d\n", *xx);
			
 
				+		STARPU_ASSERT_MSG(*xx == rvalue + (2*(size-1)), "Received value %d is incorrect (should be %d)\n", *xx, rvalue + (2*(size-1)));
			
 
				+	}
			
 
				+	else
			
 
				+	{
			
 
				+		int next = (rank == size-1) ? 0 : rank+1;
			
 
				+		starpu_mpi_recv(data[2], rank-1, 44, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
			
 
				+		starpu_data_acquire(data[2], STARPU_RW);
			
 
				+		int *xx = (int *)starpu_variable_get_local_ptr(data[2]);
			
 
				+		FPRINTF_MPI(stderr, "receiving %d from %d and sending %d to %d\n", *xx, rank-1, *xx+2, next);
			
 
				+		*xx = *xx + 2;
			
 
				+		starpu_data_release(data[2]);
			
 
				+		starpu_mpi_send(data[2], next, 44, MPI_COMM_WORLD);
			
 
				+	}
			
 
				+
			
 
				+	if (newrank == 0 || newrank == 1)
			
 
				+	{
			
 
				+		starpu_mpi_task_insert(newcomm, &mycodelet,
			
 
				+				       STARPU_RW, data[0],
			
 
				+				       STARPU_VALUE, &x, sizeof(x),
			
 
				+				       STARPU_EXECUTE_ON_NODE, 1,
			
 
				+				       0);
			
 
				+
			
 
				+		starpu_task_wait_for_all();
			
 
				+		starpu_data_unregister(data[0]);
			
 
				+		starpu_data_unregister(data[1]);
			
 
				+	}
			
 
				+	starpu_data_unregister(data[2]);
			
 
				+
			
 
				+	starpu_mpi_shutdown();
			
 
				+	starpu_shutdown();
			
 
				+	MPI_Comm_free(&newcomm);
			
 
				+        MPI_Finalize();
			
 
				+	return 0;
			
 
				+}
			
--- a/nmad/examples/complex/mpi_complex.c
+++ b/nmad/examples/complex/mpi_complex.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2012, 2013, 2015, 2016, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -18,85 +18,109 @@
 
				 #include <interface/complex_interface.h>
			
 
				 #include <interface/complex_codelet.h>
			
 
				 
			
 
				+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
			
 
				+
			
 
				 void display_foo_codelet(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
			
 
				 {
			
 
				 	int *foo = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
			
 
				-	fprintf(stderr, "foo = %d\n", *foo);
			
 
				+	FPRINTF(stderr, "foo = %d\n", *foo);
			
 
				+}
			
 
				+
			
 
				+/* Dummy performance model for simgrid: every task is given the same constant cost */
			
 
				+static double display_cost_function(struct starpu_task *task, unsigned nimpl)
			
 
				+{
			
 
				+	(void) task;
			
 
				+	(void) nimpl;
			
 
				+	return 0.000001;
			
 
				 }
			
 
				 
			
 
				+static struct starpu_perfmodel display_model =
			
 
				+{
			
 
				+	.type = STARPU_COMMON,
			
 
				+	.cost_function = display_cost_function,
			
 
				+	.symbol = "display"
			
 
				+};
			
 
				+
			
 
				 struct starpu_codelet foo_display =
			
 
				 {
			
 
				 	.cpu_funcs = {display_foo_codelet},
			
 
				 	.nbuffers = 1,
			
 
				-	.modes = {STARPU_R}
			
 
				+	.modes = {STARPU_R},
			
 
				+	.model = &display_model
			
 
				 };
			
 
				 
			
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				 	int rank, nodes;
			
 
				 	int ret;
			
 
				-	int compare;
			
 
				+	int compare=0;
			
 
				 
			
 
				 	ret = starpu_init(NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 	ret = starpu_mpi_init(&argc, &argv, 1);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	starpu_mpi_comm_size(MPI_COMM_WORLD, &nodes);
			
 
				 
			
 
				-	if (nodes < 2)
			
 
				+	if (nodes < 2 || (starpu_cpu_worker_get_count() == 0))
			
 
				 	{
			
 
				-		fprintf(stderr, "This program needs at least 2 nodes (%d available)\n", nodes);
			
 
				-		ret = 77;
			
 
				+		if (rank == 0)
			
 
				+		{
			
 
				+			if (nodes < 2)
			
 
				+				fprintf(stderr, "We need at least 2 processes.\n");
			
 
				+			else
			
 
				+				fprintf(stderr, "We need at least 1 CPU.\n");
			
 
				+		}
			
 
				+		starpu_mpi_shutdown();
			
 
				+		starpu_shutdown();
			
 
				+		return 77;
			
 
				 	}
			
 
				-	else
			
 
				-	{
			
 
				-		starpu_data_handle_t handle;
			
 
				-		starpu_data_handle_t handle2;
			
 
				-
			
 
				-		double real[2] = {4.0, 2.0};
			
 
				-		double imaginary[2] = {7.0, 9.0};
			
 
				 
			
 
				-		double real2[2] = {14.0, 12.0};
			
 
				-		double imaginary2[2] = {17.0, 19.0};
			
 
				+	starpu_data_handle_t handle;
			
 
				+	starpu_data_handle_t handle2;
			
 
				 
			
 
				-		if (rank == 1)
			
 
				-		{
			
 
				-			real[0] = 0.0;
			
 
				-			real[1] = 0.0;
			
 
				-			imaginary[0] = 0.0;
			
 
				-			imaginary[1] = 0.0;
			
 
				-		}
			
 
				+	double real[2] = {4.0, 2.0};
			
 
				+	double imaginary[2] = {7.0, 9.0};
			
 
				 
			
 
				-		starpu_complex_data_register(&handle, 0, real, imaginary, 2);
			
 
				-		starpu_complex_data_register(&handle2, -1, real2, imaginary2, 2);
			
 
				+	double real2[2] = {14.0, 12.0};
			
 
				+	double imaginary2[2] = {17.0, 19.0};
			
 
				 
			
 
				-		if (rank == 0)
			
 
				-		{
			
 
				-			int *compare_ptr = &compare;
			
 
				+	if (rank == 1)
			
 
				+	{
			
 
				+		real[0] = 0.0;
			
 
				+		real[1] = 0.0;
			
 
				+		imaginary[0] = 0.0;
			
 
				+		imaginary[1] = 0.0;
			
 
				+	}
			
 
				 
			
 
				-			starpu_insert_task(&cl_display, STARPU_VALUE, "node0 initial value", strlen("node0 initial value")+1, STARPU_R, handle, 0);
			
 
				-			starpu_mpi_isend_detached(handle, 1, 10, MPI_COMM_WORLD, NULL, NULL);
			
 
				-			starpu_mpi_irecv_detached(handle2, 1, 20, MPI_COMM_WORLD, NULL, NULL);
			
 
				+	starpu_complex_data_register(&handle, STARPU_MAIN_RAM, real, imaginary, 2);
			
 
				+	starpu_complex_data_register(&handle2, -1, real2, imaginary2, 2);
			
 
				 
			
 
				-			starpu_insert_task(&cl_display, STARPU_VALUE, "node0 received value", strlen("node0 received value")+1, STARPU_R, handle2, 0);
			
 
				-			starpu_insert_task(&cl_compare, STARPU_R, handle, STARPU_R, handle2, STARPU_VALUE, &compare_ptr, sizeof(compare_ptr), 0);
			
 
				-		}
			
 
				-		else if (rank == 1)
			
 
				-		{
			
 
				-			starpu_mpi_irecv_detached(handle, 0, 10, MPI_COMM_WORLD, NULL, NULL);
			
 
				-			starpu_insert_task(&cl_display, STARPU_VALUE, "node1 received value", strlen("node1 received value")+1, STARPU_R, handle, 0);
			
 
				-			starpu_mpi_isend_detached(handle, 0, 20, MPI_COMM_WORLD, NULL, NULL);
			
 
				-		}
			
 
				+	if (rank == 0)
			
 
				+	{
			
 
				+		int *compare_ptr = &compare;
			
 
				 
			
 
				-		starpu_task_wait_for_all();
			
 
				+		starpu_task_insert(&cl_display, STARPU_VALUE, "node0 initial value", strlen("node0 initial value")+1, STARPU_R, handle, 0);
			
 
				+		starpu_mpi_isend_detached(handle, 1, 10, MPI_COMM_WORLD, NULL, NULL);
			
 
				+		starpu_mpi_irecv_detached(handle2, 1, 20, MPI_COMM_WORLD, NULL, NULL);
			
 
				 
			
 
				-		starpu_data_unregister(handle);
			
 
				-		starpu_data_unregister(handle2);
			
 
				+		starpu_task_insert(&cl_display, STARPU_VALUE, "node0 received value", strlen("node0 received value")+1, STARPU_R, handle2, 0);
			
 
				+		starpu_task_insert(&cl_compare, STARPU_R, handle, STARPU_R, handle2, STARPU_VALUE, &compare_ptr, sizeof(compare_ptr), 0);
			
 
				+	}
			
 
				+	else if (rank == 1)
			
 
				+	{
			
 
				+		starpu_mpi_irecv_detached(handle, 0, 10, MPI_COMM_WORLD, NULL, NULL);
			
 
				+		starpu_task_insert(&cl_display, STARPU_VALUE, "node1 received value", strlen("node1 received value")+1, STARPU_R, handle, 0);
			
 
				+		starpu_mpi_isend_detached(handle, 0, 20, MPI_COMM_WORLD, NULL, NULL);
			
 
				 	}
			
 
				 
			
 
				+	starpu_task_wait_for_all();
			
 
				+
			
 
				+	starpu_data_unregister(handle);
			
 
				+	starpu_data_unregister(handle2);
			
 
				+
			
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				-	if (rank == 0) return !compare; else return ret;
			
 
				+	return (rank == 0) ? !compare : 0;
			
 
				 }
			
--- a/nmad/examples/helper.h
+++ b/nmad/examples/helper.h
@@ -0,0 +1,27 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2011, 2012, 2013, 2015  CNRS
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <errno.h>
			
 
				+#include <starpu_mpi.h>
			
 
				+
			
 
				+#define STARPU_TEST_SKIPPED 77
			
 
				+/* Printing helpers, silent when STARPU_SSILENT is set. FPRINTF_MPI also prefixes the MPI_COMM_WORLD rank and __starpu_func__, then flushes. No trailing ';' in either macro, so both work in unbraced if/else. */
			
 
				+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
			
 
				+#define FPRINTF_MPI(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) { \
			
 
				+    						int _disp_rank; starpu_mpi_comm_rank(MPI_COMM_WORLD, &_disp_rank);       \
			
 
				+                                                fprintf(ofile, "[%d][starpu_mpi][%s] " fmt , _disp_rank, __starpu_func__ ,## __VA_ARGS__); \
			
 
				+                                                fflush(ofile); }} while(0)
			
 
				+
			
--- a/nmad/examples/matrix_decomposition/mpi_cholesky.c
+++ b/nmad/examples/matrix_decomposition/mpi_cholesky.c
@@ -1,8 +1,8 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009-2012  Université de Bordeaux
			
 
				+ * Copyright (C) 2009-2012, 2015  Université de Bordeaux
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -16,11 +16,8 @@
 
				  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				  */
			
 
				 
			
 
				-#include <starpu_mpi.h>
			
 
				-#include "mpi_cholesky_models.h"
			
 
				-#include "mpi_cholesky_codelets.h"
			
 
				-#include "mpi_decomposition_matrix.h"
			
 
				-#include "mpi_decomposition_params.h"
			
 
				+#include "mpi_cholesky.h"
			
 
				+#include "helper.h"
			
 
				 
			
 
				 int main(int argc, char **argv)
			
 
				 {
			
@@ -32,16 +29,30 @@ int main(int argc, char **argv)
 
				 	float ***bmat;
			
 
				 	int rank, nodes, ret;
			
 
				 	double timing, flops;
			
 
				+#ifndef STARPU_SIMGRID
			
 
				 	int correctness;
			
 
				+#endif
			
 
				+
			
 
				 	ret = starpu_init(NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				 	ret = starpu_mpi_init(&argc, &argv, 1);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	starpu_mpi_comm_size(MPI_COMM_WORLD, &nodes);
			
 
				 	starpu_cublas_init();
			
 
				 
			
 
				+	if (starpu_cpu_worker_get_count() + starpu_cuda_worker_get_count() == 0)
			
 
				+	{
			
 
				+		if (rank == 0)
			
 
				+		{
			
 
				+			FPRINTF(stderr, "We need at least 1 CPU or CUDA worker.\n");
			
 
				+		}
			
 
				+		starpu_mpi_shutdown();
			
 
				+		starpu_shutdown();
			
 
				+		return STARPU_TEST_SKIPPED;
			
 
				+	}
			
 
				+
			
 
				 	parse_args(argc, argv, nodes);
			
 
				 
			
 
				 	matrix_init(&bmat, rank, nodes, 1);
			
@@ -51,20 +62,24 @@ int main(int argc, char **argv)
 
				 
			
 
				 	starpu_mpi_shutdown();
			
 
				 
			
 
				+#ifndef STARPU_SIMGRID
			
 
				 	matrix_display(bmat, rank);
			
 
				 
			
 
				 	dw_cholesky_check_computation(bmat, rank, nodes, &correctness, &flops);
			
 
				+#endif
			
 
				 
			
 
				 	matrix_free(&bmat, rank, nodes, 1);
			
 
				 	starpu_cublas_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				+#ifndef STARPU_SIMGRID
			
 
				 	assert(correctness);
			
 
				+#endif
			
 
				 
			
 
				 	if (rank == 0)
			
 
				 	{
			
 
				-		fprintf(stdout, "Computation time (in ms): %2.2f\n", timing/1000);
			
 
				-		fprintf(stdout, "Synthetic GFlops : %2.2f\n", (flops/timing/1000.0f));
			
 
				+		FPRINTF(stdout, "Computation time (in ms): %2.2f\n", timing/1000);
			
 
				+		FPRINTF(stdout, "Synthetic GFlops : %2.2f\n", (flops/timing/1000.0f));
			
 
				 	}
			
 
				 
			
 
				 	return 0;
			
--- a/nmad/examples/matrix_decomposition/mpi_cholesky.h
+++ b/nmad/examples/matrix_decomposition/mpi_cholesky.h
@@ -0,0 +1,31 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2009, 2010  Université de Bordeaux
			
 
				+ * Copyright (C) 2010, 2011, 2013, 2015  CNRS
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __MPI_CHOLESKY_H__
			
 
				+#define __MPI_CHOLESKY_H__
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include <starpu_mpi.h>
			
 
				+#include "mpi_cholesky_codelets.h"
			
 
				+#include "mpi_cholesky_kernels.h"
			
 
				+#include "mpi_cholesky_models.h"
			
 
				+#include "mpi_decomposition_matrix.h"
			
 
				+#include "mpi_decomposition_params.h"
			
 
				+/* fprintf() wrapper that prints nothing when the STARPU_SSILENT environment variable is set */
			
 
				+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
			
 
				+
			
 
				+#endif // __MPI_CHOLESKY_H__
			
--- a/nmad/examples/matrix_decomposition/mpi_cholesky_codelets.c
+++ b/nmad/examples/matrix_decomposition/mpi_cholesky_codelets.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010, 2014  Université de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013, 2015  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2009, 2010, 2014-2015, 2017  Université de Bordeaux
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -15,14 +15,10 @@
 
				  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				  */
			
 
				 
			
 
				-#include <starpu_mpi.h>
			
 
				+#include "mpi_cholesky.h"
			
 
				 #include <common/blas.h>
			
 
				-#include "mpi_decomposition_params.h"
			
 
				-#include "mpi_decomposition_matrix.h"
			
 
				-#include "mpi_cholesky_models.h"
			
 
				-#include "mpi_cholesky_codelets.h"
			
 
				-#include "mpi_cholesky_kernels.h"
			
 
				 #include <sys/time.h>
			
 
				+#include <limits.h>
			
 
				 
			
 
				 /*
			
 
				  *	Create the codelets
			
@@ -33,6 +29,8 @@ static struct starpu_codelet cl11 =
 
				 	.cpu_funcs = {chol_cpu_codelet_update_u11},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {chol_cublas_codelet_update_u11},
			
 
				+#elif defined(STARPU_SIMGRID)
			
 
				+	.cuda_funcs = {(void*)1},
			
 
				 #endif
			
 
				 	.nbuffers = 1,
			
 
				 	.modes = {STARPU_RW},
			
@@ -44,6 +42,8 @@ static struct starpu_codelet cl21 =
 
				 	.cpu_funcs = {chol_cpu_codelet_update_u21},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {chol_cublas_codelet_update_u21},
			
 
				+#elif defined(STARPU_SIMGRID)
			
 
				+	.cuda_funcs = {(void*)1},
			
 
				 #endif
			
 
				 	.nbuffers = 2,
			
 
				 	.modes = {STARPU_R, STARPU_RW},
			
@@ -55,9 +55,11 @@ static struct starpu_codelet cl22 =
 
				 	.cpu_funcs = {chol_cpu_codelet_update_u22},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {chol_cublas_codelet_update_u22},
			
 
				+#elif defined(STARPU_SIMGRID)
			
 
				+	.cuda_funcs = {(void*)1},
			
 
				 #endif
			
 
				 	.nbuffers = 3,
			
 
				-	.modes = {STARPU_R, STARPU_R, STARPU_RW},
			
 
				+	.modes = {STARPU_R, STARPU_R, STARPU_RW | STARPU_COMMUTE},
			
 
				 	.model = &chol_model_22
			
 
				 };
			
 
				 
			
@@ -72,6 +74,8 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 
				 	starpu_data_handle_t **data_handles;
			
 
				 	unsigned x,y,i,j,k;
			
 
				 
			
 
				+	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
			
 
				+
			
 
				 	/* create all the DAG nodes */
			
 
				 
			
 
				 	data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
			
@@ -85,10 +89,12 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 
				 			if (mpi_rank == rank)
			
 
				 			{
			
 
				 				//fprintf(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
			
 
				-				starpu_matrix_data_register(&data_handles[x][y], 0, (uintptr_t)matA[x][y],
			
 
				+				starpu_matrix_data_register(&data_handles[x][y], STARPU_MAIN_RAM, (uintptr_t)matA[x][y],
			
 
				 						ld, size/nblocks, size/nblocks, sizeof(float));
			
 
				 			}
			
 
				+#ifdef STARPU_DEVEL
			
 
				 #warning TODO: make better test to only register what is needed
			
 
				+#endif
			
 
				 			else
			
 
				 			{
			
 
				 				/* I don't own that index, but will need it for my computations */
			
@@ -98,6 +104,7 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 
				 			}
			
 
				 			if (data_handles[x][y])
			
 
				 			{
			
 
				+				starpu_data_set_coordinates(data_handles[x][y], 2, x, y);
			
 
				 				starpu_mpi_data_register(data_handles[x][y], (y*nblocks)+x, mpi_rank);
			
 
				 			}
			
 
				 		}
			
@@ -108,43 +115,43 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 
				 
			
 
				 	for (k = 0; k < nblocks; k++)
			
 
				 	{
			
 
				-		int prio = STARPU_DEFAULT_PRIO;
			
 
				-		if (!noprio) prio = STARPU_MAX_PRIO;
			
 
				+		starpu_iteration_push(k);
			
 
				 
			
 
				-		starpu_mpi_insert_task(MPI_COMM_WORLD, &cl11,
			
 
				-				STARPU_PRIORITY, prio,
			
 
				-				STARPU_RW, data_handles[k][k],
			
 
				-				0);
			
 
				+		starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
			
 
				+				       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
			
 
				+				       STARPU_RW, data_handles[k][k],
			
 
				+				       0);
			
 
				 
			
 
				 		for (j = k+1; j<nblocks; j++)
			
 
				 		{
			
 
				-			prio = STARPU_DEFAULT_PRIO;
			
 
				-			if (!noprio&& (j == k+1)) prio = STARPU_MAX_PRIO;
			
 
				-			starpu_mpi_insert_task(MPI_COMM_WORLD, &cl21,
			
 
				-					STARPU_PRIORITY, prio,
			
 
				-					STARPU_R, data_handles[k][k],
			
 
				-					STARPU_RW, data_handles[k][j],
			
 
				-					0);
			
 
				+			starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
			
 
				+					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - j) : (j == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
			
 
				+					       STARPU_R, data_handles[k][k],
			
 
				+					       STARPU_RW, data_handles[k][j],
			
 
				+					       0);
			
 
				 
			
 
				 			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][k]);
			
 
				+			if (my_distrib(k, k, nodes) == rank)
			
 
				+				starpu_data_wont_use(data_handles[k][k]);
			
 
				 
			
 
				 			for (i = k+1; i<nblocks; i++)
			
 
				 			{
			
 
				 				if (i <= j)
			
 
				 				{
			
 
				-					prio = STARPU_DEFAULT_PRIO;
			
 
				-					if (!noprio && (i == k + 1) && (j == k +1) ) prio = STARPU_MAX_PRIO;
			
 
				-					starpu_mpi_insert_task(MPI_COMM_WORLD, &cl22,
			
 
				-							STARPU_PRIORITY, prio,
			
 
				-							STARPU_R, data_handles[k][i],
			
 
				-							STARPU_R, data_handles[k][j],
			
 
				-							STARPU_RW, data_handles[i][j],
			
 
				-							0);
			
 
				+					starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
			
 
				+							       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - j - i) : ((i == k+1) && (j == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
			
 
				+							       STARPU_R, data_handles[k][i],
			
 
				+							       STARPU_R, data_handles[k][j],
			
 
				+							       STARPU_RW | STARPU_COMMUTE, data_handles[i][j],
			
 
				+							       0);
			
 
				 				}
			
 
				 			}
			
 
				 
			
 
				 			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][j]);
			
 
				+			if (my_distrib(k, j, nodes) == rank)
			
 
				+				starpu_data_wont_use(data_handles[k][j]);
			
 
				 		}
			
 
				+		starpu_iteration_pop();
			
 
				 	}
			
 
				 
			
 
				 	starpu_task_wait_for_all();
			
@@ -189,7 +196,7 @@ void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *corr
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-	fprintf(stderr, "[%d] compute explicit LLt ...\n", rank);
			
 
				+	FPRINTF(stderr, "[%d] compute explicit LLt ...\n", rank);
			
 
				 	for (j = 0; j < size; j++)
			
 
				 	{
			
 
				 		for (i = 0; i < size; i++)
			
@@ -206,7 +213,7 @@ void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *corr
 
				 	STARPU_SSYRK("L", "N", size, size, 1.0f,
			
 
				 			rmat, size, 0.0f, test_mat, size);
			
 
				 
			
 
				-	fprintf(stderr, "[%d] comparing results ...\n", rank);
			
 
				+	FPRINTF(stderr, "[%d] comparing results ...\n", rank);
			
 
				 	if (display)
			
 
				 	{
			
 
				 		for (j = 0; j < size; j++)
			
@@ -244,7 +251,7 @@ void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *corr
 
				 							float err = abs(test_mat[j +i*size] - orig);
			
 
				 							if (err > 0.00001)
			
 
				 							{
			
 
				-								fprintf(stderr, "[%d] Error[%u, %u] --> %2.2f != %2.2f (err %2.2f)\n", rank, i, j, test_mat[j +i*size], orig, err);
			
 
				+								FPRINTF(stderr, "[%d] Error[%u, %u] --> %2.2f != %2.2f (err %2.2f)\n", rank, i, j, test_mat[j +i*size], orig, err);
			
 
				 								*correctness = 0;
			
 
				 								*flops = 0;
			
 
				 								break;
			
--- a/nmad/examples/matrix_decomposition/mpi_cholesky_codelets.h
+++ b/nmad/examples/matrix_decomposition/mpi_cholesky_codelets.h
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010  Université de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
--- a/nmad/examples/matrix_decomposition/mpi_cholesky_distributed.c
+++ b/nmad/examples/matrix_decomposition/mpi_cholesky_distributed.c
@@ -2,7 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2009-2011  Université de Bordeaux
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2015  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -16,11 +16,7 @@
 
				  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				  */
			
 
				 
			
 
				-#include <starpu_mpi.h>
			
 
				-#include "mpi_cholesky_models.h"
			
 
				-#include "mpi_cholesky_codelets.h"
			
 
				-#include "mpi_decomposition_matrix.h"
			
 
				-#include "mpi_decomposition_params.h"
			
 
				+#include "mpi_cholesky.h"
			
 
				 
			
 
				 int main(int argc, char **argv)
			
 
				 {
			
@@ -38,8 +34,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 	ret = starpu_mpi_init(&argc, &argv, 1);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	starpu_mpi_comm_size(MPI_COMM_WORLD, &nodes);
			
 
				 	starpu_cublas_init();
			
 
				 
			
 
				 	parse_args(argc, argv, nodes);
			
@@ -56,8 +52,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 	if (rank == 0)
			
 
				 	{
			
 
				-		fprintf(stdout, "Computation time (in ms): %2.2f\n", timing/1000);
			
 
				-		fprintf(stdout, "Synthetic GFlops : %2.2f\n", (flops/timing/1000.0f));
			
 
				+		FPRINTF(stdout, "Computation time (in ms): %2.2f\n", timing/1000);
			
 
				+		FPRINTF(stdout, "Synthetic GFlops : %2.2f\n", (flops/timing/1000.0f));
			
 
				 	}
			
 
				 
			
 
				 	return 0;
			
--- a/nmad/examples/matrix_decomposition/mpi_cholesky_kernels.c
+++ b/nmad/examples/matrix_decomposition/mpi_cholesky_kernels.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010, 2012-2014  Université de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2015  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -15,9 +15,8 @@
 
				  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				  */
			
 
				 
			
 
				-#include <starpu.h>
			
 
				+#include "mpi_cholesky.h"
			
 
				 #include <math.h>
			
 
				-#include "mpi_decomposition_params.h"
			
 
				 #include "common/blas.h"
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 #include <cuda.h>
			
--- a/nmad/examples/matrix_decomposition/mpi_cholesky_kernels.h
+++ b/nmad/examples/matrix_decomposition/mpi_cholesky_kernels.h
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010  Université de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
--- a/nmad/examples/matrix_decomposition/mpi_cholesky_models.c
+++ b/nmad/examples/matrix_decomposition/mpi_cholesky_models.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010  Université de Bordeaux
			
 
				- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2015  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -15,7 +15,7 @@
 
				  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				  */
			
 
				 
			
 
				-#include "mpi_cholesky_models.h"
			
 
				+#include "mpi_cholesky.h"
			
 
				 
			
 
				 /*
			
 
				  *	Number of flops of Gemm
			
--- a/nmad/examples/matrix_decomposition/mpi_cholesky_models.h
+++ b/nmad/examples/matrix_decomposition/mpi_cholesky_models.h
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010  Université de Bordeaux
			
 
				- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2013, 2015  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -18,8 +18,6 @@
 
				 #ifndef __DW_CHOLESKY_MODELS_H__
			
 
				 #define __DW_CHOLESKY_MODELS_H__
			
 
				 
			
 
				-#include <starpu.h>
			
 
				-
			
 
				 extern struct starpu_perfmodel chol_model_11;
			
 
				 extern struct starpu_perfmodel chol_model_21;
			
 
				 extern struct starpu_perfmodel chol_model_22;
			
--- a/nmad/examples/matrix_decomposition/mpi_decomposition_matrix.c
+++ b/nmad/examples/matrix_decomposition/mpi_decomposition_matrix.c
@@ -2,7 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2009-2012, 2015  Université de Bordeaux
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -16,10 +16,7 @@
 
				  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				  */
			
 
				 
			
 
				-#include <starpu.h>
			
 
				-#include "mpi_decomposition_matrix.h"
			
 
				-#include "mpi_decomposition_params.h"
			
 
				-#include "mpi_cholesky_codelets.h"
			
 
				+#include "mpi_cholesky.h"
			
 
				 
			
 
				 /* Returns the MPI node number where data indexes index is */
			
 
				 int my_distrib(int x, int y, int nb_nodes)
			
@@ -31,19 +28,21 @@ int my_distrib(int x, int y, int nb_nodes)
 
				 
			
 
				 void matrix_display(float ***bmat, int rank)
			
 
				 {
			
 
				-	unsigned i,j,x,y;
			
 
				-
			
 
				 	if (display)
			
 
				 	{
			
 
				+		unsigned y;
			
 
				 		printf("[%d] Input :\n", rank);
			
 
				 
			
 
				 		for(y=0 ; y<nblocks ; y++)
			
 
				 		{
			
 
				+			unsigned x;
			
 
				 			for(x=0 ; x<nblocks ; x++)
			
 
				 			{
			
 
				+				unsigned j;
			
 
				 				printf("Block %u,%u :\n", x, y);
			
 
				 				for (j = 0; j < BLOCKSIZE; j++)
			
 
				 				{
			
 
				+					unsigned i;
			
 
				 					for (i = 0; i < BLOCKSIZE; i++)
			
 
				 					{
			
 
				 						if (i <= j)
			
@@ -80,8 +79,10 @@ void matrix_init(float ****bmat, int rank, int nodes, int alloc_everywhere)
 
				 				{
			
 
				 					for (j = 0; j < BLOCKSIZE; j++)
			
 
				 					{
			
 
				+#ifndef STARPU_SIMGRID
			
 
				 						(*bmat)[x][y][j +i*BLOCKSIZE] = (1.0f/(1.0f+(i+(x*BLOCKSIZE)+j+(y*BLOCKSIZE)))) + ((i+(x*BLOCKSIZE) == j+(y*BLOCKSIZE))?1.0f*size:0.0f);
			
 
				 						//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
			
 
				+#endif
			
 
				 					}
			
 
				 				}
			
 
				 			}
			
--- a/nmad/examples/matrix_decomposition/mpi_decomposition_matrix.h
+++ b/nmad/examples/matrix_decomposition/mpi_decomposition_matrix.h
@@ -2,7 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2009-2012  Université de Bordeaux
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
--- a/nmad/examples/matrix_decomposition/mpi_decomposition_params.c
+++ b/nmad/examples/matrix_decomposition/mpi_decomposition_params.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010, 2015  Université de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2009, 2010, 2015-2017  Université de Bordeaux
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -15,14 +15,25 @@
 
				  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				  */
			
 
				 
			
 
				+#include "mpi_cholesky.h"
			
 
				 #include <string.h>
			
 
				 #include <stdlib.h>
			
 
				 #include <stdio.h>
			
 
				 #include <math.h>
			
 
				 
			
 
				-unsigned size = 4*960;
			
 
				+#ifdef STARPU_QUICK_CHECK
			
 
				+unsigned size = 4*64;
			
 
				+unsigned nblocks = 2;
			
 
				+unsigned nbigblocks = 2;
			
 
				+#elif !defined(STARPU_LONG_CHECK)
			
 
				+unsigned size = 4*320;
			
 
				+unsigned nblocks = 4;
			
 
				+unsigned nbigblocks = 2;
			
 
				+#else
			
 
				+unsigned size = 16*320;
			
 
				 unsigned nblocks = 16;
			
 
				 unsigned nbigblocks = 2;
			
 
				+#endif
			
 
				 unsigned noprio = 0;
			
 
				 unsigned display = 0;
			
 
				 int dblockx = -1;
			
@@ -73,13 +84,14 @@ void parse_args(int argc, char **argv, int nodes)
 
				                         display = 1;
			
 
				                 }
			
 
				 
			
 
				-                if (strcmp(argv[i], "-h") == 0)
			
 
				+                if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
			
 
				                 {
			
 
				                         printf("usage : %s [-display] [-size size] [-nblocks nblocks]\n", argv[0]);
			
 
				                 }
			
 
				         }
			
 
				 
			
 
				-        if (nblocks > size) nblocks = size;
			
 
				+        if (nblocks > size)
			
 
				+		nblocks = size;
			
 
				 
			
 
				 	if (dblockx == -1 || dblocky == -1)
			
 
				 	{
			
@@ -96,5 +108,6 @@ void parse_args(int argc, char **argv, int nodes)
 
				 			}
			
 
				 		}
			
 
				 	}
			
 
				+	FPRINTF(stdout, "size: %u - nblocks: %u - dblocksx: %d - dblocksy: %d\n", size, nblocks, dblockx, dblocky);
			
 
				 }
			
 
				 
			
--- a/nmad/examples/matrix_decomposition/mpi_decomposition_params.h
+++ b/nmad/examples/matrix_decomposition/mpi_decomposition_params.h
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010  Université de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2015  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -25,8 +25,8 @@ extern unsigned nblocks;
 
				 extern unsigned nbigblocks;
			
 
				 extern unsigned noprio;
			
 
				 extern unsigned display;
			
 
				-extern unsigned dblockx;
			
 
				-extern unsigned dblocky;
			
 
				+extern int dblockx;
			
 
				+extern int dblocky;
			
 
				 
			
 
				 void parse_args(int argc, char **argv, int nodes);
			
 
				 
			
--- a/nmad/examples/matrix_mult/Makefile
+++ b/nmad/examples/matrix_mult/Makefile
@@ -0,0 +1,30 @@
 
				+# StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+#
			
 
				+# Copyright (C) 2016  Inria
			
 
				+#
			
 
				+# StarPU is free software; you can redistribute it and/or modify
			
 
				+# it under the terms of the GNU Lesser General Public License as published by
			
 
				+# the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+# your option) any later version.
			
 
				+#
			
 
				+# StarPU is distributed in the hope that it will be useful, but
			
 
				+# WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+#
			
 
				+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+
			
 
				+
			
 
				+# This makefile gives an example of how to build the testcase outside StarPU
			
 
				+
			
 
				+PRG	= mm
			
 
				+# Toolchain settings; libraries go in LDLIBS so the built-in link rule places them after mm.c
			
 
				+CC	= mpicc
			
 
				+CFLAGS	= $(shell pkg-config --cflags starpumpi-1.3) -g -Wall
			
 
				+LDLIBS	= $(shell pkg-config --libs starpumpi-1.3) -lm
			
 
				+# 'all' and 'clean' are commands, not files (the special target name is case-sensitive)
			
 
				+.PHONY: all clean
			
 
				+
			
 
				+all: $(PRG)
			
 
				+
			
 
				+clean:
			
 
				+	rm -f $(PRG) *.o starpu*.log
			
--- a/nmad/examples/matrix_mult/environment
+++ b/nmad/examples/matrix_mult/environment
@@ -0,0 +1,25 @@
 
				+# StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+#
			
 
				+# Copyright (C) 2016  Inria
			
 
				+#
			
 
				+# StarPU is free software; you can redistribute it and/or modify
			
 
				+# it under the terms of the GNU Lesser General Public License as published by
			
 
				+# the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+# your option) any later version.
			
 
				+#
			
 
				+# StarPU is distributed in the hope that it will be useful, but
			
 
				+# WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+#
			
 
				+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+
			
 
				+
			
 
				+# This script gives an example of how to set environment variables to build and run the testcase outside StarPU
			
 
				+
			
 
				+STARPU_INSTALL_DIR=/usr # set this to StarPU's installation directory
			
 
				+
			
 
				+PATH=$STARPU_INSTALL_DIR/bin:$PATH
			
 
				+PKG_CONFIG_PATH=$STARPU_INSTALL_DIR/lib/pkgconfig:$PKG_CONFIG_PATH
			
 
				+LD_LIBRARY_PATH=$STARPU_INSTALL_DIR/lib:$LD_LIBRARY_PATH
			
 
				+
			
 
				+export PATH PKG_CONFIG_PATH LD_LIBRARY_PATH
			
--- a/nmad/examples/matrix_mult/mm.c
+++ b/nmad/examples/matrix_mult/mm.c
@@ -0,0 +1,390 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2016  Inria
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+/*
			
 
				+ * This example illustrates how to distribute a pre-existing data structure to
			
 
				+ * a set of computing nodes using StarPU-MPI routines.
			
 
				+ */
			
 
				+
			
 
				+#include <stdlib.h>
			
 
				+#include <stdio.h>
			
 
				+#include <assert.h>
			
 
				+#include <math.h>
			
 
				+#include <starpu.h>
			
 
				+#include <starpu_mpi.h>
			
 
				+#include "helper.h"
			
 
				+
			
 
				+#define VERBOSE 0
			
 
				+
			
 
				+static int N  = 16; /* Matrix size */
			
 
				+static int BS =  4; /* Block size */
			
 
				+
			
 
				+#define NB ((N)/(BS)) /* Number of blocks */
			
 
				+
			
 
				+/* Matrices. Will be allocated as regular, linearized C arrays */
			
 
				+static double *A = NULL; /* A will be partitioned as BS rows x N  cols blocks */
			
 
				+static double *B = NULL; /* B will be partitioned as N  rows x BS cols blocks */
			
 
				+static double *C = NULL; /* C will be partitioned as BS rows x BS cols blocks */
			
 
				+
			
 
				+/* Arrays of data handles for managing matrix blocks */
			
 
				+static starpu_data_handle_t *A_h;
			
 
				+static starpu_data_handle_t *B_h;
			
 
				+static starpu_data_handle_t *C_h;
			
 
				+
			
 
				+static int comm_rank; /* mpi rank of the process */
			
 
				+static int comm_size; /* size of the mpi session */
			
 
				+
			
 
				+static void alloc_matrices(void)
			
 
				+{
			
 
				+	/* Allocate A, B and C (N x N doubles each). Regular 'malloc' can also be used instead, however,
			
 
				+	 * starpu_malloc makes sure that the area is allocated in suitably pinned memory to improve data
			
 
				+	 * transfers, especially with CUDA. Sizes are computed in size_t so that N*N cannot overflow int. */
			
 
				+	starpu_malloc((void **)&A, (size_t)N*N*sizeof(double));
			
 
				+	starpu_malloc((void **)&B, (size_t)N*N*sizeof(double));
			
 
				+	starpu_malloc((void **)&C, (size_t)N*N*sizeof(double));
			
 
				+}
			
 
				+
			
 
				+static void free_matrices(void)
			
 
				+{
			
 
				+	starpu_free(A);
			
 
				+	starpu_free(B);
			
 
				+	starpu_free(C);
			
 
				+}
			
 
				+
			
 
				+static void init_matrices(void)
			
 
				+{
			
 
				+	int row,col;
			
 
				+	for (row = 0; row < N; row++)
			
 
				+	{
			
 
				+		for (col = 0; col < N; col++)
			
 
				+		{
			
 
				+			A[row*N+col] = (row==col)?2:0;
			
 
				+			B[row*N+col] = row*N+col;
			
 
				+			C[row*N+col] = 0;
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+#if VERBOSE
			
 
				+static void disp_matrix(double *m)
			
 
				+{
			
 
				+	int row,col;
			
 
				+	for (row = 0; row < N; row++)
			
 
				+	{
			
 
				+		for (col = 0; col < N; col++)
			
 
				+		{
			
 
				+			printf("\t%.2lf", m[row*N+col]);
			
 
				+		}
			
 
				+		printf("\n");
			
 
				+	}
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+static void check_result(void)
			
 
				+{
			
 
				+	int row,col;
			
 
				+	for (row = 0; row < N; row++)
			
 
				+	{
			
 
				+		for (col = 0; col < N; col++)
			
 
				+		{
			
 
				+			if (fabs(C[row*N+col] - 2*(row*N+col)) > 1.0)
			
 
				+			{
			
 
				+				fprintf(stderr, "check failed\n");
			
 
				+				exit(1);
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+#if VERBOSE
			
 
				+	printf("success\n");
			
 
				+#endif
			
 
				+}
			
 
				+
			
 
				+
			
 
				+/* Register the matrix blocks to StarPU and to StarPU-MPI */
			
 
				+static void register_matrices()
			
 
				+{
			
 
				+	A_h = calloc(NB, sizeof(starpu_data_handle_t));
			
 
				+	B_h = calloc(NB, sizeof(starpu_data_handle_t));
			
 
				+	C_h = calloc(NB*NB, sizeof(starpu_data_handle_t));
			
 
				+
			
 
				+	/* Memory region, where the data being registered resides.
			
 
				+	 * In this example, all blocks are allocated by node 0, thus
			
 
				+	 * - node 0 specifies STARPU_MAIN_RAM to indicate that it owns the block in its main memory
			
 
				+	 * - nodes !0 specify -1 to indicate that they don't have a copy of the block initially
			
 
				+	 */
			
 
				+	int mr = (comm_rank == 0) ? STARPU_MAIN_RAM : -1;
			
 
				+
			
 
				+	/* mpi tag used for the block */
			
 
				+	int tag = 0;
			
 
				+
			
 
				+	int b_row,b_col;
			
 
				+
			
 
				+	for (b_row = 0; b_row < NB; b_row++)
			
 
				+	{
			
 
				+		/* Register a block to StarPU */
			
 
				+		starpu_matrix_data_register(&A_h[b_row],
			
 
				+				mr,
			
 
				+				(comm_rank == 0)?(uintptr_t)(A+b_row*BS*N):0, N, N, BS,
			
 
				+				sizeof(double));
			
 
				+
			
 
				+		/* Register a block to StarPU-MPI, specifying the mpi tag to use for transferring the block
			
 
				+		 * and the rank of the owner node.
			
 
				+		 *
			
 
				+		 * Note: StarPU-MPI is an autonomous layer built on top of StarPU, hence the two separate
			
 
				+		 * registration steps.
			
 
				+		 */
			
 
				+		starpu_data_set_coordinates(A_h[b_row], 2, 0, b_row);
			
 
				+		starpu_mpi_data_register(A_h[b_row], tag++, 0);
			
 
				+	}
			
 
				+
			
 
				+	for (b_col = 0; b_col < NB; b_col++)
			
 
				+	{
			
 
				+		starpu_matrix_data_register(&B_h[b_col],
			
 
				+				mr,
			
 
				+				(comm_rank == 0)?(uintptr_t)(B+b_col*BS):0, N, BS, N,
			
 
				+				sizeof(double));
			
 
				+		starpu_data_set_coordinates(B_h[b_col], 2, b_col, 0);
			
 
				+		starpu_mpi_data_register(B_h[b_col], tag++, 0);
			
 
				+	}
			
 
				+
			
 
				+	for (b_row = 0; b_row < NB; b_row++)
			
 
				+	{
			
 
				+		for (b_col = 0; b_col < NB; b_col++)
			
 
				+		{
			
 
				+			starpu_matrix_data_register(&C_h[b_row*NB+b_col],
			
 
				+					mr,
			
 
				+					(comm_rank == 0)?(uintptr_t)(C+b_row*BS*N+b_col*BS):0, N, BS, BS,
			
 
				+					sizeof(double));
			
 
				+			starpu_data_set_coordinates(C_h[b_row*NB+b_col], 2, b_col, b_row);
			
 
				+			starpu_mpi_data_register(C_h[b_row*NB+b_col], tag++, 0);
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+/* Transfer ownership of the C matrix blocks following some user-defined distribution over the nodes.
			
 
				+ * Note: since C will be Write-accessed, it will implicitly define which node performs the task
			
 
				+ * associated to a given block. */
			
 
				+static void distribute_matrix_C(void)
			
 
				+{
			
 
				+	int b_row,b_col;
			
 
				+	for (b_row = 0; b_row < NB; b_row++)
			
 
				+	{
			
 
				+		for (b_col = 0; b_col < NB; b_col++)
			
 
				+		{
			
 
				+			starpu_data_handle_t h = C_h[b_row*NB+b_col]; 
			
 
				+
			
 
				+			/* Select the node where the block should be computed. */
			
 
				+			int target_rank = (b_row+b_col)%comm_size;
			
 
				+
			
 
				+			/* Move the block on to its new owner. */
			
 
				+			starpu_mpi_data_migrate(MPI_COMM_WORLD, h, target_rank);
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+/* Transfer ownership of the C matrix blocks back to node 0, for display purpose. This is not mandatory. */
			
 
				+static void undistribute_matrix_C(void)
			
 
				+{
			
 
				+	int b_row,b_col;
			
 
				+	for (b_row = 0; b_row < NB; b_row++)
			
 
				+	{
			
 
				+		for (b_col = 0; b_col < NB; b_col++)
			
 
				+		{
			
 
				+			starpu_data_handle_t h = C_h[b_row*NB+b_col]; 
			
 
				+			starpu_mpi_data_migrate(MPI_COMM_WORLD, h, 0);
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+/* Unregister matrices from the StarPU management. */
			
 
				+static void unregister_matrices()
			
 
				+{
			
 
				+	int b_row,b_col;
			
 
				+
			
 
				+	for (b_row = 0; b_row < NB; b_row++)
			
 
				+	{
			
 
				+		starpu_data_unregister(A_h[b_row]);
			
 
				+	}
			
 
				+
			
 
				+	for (b_col = 0; b_col < NB; b_col++)
			
 
				+	{
			
 
				+		starpu_data_unregister(B_h[b_col]);
			
 
				+	}
			
 
				+
			
 
				+	for (b_row = 0; b_row < NB; b_row++)
			
 
				+	{
			
 
				+		for (b_col = 0; b_col < NB; b_col++)
			
 
				+		{
			
 
				+			starpu_data_unregister(C_h[b_row*NB+b_col]);
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	free(A_h);
			
 
				+	free(B_h);
			
 
				+	free(C_h);
			
 
				+}
			
 
				+
			
 
				+/* Perform the actual computation. In a real-life case, this would rather call a BLAS 'gemm' routine
			
 
				+ * instead. */
			
 
				+static void cpu_mult(void *handles[], STARPU_ATTRIBUTE_UNUSED void *arg)
			
 
				+{
			
 
				+	double *block_A = (double *)STARPU_MATRIX_GET_PTR(handles[0]);
			
 
				+	double *block_B = (double *)STARPU_MATRIX_GET_PTR(handles[1]);
			
 
				+	double *block_C = (double *)STARPU_MATRIX_GET_PTR(handles[2]);
			
 
				+
			
 
				+	unsigned n_col_A = STARPU_MATRIX_GET_NX(handles[0]);
			
 
				+	unsigned n_col_B = STARPU_MATRIX_GET_NX(handles[1]);
			
 
				+	unsigned n_col_C = STARPU_MATRIX_GET_NX(handles[2]);
			
 
				+
			
 
				+	unsigned n_row_A = STARPU_MATRIX_GET_NY(handles[0]);
			
 
				+	unsigned n_row_B = STARPU_MATRIX_GET_NY(handles[1]);
			
 
				+	unsigned n_row_C = STARPU_MATRIX_GET_NY(handles[2]);
			
 
				+
			
 
				+	unsigned ld_A = STARPU_MATRIX_GET_LD(handles[0]);
			
 
				+	unsigned ld_B = STARPU_MATRIX_GET_LD(handles[1]);
			
 
				+	unsigned ld_C = STARPU_MATRIX_GET_LD(handles[2]);
			
 
				+
			
 
				+	/* Sanity check, not needed in real life case */
			
 
				+	assert(n_col_C == n_col_B);
			
 
				+	assert(n_row_C == n_row_A);
			
 
				+	assert(n_col_A == n_row_B);
			
 
				+
			
 
				+	unsigned i,j,k;
			
 
				+	for (k = 0; k < n_row_C; k++)
			
 
				+	{
			
 
				+		for (j = 0; j < n_col_C; j++)
			
 
				+		{
			
 
				+			for (i = 0; i < n_col_A; i++)
			
 
				+			{
			
 
				+				block_C[k*ld_C+j] += block_A[k*ld_A+i] * block_B[i*ld_B+j]; 
			
 
				+			}
			
 
				+
			
 
				+#if VERBOSE
			
 
				+			/* For illustration purpose, shows which node computed
			
 
				+			 * the block in the decimal part of the cell */
			
 
				+			block_C[k*ld_C+j] += comm_rank / 100.0;
			
 
				+#endif
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+/* Define a StarPU 'codelet' structure for the matrix multiply kernel above.
			
 
				+ * This structure enables specifying multiple implementations for the kernel (such as CUDA or OpenCL versions)
			
 
				+ */
			
 
				+static struct starpu_codelet gemm_cl =
			
 
				+{
			
 
				+	.cpu_funcs = {cpu_mult}, /* cpu implementation(s) of the routine */
			
 
				+	.nbuffers = 3, /* number of data handles referenced by this routine */
			
 
				+	.modes = {STARPU_R, STARPU_R, STARPU_RW} /* access modes for each data handle */
			
 
				+};
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	/* Initializes the StarPU core */
			
 
				+	int ret = starpu_init(NULL);
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				+
			
 
				+	/* Initializes the StarPU-MPI layer */
			
 
				+	ret = starpu_mpi_init(&argc, &argv, 1);
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				+
			
 
				+	if (starpu_cpu_worker_get_count() == 0)
			
 
				+	{
			
 
				+		FPRINTF(stderr, "We need at least 1 CPU worker.\n");
			
 
				+		starpu_mpi_shutdown();
			
 
				+		starpu_shutdown();
			
 
				+		return STARPU_TEST_SKIPPED;
			
 
				+	}
			
 
				+
			
 
				+	/* Parse the matrix size and block size optional args */
			
 
				+	if (argc > 1)
			
 
				+	{
			
 
				+		N = atoi(argv[1]);
			
 
				+		if (N < 1)
			
 
				+		{
			
 
				+			fprintf(stderr, "invalid matrix size\n");
			
 
				+			exit(1);
			
 
				+		}
			
 
				+		if (argc > 2)
			
 
				+		{
			
 
				+			BS = atoi(argv[2]);
			
 
				+		}
			
 
				+		if (BS < 1 || N % BS != 0)
			
 
				+		{
			
 
				+			fprintf(stderr, "invalid block size\n");
			
 
				+			exit(1);
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	/* Get the process rank and session size */
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &comm_rank);
			
 
				+	starpu_mpi_comm_size(MPI_COMM_WORLD, &comm_size);
			
 
				+
			
 
				+	if (comm_rank == 0)
			
 
				+	{
			
 
				+#if VERBOSE
			
 
				+		printf("N = %d\n", N);
			
 
				+		printf("BS = %d\n", BS);
			
 
				+		printf("NB = %d\n", NB);
			
 
				+		printf("comm_size = %d\n", comm_size);
			
 
				+#endif
			
 
				+		/* In this example, node rank 0 performs all the memory allocations and initializations,
			
 
				+		 * and the blocks are later distributed on the other nodes.
			
 
				+		 * This is not mandatory however, and blocks could be allocated on other nodes right
			
 
				+		 * from the beginning, depending on the application needs (in particular for the case
			
 
				+		 * where the session-wide data footprint is larger than a single node's available memory). */
			
 
				+		alloc_matrices();
			
 
				+		init_matrices();
			
 
				+	}
			
 
				+
			
 
				+	/* Register matrices to StarPU and StarPU-MPI */
			
 
				+	register_matrices();
			
 
				+	/* Distribute C blocks */
			
 
				+	distribute_matrix_C();
			
 
				+
			
 
				+	int b_row,b_col;
			
 
				+
			
 
				+	for (b_row = 0; b_row < NB; b_row++)
			
 
				+	{
			
 
				+		for (b_col = 0; b_col < NB; b_col++)
			
 
				+		{
			
 
				+			starpu_mpi_task_insert(MPI_COMM_WORLD, &gemm_cl,
			
 
				+					STARPU_R,  A_h[b_row],
			
 
				+					STARPU_R,  B_h[b_col],
			
 
				+					STARPU_RW, C_h[b_row*NB+b_col],
			
 
				+					0);
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	starpu_task_wait_for_all();
			
 
				+
			
 
				+	undistribute_matrix_C();
			
 
				+	unregister_matrices();
			
 
				+
			
 
				+	if (comm_rank == 0)
			
 
				+	{
			
 
				+#if VERBOSE
			
 
				+		disp_matrix(C);
			
 
				+#endif
			
 
				+		check_result();
			
 
				+		free_matrices();
			
 
				+	}
			
 
				+
			
 
				+	starpu_mpi_shutdown();
			
 
				+	starpu_shutdown();
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
--- a/nmad/examples/mpi_lu/mpi_lu-double.h
+++ b/nmad/examples/mpi_lu/mpi_lu-double.h
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010  Université de Bordeaux
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
--- a/nmad/examples/mpi_lu/mpi_lu-float.h
+++ b/nmad/examples/mpi_lu/mpi_lu-float.h
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010  Université de Bordeaux
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
--- a/nmad/examples/mpi_lu/pdlu.c
+++ b/nmad/examples/mpi_lu/pdlu.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010  Université de Bordeaux
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
--- a/nmad/examples/mpi_lu/pdlu_implicit.c
+++ b/nmad/examples/mpi_lu/pdlu_implicit.c
@@ -0,0 +1,19 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010, 2013  Université de Bordeaux
			
 
				+ * Copyright (C) 2010  CNRS
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "mpi_lu-double.h"
			
 
				+#include "pxlu_implicit.c"
			
--- a/nmad/examples/mpi_lu/pdlu_kernels.c
+++ b/nmad/examples/mpi_lu/pdlu_kernels.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010  Université de Bordeaux
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
--- a/nmad/examples/mpi_lu/plu_example.c
+++ b/nmad/examples/mpi_lu/plu_example.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2011, 2013  Université de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010-2011, 2013, 2015, 2017  Université de Bordeaux
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2016, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -67,31 +67,35 @@ static starpu_data_handle_t *(tmp_21_block_handles[2]);
 
				 static TYPE **(tmp_21_block[2]);
			
 
				 #endif
			
 
				 
			
 
				-int get_block_rank(unsigned i, unsigned j);
			
 
				-
			
 
				 static void parse_args(int rank, int argc, char **argv)
			
 
				 {
			
 
				 	int i;
			
 
				-	for (i = 1; i < argc; i++) {
			
 
				-		if (strcmp(argv[i], "-size") == 0) {
			
 
				+	for (i = 1; i < argc; i++)
			
 
				+	{
			
 
				+		if (strcmp(argv[i], "-size") == 0)
			
 
				+		{
			
 
				 			char *argptr;
			
 
				 			size = strtol(argv[++i], &argptr, 10);
			
 
				 		}
			
 
				 
			
 
				-		if (strcmp(argv[i], "-nblocks") == 0) {
			
 
				+		if (strcmp(argv[i], "-nblocks") == 0)
			
 
				+		{
			
 
				 			char *argptr;
			
 
				 			nblocks = strtol(argv[++i], &argptr, 10);
			
 
				 		}
			
 
				 
			
 
				-		if (strcmp(argv[i], "-check") == 0) {
			
 
				+		if (strcmp(argv[i], "-check") == 0)
			
 
				+		{
			
 
				 			check = 1;
			
 
				 		}
			
 
				 
			
 
				-		if (strcmp(argv[i], "-display") == 0) {
			
 
				+		if (strcmp(argv[i], "-display") == 0)
			
 
				+		{
			
 
				 			display = 1;
			
 
				 		}
			
 
				 
			
 
				-		if (strcmp(argv[i], "-numa") == 0) {
			
 
				+		if (strcmp(argv[i], "-numa") == 0)
			
 
				+		{
			
 
				 #ifdef STARPU_HAVE_LIBNUMA
			
 
				 			numa = 1;
			
 
				 #else
			
@@ -100,17 +104,20 @@ static void parse_args(int rank, int argc, char **argv)
 
				 #endif
			
 
				 		}
			
 
				 
			
 
				-		if (strcmp(argv[i], "-p") == 0) {
			
 
				+		if (strcmp(argv[i], "-p") == 0)
			
 
				+		{
			
 
				 			char *argptr;
			
 
				 			p = strtol(argv[++i], &argptr, 10);
			
 
				 		}
			
 
				 
			
 
				-		if (strcmp(argv[i], "-q") == 0) {
			
 
				+		if (strcmp(argv[i], "-q") == 0)
			
 
				+		{
			
 
				 			char *argptr;
			
 
				 			q = strtol(argv[++i], &argptr, 10);
			
 
				 		}
			
 
				 
			
 
				-		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0) {
			
 
				+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0)
			
 
				+		{
			
 
				 			fprintf(stderr,"usage: %s [-size n] [-nblocks b] [-check] [-display] [-numa] [-p p] [-q q]\n", argv[0]);
			
 
				 			fprintf(stderr,"\np * q must be equal to the number of MPI nodes\n");
			
 
				 			exit(0);
			
@@ -248,11 +255,13 @@ static void init_matrix(int rank)
 
				 				}
			
 
				 
			
 
				 				/* Register it to StarPU */
			
 
				-				starpu_matrix_data_register(handleptr, 0,
			
 
				+				starpu_matrix_data_register(handleptr, STARPU_MAIN_RAM,
			
 
				 					(uintptr_t)*blockptr, size/nblocks,
			
 
				 					size/nblocks, size/nblocks, sizeof(TYPE));
			
 
				+				starpu_data_set_coordinates(*handleptr, 2, j, i);
			
 
				 			}
			
 
				-			else {
			
 
				+			else
			
 
				+			{
			
 
				 				*blockptr = STARPU_POISON_PTR;
			
 
				 				*handleptr = STARPU_POISON_PTR;
			
 
				 			}
			
@@ -267,7 +276,7 @@ static void init_matrix(int rank)
 
				 #ifdef SINGLE_TMP11
			
 
				 	starpu_malloc((void **)&tmp_11_block, blocksize);
			
 
				 	allocated_memory_extra += blocksize;
			
 
				-	starpu_matrix_data_register(&tmp_11_block_handle, 0, (uintptr_t)tmp_11_block,
			
 
				+	starpu_matrix_data_register(&tmp_11_block_handle, STARPU_MAIN_RAM, (uintptr_t)tmp_11_block,
			
 
				 			size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
			
 
				 #else
			
 
				 	tmp_11_block_handles = calloc(nblocks, sizeof(starpu_data_handle_t));
			
@@ -282,7 +291,7 @@ static void init_matrix(int rank)
 
				 			allocated_memory_extra += blocksize;
			
 
				 			STARPU_ASSERT(tmp_11_block[k]);
			
 
				 
			
 
				-			starpu_matrix_data_register(&tmp_11_block_handles[k], 0,
			
 
				+			starpu_matrix_data_register(&tmp_11_block_handles[k], STARPU_MAIN_RAM,
			
 
				 				(uintptr_t)tmp_11_block[k],
			
 
				 				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
			
 
				 		}
			
@@ -298,7 +307,8 @@ static void init_matrix(int rank)
 
				 
			
 
				 	allocated_memory_extra += 2*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *));
			
 
				 #else
			
 
				-	for (i = 0; i < 2; i++) {
			
 
				+	for (i = 0; i < 2; i++)
			
 
				+	{
			
 
				 		tmp_12_block_handles[i] = calloc(nblocks, sizeof(starpu_data_handle_t));
			
 
				 		tmp_21_block_handles[i] = calloc(nblocks, sizeof(starpu_data_handle_t));
			
 
				 		tmp_12_block[i] = calloc(nblocks, sizeof(TYPE *));
			
@@ -317,7 +327,7 @@ static void init_matrix(int rank)
 
				 			allocated_memory_extra += blocksize;
			
 
				 			STARPU_ASSERT(tmp_12_block[k]);
			
 
				 
			
 
				-			starpu_matrix_data_register(&tmp_12_block_handles[k], 0,
			
 
				+			starpu_matrix_data_register(&tmp_12_block_handles[k], STARPU_MAIN_RAM,
			
 
				 				(uintptr_t)tmp_12_block[k],
			
 
				 				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
			
 
				 		}
			
@@ -328,19 +338,20 @@ static void init_matrix(int rank)
 
				 			allocated_memory_extra += blocksize;
			
 
				 			STARPU_ASSERT(tmp_21_block[k]);
			
 
				 
			
 
				-			starpu_matrix_data_register(&tmp_21_block_handles[k], 0,
			
 
				+			starpu_matrix_data_register(&tmp_21_block_handles[k], STARPU_MAIN_RAM,
			
 
				 				(uintptr_t)tmp_21_block[k],
			
 
				 				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
			
 
				 		}
			
 
				 #else
			
 
				-	for (i = 0; i < 2; i++) {
			
 
				+	for (i = 0; i < 2; i++)
			
 
				+	{
			
 
				 		if (tmp_12_block_is_needed(rank, nblocks, k))
			
 
				 		{
			
 
				 			starpu_malloc((void **)&tmp_12_block[i][k], blocksize);
			
 
				 			allocated_memory_extra += blocksize;
			
 
				 			STARPU_ASSERT(tmp_12_block[i][k]);
			
 
				 
			
 
				-			starpu_matrix_data_register(&tmp_12_block_handles[i][k], 0,
			
 
				+			starpu_matrix_data_register(&tmp_12_block_handles[i][k], STARPU_MAIN_RAM,
			
 
				 				(uintptr_t)tmp_12_block[i][k],
			
 
				 				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
			
 
				 		}
			
@@ -351,7 +362,7 @@ static void init_matrix(int rank)
 
				 			allocated_memory_extra += blocksize;
			
 
				 			STARPU_ASSERT(tmp_21_block[i][k]);
			
 
				 
			
 
				-			starpu_matrix_data_register(&tmp_21_block_handles[i][k], 0,
			
 
				+			starpu_matrix_data_register(&tmp_21_block_handles[i][k], STARPU_MAIN_RAM,
			
 
				 				(uintptr_t)tmp_21_block[i][k],
			
 
				 				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
			
 
				 		}
			
@@ -412,7 +423,8 @@ int main(int argc, char **argv)
 
				 	 *	Initialization
			
 
				 	 */
			
 
				 	int thread_support;
			
 
				-	if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS) {
			
 
				+	if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS)
			
 
				+	{
			
 
				 		fprintf(stderr,"MPI_Init_thread failed\n");
			
 
				 		exit(1);
			
 
				 	}
			
@@ -421,8 +433,8 @@ int main(int argc, char **argv)
 
				 	if (thread_support < MPI_THREAD_FUNNELED)
			
 
				 		fprintf(stderr,"Warning: MPI does not have thread support!\n");
			
 
				 
			
 
				-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-	MPI_Comm_size(MPI_COMM_WORLD, &world_size);
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	starpu_mpi_comm_size(MPI_COMM_WORLD, &world_size);
			
 
				 
			
 
				 	starpu_srand48((long int)time(NULL));
			
 
				 
			
@@ -434,7 +446,8 @@ int main(int argc, char **argv)
 
				 	/* We disable sequential consistency in this example */
			
 
				 	starpu_data_set_default_sequential_consistency_flag(0);
			
 
				 
			
 
				-	starpu_mpi_init(NULL, NULL, 0);
			
 
				+	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 
			
 
				 	STARPU_ASSERT(p*q == world_size);
			
 
				 
			
@@ -459,10 +472,10 @@ int main(int argc, char **argv)
 
				 	TYPE *a_r = NULL;
			
 
				 //	STARPU_PLU(display_data_content)(a_r, size);
			
 
				 
			
 
				-	TYPE *x, *y;
			
 
				-
			
 
				 	if (check)
			
 
				 	{
			
 
				+		TYPE *x, *y;
			
 
				+
			
 
				 		x = calloc(size, sizeof(TYPE));
			
 
				 		STARPU_ASSERT(x);
			
 
				 
			
@@ -482,6 +495,9 @@ int main(int argc, char **argv)
 
				 			STARPU_PLU(display_data_content)(a_r, size);
			
 
				 
			
 
				 //		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
			
 
				+
			
 
				+		free(x);
			
 
				+		free(y);
			
 
				 	}
			
 
				 
			
 
				 	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
			
--- a/nmad/examples/mpi_lu/plu_example_double.c
+++ b/nmad/examples/mpi_lu/plu_example_double.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010  Université de Bordeaux
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
--- a/nmad/examples/mpi_lu/plu_example_float.c
+++ b/nmad/examples/mpi_lu/plu_example_float.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010  Université de Bordeaux
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
--- a/nmad/examples/mpi_lu/plu_implicit_example.c
+++ b/nmad/examples/mpi_lu/plu_implicit_example.c
@@ -0,0 +1,369 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010-2011, 2013, 2017  Université de Bordeaux
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <stdlib.h>
			
 
				+#include <stdio.h>
			
 
				+#include <string.h>
			
 
				+#include <time.h>
			
 
				+#include <math.h>
			
 
				+#include <starpu.h>
			
 
				+
			
 
				+#include "pxlu.h"
			
 
				+//#include "pxlu_kernels.h"
			
 
				+
			
 
				+#ifdef STARPU_HAVE_LIBNUMA
			
 
				+#include <numaif.h>
			
 
				+#endif
			
 
				+
			
 
				+static unsigned long size = 4096;
			
 
				+static unsigned nblocks = 16;
			
 
				+static unsigned check = 0;
			
 
				+static int p = 1;
			
 
				+static int q = 1;
			
 
				+static unsigned display = 0;
			
 
				+
			
 
				+#ifdef STARPU_HAVE_LIBNUMA
			
 
				+static unsigned numa = 0;
			
 
				+#endif
			
 
				+
			
 
				+static size_t allocated_memory = 0;
			
 
				+static size_t allocated_memory_extra = 0;
			
 
				+
			
 
				+static starpu_data_handle_t *dataA_handles;
			
 
				+static TYPE **dataA;
			
 
				+
			
 
				+int get_block_rank(unsigned i, unsigned j);
			
 
				+
			
 
				+static void parse_args(int argc, char **argv)
			
 
				+{
			
 
				+	int i;
			
 
				+	for (i = 1; i < argc; i++)
			
 
				+	{
			
 
				+		if (strcmp(argv[i], "-size") == 0)
			
 
				+		{
			
 
				+			char *argptr;
			
 
				+			size = strtol(argv[++i], &argptr, 10);
			
 
				+		}
			
 
				+
			
 
				+		if (strcmp(argv[i], "-nblocks") == 0)
			
 
				+		{
			
 
				+			char *argptr;
			
 
				+			nblocks = strtol(argv[++i], &argptr, 10);
			
 
				+		}
			
 
				+
			
 
				+		if (strcmp(argv[i], "-check") == 0)
			
 
				+		{
			
 
				+			check = 1;
			
 
				+		}
			
 
				+
			
 
				+		if (strcmp(argv[i], "-display") == 0)
			
 
				+		{
			
 
				+			display = 1;
			
 
				+		}
			
 
				+
			
 
				+		if (strcmp(argv[i], "-numa") == 0)
			
 
				+		{
			
 
				+#ifdef STARPU_HAVE_LIBNUMA
			
 
				+			numa = 1;
			
 
				+#else
			
 
				+			fprintf(stderr, "Warning: libnuma is not available\n");
			
 
				+#endif
			
 
				+		}
			
 
				+
			
 
				+		if (strcmp(argv[i], "-p") == 0)
			
 
				+		{
			
 
				+			char *argptr;
			
 
				+			p = strtol(argv[++i], &argptr, 10);
			
 
				+		}
			
 
				+
			
 
				+		if (strcmp(argv[i], "-q") == 0)
			
 
				+		{
			
 
				+			char *argptr;
			
 
				+			q = strtol(argv[++i], &argptr, 10);
			
 
				+		}
			
 
				+
			
 
				+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0)
			
 
				+		{
			
 
				+			fprintf(stderr,"usage: %s [-size n] [-nblocks b] [-check] [-display] [-numa] [-p p] [-q q]\n", argv[0]);
			
 
				+			fprintf(stderr,"\np * q must be equal to the number of MPI nodes\n");
			
 
				+			exit(0);
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+unsigned STARPU_PLU(display_flag)(void)
			
 
				+{
			
 
				+	return display;
			
 
				+}
			
 
				+
			
 
				+static void fill_block_with_random(TYPE *blockptr, unsigned psize, unsigned pnblocks)
			
 
				+{
			
 
				+	const unsigned block_size = (psize/pnblocks);
			
 
				+
			
 
				+	unsigned i, j;
			
 
				+	for (i = 0; i < block_size; i++)
			
 
				+	     for (j = 0; j < block_size; j++)
			
 
				+	     {
			
 
				+		  blockptr[j+i*block_size] = (TYPE)starpu_drand48();
			
 
				+	     }
			
 
				+}
			
 
				+
			
 
				+static void init_matrix(int rank)
			
 
				+{
			
 
				+#ifdef STARPU_HAVE_LIBNUMA
			
 
				+	if (numa)
			
 
				+	{
			
 
				+		fprintf(stderr, "Using INTERLEAVE policy\n");
			
 
				+		unsigned long nodemask = ((1<<0)|(1<<1));
			
 
				+		int ret = set_mempolicy(MPOL_INTERLEAVE, &nodemask, 3);
			
 
				+		if (ret)
			
 
				+			perror("set_mempolicy failed");
			
 
				+	}
			
 
				+#endif
			
 
				+
			
 
				+	/* Allocate a grid of data handles, not all of them have to be allocated later on */
			
 
				+	dataA_handles = calloc(nblocks*nblocks, sizeof(starpu_data_handle_t));
			
 
				+	dataA = calloc(nblocks*nblocks, sizeof(TYPE *));
			
 
				+	allocated_memory_extra += nblocks*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *));
			
 
				+
			
 
				+	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
			
 
				+
			
 
				+	/* Allocate all the blocks that belong to this mpi node */
			
 
				+	unsigned long i,j;
			
 
				+	for (j = 0; j < nblocks; j++)
			
 
				+	{
			
 
				+		for (i = 0; i < nblocks; i++)
			
 
				+		{
			
 
				+			int block_rank = get_block_rank(i, j);
			
 
				+			TYPE **blockptr = &dataA[j+i*nblocks];
			
 
				+//			starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i];
			
 
				+			starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i];
			
 
				+
			
 
				+			if (block_rank == rank)
			
 
				+			{
			
 
				+				/* This blocks should be treated by the current MPI process */
			
 
				+				/* Allocate and fill it */
			
 
				+				starpu_malloc((void **)blockptr, blocksize);
			
 
				+				allocated_memory += blocksize;
			
 
				+
			
 
				+				//fprintf(stderr, "Rank %d : fill block (i = %d, j = %d)\n", rank, i, j);
			
 
				+				fill_block_with_random(*blockptr, size, nblocks);
			
 
				+				//fprintf(stderr, "Rank %d : fill block (i = %d, j = %d)\n", rank, i, j);
			
 
				+				if (i == j)
			
 
				+				{
			
 
				+					unsigned tmp;
			
 
				+					for (tmp = 0; tmp < size/nblocks; tmp++)
			
 
				+					{
			
 
				+						(*blockptr)[tmp*((size/nblocks)+1)] += (TYPE)10*nblocks;
			
 
				+					}
			
 
				+				}
			
 
				+
			
 
				+				/* Register it to StarPU */
			
 
				+				starpu_matrix_data_register(handleptr, STARPU_MAIN_RAM,
			
 
				+					(uintptr_t)*blockptr, size/nblocks,
			
 
				+					size/nblocks, size/nblocks, sizeof(TYPE));
			
 
				+			}
			
 
				+			else
			
 
				+			{
			
 
				+				starpu_matrix_data_register(handleptr, -1,
			
 
				+					0, size/nblocks,
			
 
				+					size/nblocks, size/nblocks, sizeof(TYPE));
			
 
				+				*blockptr = STARPU_POISON_PTR;
			
 
				+			}
			
 
				+			starpu_data_set_coordinates(*handleptr, 2, j, i);
			
 
				+			starpu_mpi_data_register(*handleptr, j+i*nblocks, block_rank);
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	//display_all_blocks(nblocks, size/nblocks);
			
 
				+}
			
 
				+
			
 
				+TYPE *STARPU_PLU(get_block)(unsigned i, unsigned j)
			
 
				+{
			
 
				+	return dataA[j+i*nblocks];
			
 
				+}
			
 
				+
			
 
				+int get_block_rank(unsigned i, unsigned j)
			
 
				+{
			
 
				+	/* Take a 2D block cyclic distribution */
			
 
				+	/* NB: p (resp. q) is for "direction" i (resp. j) */
			
 
				+	return (j % q) * p + (i % p);
			
 
				+}
			
 
				+
			
 
				+starpu_data_handle_t STARPU_PLU(get_block_handle)(unsigned i, unsigned j)
			
 
				+{
			
 
				+	return dataA_handles[j+i*nblocks];
			
 
				+}
			
 
				+
			
 
				+static void display_grid(int rank, unsigned pnblocks)
			
 
				+{
			
 
				+	if (!display)
			
 
				+		return;
			
 
				+
			
 
				+	//if (rank == 0)
			
 
				+	{
			
 
				+		fprintf(stderr, "2D grid layout (Rank %d): \n", rank);
			
 
				+
			
 
				+		unsigned i, j;
			
 
				+		for (j = 0; j < pnblocks; j++)
			
 
				+		{
			
 
				+			for (i = 0; i < pnblocks; i++)
			
 
				+			{
			
 
				+				TYPE *blockptr = STARPU_PLU(get_block)(i, j);
			
 
				+				starpu_data_handle_t handle = STARPU_PLU(get_block_handle)(i, j);
			
 
				+
			
 
				+				fprintf(stderr, "%d (data %p handle %p)", get_block_rank(i, j), blockptr, handle);
			
 
				+			}
			
 
				+			fprintf(stderr, "\n");
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+	int rank;
			
 
				+	int world_size;
			
 
				+
			
 
				+	starpu_srand48((long int)time(NULL));
			
 
				+
			
 
				+	parse_args(argc, argv);
			
 
				+
			
 
				+	int ret = starpu_init(NULL);
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				+
			
 
				+	ret = starpu_mpi_init(&argc, &argv, 1);
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				+
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	starpu_mpi_comm_size(MPI_COMM_WORLD, &world_size);
			
 
				+
			
 
				+	STARPU_ASSERT(p*q == world_size);
			
 
				+
			
 
				+	starpu_cublas_init();
			
 
				+
			
 
				+	/*
			
 
				+	 * 	Problem Init
			
 
				+	 */
			
 
				+
			
 
				+	init_matrix(rank);
			
 
				+
			
 
				+	fprintf(stderr, "Rank %d: allocated (%d + %d) MB = %d MB\n", rank,
			
 
				+                        (int)(allocated_memory/(1024*1024)),
			
 
				+			(int)(allocated_memory_extra/(1024*1024)),
			
 
				+                        (int)((allocated_memory+allocated_memory_extra)/(1024*1024)));
			
 
				+
			
 
				+	display_grid(rank, nblocks);
			
 
				+
			
 
				+	TYPE *a_r = NULL;
			
 
				+//	STARPU_PLU(display_data_content)(a_r, size);
			
 
				+
			
 
				+	if (check)
			
 
				+	{
			
 
				+		TYPE *x, *y;
			
 
				+
			
 
				+		x = calloc(size, sizeof(TYPE));
			
 
				+		STARPU_ASSERT(x);
			
 
				+
			
 
				+		y = calloc(size, sizeof(TYPE));
			
 
				+		STARPU_ASSERT(y);
			
 
				+
			
 
				+		if (rank == 0)
			
 
				+		{
			
 
				+			unsigned ind;
			
 
				+			for (ind = 0; ind < size; ind++)
			
 
				+				x[ind] = (TYPE)starpu_drand48();
			
 
				+		}
			
 
				+
			
 
				+		a_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);
			
 
				+
			
 
				+		if (rank == 0)
			
 
				+			STARPU_PLU(display_data_content)(a_r, size);
			
 
				+
			
 
				+//		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
			
 
				+
			
 
				+		free(x);
			
 
				+		free(y);
			
 
				+	}
			
 
				+
			
 
				+	double timing = STARPU_PLU(plu_main)(nblocks, rank, world_size);
			
 
				+
			
 
				+	/*
			
 
				+	 * 	Report performance
			
 
				+	 */
			
 
				+
			
 
				+	if (rank == 0)
			
 
				+	{
			
 
				+		fprintf(stderr, "Computation took: %f ms\n", timing/1000);
			
 
				+
			
 
				+		unsigned n = size;
			
 
				+		double flop = (2.0f*n*n*n)/3.0f;
			
 
				+		fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
			
 
				+	}
			
 
				+
			
 
				+	/*
			
 
				+	 *	Test Result Correctness
			
 
				+	 */
			
 
				+
			
 
				+	if (check)
			
 
				+	{
			
 
				+		/*
			
 
				+		 *	Compute || A - LU ||
			
 
				+		 */
			
 
				+
			
 
				+		STARPU_PLU(compute_lu_matrix)(size, nblocks, a_r);
			
 
				+
			
 
				+#if 0
			
 
				+		/*
			
 
				+		 *	Compute || Ax - LUx ||
			
 
				+		 */
			
 
				+
			
 
				+		unsigned ind;
			
 
				+
			
 
				+		y2 = calloc(size, sizeof(TYPE));
			
 
				+		STARPU_ASSERT(y);
			
 
				+
			
 
				+		if (rank == 0)
			
 
				+		{
			
 
				+			for (ind = 0; ind < size; ind++)
			
 
				+			{
			
 
				+				y2[ind] = (TYPE)0.0;
			
 
				+			}
			
 
				+		}
			
 
				+
			
 
				+		STARPU_PLU(compute_lux)(size, x, y2, nblocks, rank);
			
 
				+
			
 
				+		/* Compute y2 = y2 - y */
			
 
				+		CPU_AXPY(size, -1.0, y, 1, y2, 1);
			
 
				+
			
 
				+		TYPE err = CPU_ASUM(size, y2, 1);
			
 
				+		int max = CPU_IAMAX(size, y2, 1);
			
 
				+
			
 
				+		fprintf(stderr, "(A - LU)X Avg error : %e\n", err/(size*size));
			
 
				+		fprintf(stderr, "(A - LU)X Max error : %e\n", y2[max]);
			
 
				+#endif
			
 
				+	}
			
 
				+
			
 
				+	/*
			
 
				+	 * 	Termination
			
 
				+	 */
			
 
				+
			
 
				+	starpu_cublas_shutdown();
			
 
				+	starpu_mpi_shutdown();
			
 
				+	starpu_shutdown();
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
--- a/nmad/examples/mpi_lu/plu_implicit_example_double.c
+++ b/nmad/examples/mpi_lu/plu_implicit_example_double.c
@@ -0,0 +1,19 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010, 2013  Université de Bordeaux
			
 
				+ * Copyright (C) 2010  CNRS
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "mpi_lu-double.h"
			
 
				+#include "plu_implicit_example.c"
			
--- a/nmad/examples/mpi_lu/plu_implicit_example_float.c
+++ b/nmad/examples/mpi_lu/plu_implicit_example_float.c
@@ -0,0 +1,19 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010, 2013  Université de Bordeaux
			
 
				+ * Copyright (C) 2010  CNRS
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "mpi_lu-float.h"
			
 
				+#include "plu_implicit_example.c"
			
--- a/nmad/examples/mpi_lu/plu_outofcore_example.c
+++ b/nmad/examples/mpi_lu/plu_outofcore_example.c
@@ -0,0 +1,402 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010-2011, 2013-2014, 2017  Université de Bordeaux
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <stdlib.h>
			
 
				+#include <stdio.h>
			
 
				+#include <unistd.h>
			
 
				+#include <string.h>
			
 
				+#include <time.h>
			
 
				+#include <math.h>
			
 
				+#include <starpu.h>
			
 
				+#include <fcntl.h>
			
 
				+#include <sys/stat.h>
			
 
				+
			
 
				+#include "pxlu.h"
			
 
				+//#include "pxlu_kernels.h"
			
 
				+
			
 
				+#ifdef STARPU_HAVE_LIBNUMA
			
 
				+#include <numaif.h>
			
 
				+#endif
			
 
				+
			
 
				+static unsigned long size = 4096;
			
 
				+static unsigned nblocks = 16;
			
 
				+static unsigned check = 0;
			
 
				+static int p = 1;
			
 
				+static int q = 1;
			
 
				+static unsigned display = 0;
			
 
				+static char *path = "./starpu-ooc-files";
			
 
				+
			
 
				+#ifdef STARPU_HAVE_LIBNUMA
			
 
				+static unsigned numa = 0;
			
 
				+#endif
			
 
				+
			
 
				+static size_t allocated_memory = 0;
			
 
				+
			
 
				+static starpu_data_handle_t *dataA_handles;
			
 
				+
			
 
				+int get_block_rank(unsigned i, unsigned j);
			
 
				+
			
 
				+static void parse_args(int argc, char **argv)
			
 
				+{
			
 
				+	int i;
			
 
				+	for (i = 1; i < argc; i++)
			
 
				+	{
			
 
				+		if (strcmp(argv[i], "-size") == 0)
			
 
				+		{
			
 
				+			char *argptr;
			
 
				+			size = strtol(argv[++i], &argptr, 10);
			
 
				+		}
			
 
				+
			
 
				+		if (strcmp(argv[i], "-nblocks") == 0)
			
 
				+		{
			
 
				+			char *argptr;
			
 
				+			nblocks = strtol(argv[++i], &argptr, 10);
			
 
				+		}
			
 
				+
			
 
				+		if (strcmp(argv[i], "-check") == 0)
			
 
				+		{
			
 
				+			check = 1;
			
 
				+		}
			
 
				+
			
 
				+		if (strcmp(argv[i], "-display") == 0)
			
 
				+		{
			
 
				+			display = 1;
			
 
				+		}
			
 
				+
			
 
				+		if (strcmp(argv[i], "-numa") == 0)
			
 
				+		{
			
 
				+#ifdef STARPU_HAVE_LIBNUMA
			
 
				+			numa = 1;
			
 
				+#else
			
 
				+			fprintf(stderr, "Warning: libnuma is not available\n");
			
 
				+#endif
			
 
				+		}
			
 
				+
			
 
				+		if (strcmp(argv[i], "-p") == 0)
			
 
				+		{
			
 
				+			char *argptr;
			
 
				+			p = strtol(argv[++i], &argptr, 10);
			
 
				+		}
			
 
				+
			
 
				+		if (strcmp(argv[i], "-q") == 0)
			
 
				+		{
			
 
				+			char *argptr;
			
 
				+			q = strtol(argv[++i], &argptr, 10);
			
 
				+		}
			
 
				+
			
 
				+		if (strcmp(argv[i], "-path") == 0)
			
 
				+		{
			
 
				+			path = argv[++i];
			
 
				+		}
			
 
				+
			
 
				+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0)
			
 
				+		{
			
 
				+			fprintf(stderr,"usage: %s [-size n] [-nblocks b] [-check] [-display] [-numa] [-p p] [-q q] [-path PATH]\n", argv[0]);
			
 
				+			fprintf(stderr,"\np * q must be equal to the number of MPI nodes\n");
			
 
				+			exit(0);
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+unsigned STARPU_PLU(display_flag)(void)
			
 
				+{
			
 
				+	return display;
			
 
				+}
			
 
				+
			
 
				+static void fill_block_with_random(TYPE *blockptr, unsigned psize, unsigned pnblocks)
			
 
				+{
			
 
				+	const unsigned block_size = (psize/pnblocks);
			
 
				+
			
 
				+	unsigned i, j;
			
 
				+	for (i = 0; i < block_size; i++)
			
 
				+	     for (j = 0; j < block_size; j++)
			
 
				+	     {
			
 
				+		  blockptr[j+i*block_size] = (TYPE)starpu_drand48();
			
 
				+	     }
			
 
				+}
			
 
				+
			
 
				+static void create_matrix()
			
 
				+{
			
 
				+	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
			
 
				+	TYPE *blockptr = malloc(blocksize);
			
 
				+	int fd;
			
 
				+	char *filename;
			
 
				+	unsigned filename_length = strlen(path) + 1 + sizeof(nblocks)*3 + 1 + sizeof(nblocks)*3 + 1;
			
 
				+
			
 
				+	filename = malloc(filename_length);
			
 
				+
			
 
				+	allocated_memory += nblocks*nblocks*blocksize;
			
 
				+
			
 
				+	/* Create the whole matrix on the disk */
			
 
				+	unsigned i,j;
			
 
				+	for (j = 0; j < nblocks; j++)
			
 
				+	{
			
 
				+		for (i = 0; i < nblocks; i++)
			
 
				+		{
			
 
				+			fill_block_with_random(blockptr, size, nblocks);
			
 
				+			if (i == j)
			
 
				+			{
			
 
				+				unsigned tmp;
			
 
				+				for (tmp = 0; tmp < size/nblocks; tmp++)
			
 
				+				{
			
 
				+					blockptr[tmp*((size/nblocks)+1)] += (TYPE)10*nblocks;
			
 
				+				}
			
 
				+			}
			
 
				+			snprintf(filename, filename_length, "%s/%u,%u", path, i, j);
			
 
				+			fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, 0777);
			
 
				+			if (fd < 0)
			
 
				+			{
			
 
				+				perror("open");
			
 
				+				exit(1);
			
 
				+			}
			
 
				+			if (write(fd, blockptr, blocksize) != (starpu_ssize_t) blocksize)
			
 
				+			{
			
 
				+				fprintf(stderr,"short write");
			
 
				+				exit(1);
			
 
				+			}
			
 
				+			if (close(fd) < 0)
			
 
				+			{
			
 
				+				perror("close");
			
 
				+				exit(1);
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	free(blockptr);
			
 
				+	free(filename);
			
 
				+}
			
 
				+
			
 
				+static void init_matrix(int rank)
			
 
				+{
			
 
				+	/* Allocate a grid of data handles, not all of them have to be allocated later on */
			
 
				+	dataA_handles = calloc(nblocks*nblocks, sizeof(starpu_data_handle_t));
			
 
				+
			
 
				+	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
			
 
				+
			
 
				+	int disk_node = starpu_disk_register(&starpu_disk_unistd_ops, path, STARPU_MAX(1024*1024, size*size*sizeof(TYPE)));
			
 
				+	assert(disk_node >= 0);
			
 
				+
			
 
				+	char filename[sizeof(nblocks)*3 + 1 + sizeof(nblocks)*3 + 1];
			
 
				+
			
 
				+	/* Allocate all the blocks that belong to this mpi node */
			
 
				+	unsigned i,j;
			
 
				+	for (j = 0; j < nblocks; j++)
			
 
				+	{
			
 
				+		for (i = 0; i < nblocks; i++)
			
 
				+		{
			
 
				+			int block_rank = get_block_rank(i, j);
			
 
				+//			starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i];
			
 
				+			starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i];
			
 
				+
			
 
				+			if (block_rank == rank)
			
 
				+			{
			
 
				+				void *disk_obj;
			
 
				+				snprintf(filename, sizeof(filename), "%u,%u", i, j);
			
 
				+				/* Register it to StarPU */
			
 
				+				disk_obj = starpu_disk_open(disk_node, filename, blocksize);
			
 
				+				if (!disk_obj)
			
 
				+				{
			
 
				+					fprintf(stderr,"could not open %s\n", filename);
			
 
				+					exit(1);
			
 
				+				}
			
 
				+				starpu_matrix_data_register(handleptr, disk_node,
			
 
				+					(uintptr_t) disk_obj, size/nblocks,
			
 
				+					size/nblocks, size/nblocks, sizeof(TYPE));
			
 
				+			}
			
 
				+			else
			
 
				+			{
			
 
				+				starpu_matrix_data_register(handleptr, -1,
			
 
				+					0, size/nblocks,
			
 
				+					size/nblocks, size/nblocks, sizeof(TYPE));
			
 
				+			}
			
 
				+			starpu_data_set_coordinates(*handleptr, 2, j, i);
			
 
				+			starpu_mpi_data_register(*handleptr, j+i*nblocks, block_rank);
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	//display_all_blocks(nblocks, size/nblocks);
			
 
				+}
			
 
				+
			
 
				+TYPE *STARPU_PLU(get_block)(unsigned i, unsigned j)
			
 
				+{
			
 
				+	/* This does not really make sense in out of core */
			
 
				+	assert(0);
			
 
				+}
			
 
				+
			
 
				+int get_block_rank(unsigned i, unsigned j)
			
 
				+{
			
 
				+	/* Take a 2D block cyclic distribution */
			
 
				+	/* NB: p (resp. q) is for "direction" i (resp. j) */
			
 
				+	return (j % q) * p + (i % p);
			
 
				+}
			
 
				+
			
 
				+starpu_data_handle_t STARPU_PLU(get_block_handle)(unsigned i, unsigned j)
			
 
				+{
			
 
				+	return dataA_handles[j+i*nblocks];
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+	int rank;
			
 
				+	int world_size;
			
 
				+	int ret;
			
 
				+	unsigned i, j;
			
 
				+
			
 
				+	starpu_srand48((long int)time(NULL));
			
 
				+
			
 
				+	parse_args(argc, argv);
			
 
				+
			
 
				+	ret = mkdir(path, 0777);
			
 
				+	if (ret != 0 && errno != EEXIST)
			
 
				+	{
			
 
				+		fprintf(stderr,"%s does not exist\n", path);
			
 
				+		exit(1);
			
 
				+	}
			
 
				+
			
 
				+	ret = starpu_init(NULL);
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				+
			
 
				+	ret = starpu_mpi_init(&argc, &argv, 1);
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				+
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	starpu_mpi_comm_size(MPI_COMM_WORLD, &world_size);
			
 
				+
			
 
				+	STARPU_ASSERT(p*q == world_size);
			
 
				+
			
 
				+	starpu_cublas_init();
			
 
				+
			
 
				+	/*
			
 
				+	 * 	Problem Init
			
 
				+	 */
			
 
				+
			
 
				+	if (rank == 0)
			
 
				+		create_matrix();
			
 
				+
			
 
				+	starpu_mpi_barrier(MPI_COMM_WORLD);
			
 
				+
			
 
				+	init_matrix(rank);
			
 
				+
			
 
				+	if (rank == 0)
			
 
				+		fprintf(stderr, "%dMB on disk\n", (int)(allocated_memory/(1024*1024)));
			
 
				+
			
 
				+	TYPE *a_r = NULL;
			
 
				+//	STARPU_PLU(display_data_content)(a_r, size);
			
 
				+
			
 
				+	if (check)
			
 
				+	{
			
 
				+		TYPE *x, *y;
			
 
				+
			
 
				+		x = calloc(size, sizeof(TYPE));
			
 
				+		STARPU_ASSERT(x);
			
 
				+
			
 
				+		y = calloc(size, sizeof(TYPE));
			
 
				+		STARPU_ASSERT(y);
			
 
				+
			
 
				+		if (rank == 0)
			
 
				+		{
			
 
				+			unsigned ind;
			
 
				+			for (ind = 0; ind < size; ind++)
			
 
				+				x[ind] = (TYPE)starpu_drand48();
			
 
				+		}
			
 
				+
			
 
				+		a_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);
			
 
				+
			
 
				+		if (rank == 0)
			
 
				+			STARPU_PLU(display_data_content)(a_r, size);
			
 
				+
			
 
				+//		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
			
 
				+
			
 
				+		free(x);
			
 
				+		free(y);
			
 
				+	}
			
 
				+
			
 
				+	double timing = STARPU_PLU(plu_main)(nblocks, rank, world_size);
			
 
				+
			
 
				+	/*
			
 
				+	 * 	Report performance
			
 
				+	 */
			
 
				+
			
 
				+	if (rank == 0)
			
 
				+	{
			
 
				+		fprintf(stderr, "Computation took: %f ms\n", timing/1000);
			
 
				+
			
 
				+		unsigned n = size;
			
 
				+		double flop = (2.0f*n*n*n)/3.0f;
			
 
				+		fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
			
 
				+	}
			
 
				+
			
 
				+	/*
			
 
				+	 *	Test Result Correctness
			
 
				+	 */
			
 
				+
			
 
				+	if (check)
			
 
				+	{
			
 
				+		/*
			
 
				+		 *	Compute || A - LU ||
			
 
				+		 */
			
 
				+
			
 
				+		STARPU_PLU(compute_lu_matrix)(size, nblocks, a_r);
			
 
				+
			
 
				+#if 0
			
 
				+		/*
			
 
				+		 *	Compute || Ax - LUx ||
			
 
				+		 */
			
 
				+
			
 
				+		unsigned ind;
			
 
				+
			
 
				+		y2 = calloc(size, sizeof(TYPE));
			
 
				+		STARPU_ASSERT(y);
			
 
				+
			
 
				+		if (rank == 0)
			
 
				+		{
			
 
				+			for (ind = 0; ind < size; ind++)
			
 
				+			{
			
 
				+				y2[ind] = (TYPE)0.0;
			
 
				+			}
			
 
				+		}
			
 
				+
			
 
				+		STARPU_PLU(compute_lux)(size, x, y2, nblocks, rank);
			
 
				+
			
 
				+		/* Compute y2 = y2 - y */
			
 
				+		CPU_AXPY(size, -1.0, y, 1, y2, 1);
			
 
				+
			
 
				+		TYPE err = CPU_ASUM(size, y2, 1);
			
 
				+		int max = CPU_IAMAX(size, y2, 1);
			
 
				+
			
 
				+		fprintf(stderr, "(A - LU)X Avg error : %e\n", err/(size*size));
			
 
				+		fprintf(stderr, "(A - LU)X Max error : %e\n", y2[max]);
			
 
				+#endif
			
 
				+	}
			
 
				+
			
 
				+	/*
			
 
				+	 * 	Termination
			
 
				+	 */
			
 
				+	for (j = 0; j < nblocks; j++)
			
 
				+	{
			
 
				+		for (i = 0; i < nblocks; i++)
			
 
				+		{
			
 
				+			starpu_data_unregister(dataA_handles[j+nblocks*i]);
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	starpu_cublas_shutdown();
			
 
				+	starpu_mpi_shutdown();
			
 
				+	starpu_shutdown();
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
--- a/nmad/examples/mpi_lu/plu_outofcore_example_double.c
+++ b/nmad/examples/mpi_lu/plu_outofcore_example_double.c
@@ -0,0 +1,19 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010, 2013  Université de Bordeaux
			
 
				+ * Copyright (C) 2010  CNRS
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "mpi_lu-double.h"
			
 
				+#include "plu_outofcore_example.c"
			
--- a/nmad/examples/mpi_lu/plu_outofcore_example_float.c
+++ b/nmad/examples/mpi_lu/plu_outofcore_example_float.c
@@ -0,0 +1,19 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010, 2013  Université de Bordeaux
			
 
				+ * Copyright (C) 2010  CNRS
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "mpi_lu-float.h"
			
 
				+#include "plu_outofcore_example.c"
			
--- a/nmad/examples/mpi_lu/plu_solve.c
+++ b/nmad/examples/mpi_lu/plu_solve.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Université de Bordeaux
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2014  Université de Bordeaux
			
 
				+ * Copyright (C) 2010, 2016, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -233,13 +233,13 @@ TYPE *STARPU_PLU(reconstruct_matrix)(unsigned size, unsigned nblocks)
 
				 	unsigned block_size = size/nblocks;
			
 
				 
			
 
				 	int rank;
			
 
				-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				 
			
 
				 	unsigned bi, bj;
			
 
				 	for (bj = 0; bj < nblocks; bj++)
			
 
				 	for (bi = 0; bi < nblocks; bi++)
			
 
				 	{
			
 
				-		TYPE *block;
			
 
				+		TYPE *block = NULL;
			
 
				 
			
 
				 		int block_rank = get_block_rank(bi, bj);
			
 
				 
			
@@ -247,7 +247,8 @@ TYPE *STARPU_PLU(reconstruct_matrix)(unsigned size, unsigned nblocks)
 
				 		{
			
 
				 			block = STARPU_PLU(get_block)(bi, bj);
			
 
				 		}
			
 
				-		else {
			
 
				+		else
			
 
				+		{
			
 
				 			MPI_Status status;
			
 
				 
			
 
				 			if (rank == 0)
			
@@ -257,7 +258,8 @@ TYPE *STARPU_PLU(reconstruct_matrix)(unsigned size, unsigned nblocks)
 
				 				int ret = MPI_Recv(block, block_size*block_size, MPI_TYPE, block_rank, 0, MPI_COMM_WORLD, &status);
			
 
				 				STARPU_ASSERT(ret == MPI_SUCCESS);
			
 
				 			}
			
 
				-			else if (rank == block_rank) {
			
 
				+			else if (rank == block_rank)
			
 
				+			{
			
 
				 				block = STARPU_PLU(get_block)(bi, bj);
			
 
				 				int ret = MPI_Send(block, block_size*block_size, MPI_TYPE, 0, 0, MPI_COMM_WORLD);
			
 
				 				STARPU_ASSERT(ret == MPI_SUCCESS);
			
@@ -331,7 +333,7 @@ void STARPU_PLU(compute_lu_matrix)(unsigned size, unsigned nblocks, TYPE *Asaved
 
				 	unsigned display = STARPU_PLU(display_flag)();
			
 
				 
			
 
				 	int rank;
			
 
				-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				 
			
 
				 	if (rank == 0)
			
 
				 	{
			
@@ -390,4 +392,6 @@ void STARPU_PLU(compute_lu_matrix)(unsigned size, unsigned nblocks, TYPE *Asaved
 
				 
			
 
				 		fprintf(stderr, "||A-LU|| / (||A||*N) : %e\n", residual/(matnorm*size));
			
 
				 	}
			
 
				+
			
 
				+	free(all_r);
			
 
				 }
			
--- a/nmad/examples/mpi_lu/plu_solve_double.c
+++ b/nmad/examples/mpi_lu/plu_solve_double.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010  Université de Bordeaux
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
--- a/nmad/examples/mpi_lu/plu_solve_float.c
+++ b/nmad/examples/mpi_lu/plu_solve_float.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010  Université de Bordeaux
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
--- a/nmad/examples/mpi_lu/pslu.c
+++ b/nmad/examples/mpi_lu/pslu.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010  Université de Bordeaux
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
--- a/nmad/examples/mpi_lu/pslu_implicit.c
+++ b/nmad/examples/mpi_lu/pslu_implicit.c
@@ -0,0 +1,19 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010, 2013  Université de Bordeaux
			
 
				+ * Copyright (C) 2010  CNRS
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "mpi_lu-float.h"
			
 
				+#include "pxlu_implicit.c"
			
--- a/nmad/examples/mpi_lu/pslu_kernels.c
+++ b/nmad/examples/mpi_lu/pslu_kernels.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010  Université de Bordeaux
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
--- a/nmad/examples/mpi_lu/pxlu.c
+++ b/nmad/examples/mpi_lu/pxlu.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010, 2011, 2014  Université de Bordeaux
			
 
				- * Copyright (C) 2010, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2014, 2017  Université de Bordeaux
			
 
				+ * Copyright (C) 2010, 2012, 2013, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -58,7 +58,8 @@ static unsigned nblocks = 0;
 
				 static int rank = -1;
			
 
				 static int world_size = -1;
			
 
				 
			
 
				-struct callback_arg {
			
 
				+struct callback_arg
			
 
				+{
			
 
				 	unsigned i, j, k;
			
 
				 };
			
 
				 
			
@@ -104,7 +105,8 @@ static void send_data_to_mask(starpu_data_handle_t handle, int *rank_mask, int m
 
				 	int r;
			
 
				 	for (r = 0; r < world_size; r++)
			
 
				 	{
			
 
				-		if (rank_mask[r]) {
			
 
				+		if (rank_mask[r])
			
 
				+		{
			
 
				 			rank_array[cnt] = r;
			
 
				 
			
 
				 			comm_array[cnt] = MPI_COMM_WORLD;
			
@@ -120,7 +122,8 @@ static void send_data_to_mask(starpu_data_handle_t handle, int *rank_mask, int m
 
				 		 * once */
			
 
				 		starpu_tag_notify_from_apps(tag);
			
 
				 	}
			
 
				-	else {
			
 
				+	else
			
 
				+	{
			
 
				 		starpu_mpi_isend_array_detached_unlock_tag(cnt, handle_array,
			
 
				 				rank_array, mpi_tag_array, comm_array, tag);
			
 
				 	}
			
@@ -129,7 +132,8 @@ static void send_data_to_mask(starpu_data_handle_t handle, int *rank_mask, int m
 
				 /* Initiate a receive request once all dependencies are fulfilled and unlock
			
 
				  * tag 'unlocked_tag' once it's done. */
			
 
				 
			
 
				-struct recv_when_done_callback_arg {
			
 
				+struct recv_when_done_callback_arg
			
 
				+{
			
 
				 	int source;
			
 
				 	int mpi_tag;
			
 
				 	starpu_data_handle_t handle;
			
@@ -156,7 +160,7 @@ static void receive_when_deps_are_done(unsigned ndeps, starpu_tag_t *deps_tags,
 
				 
			
 
				 	struct recv_when_done_callback_arg *arg =
			
 
				 		malloc(sizeof(struct recv_when_done_callback_arg));
			
 
				-	
			
 
				+
			
 
				 	arg->source = source;
			
 
				 	arg->mpi_tag = mpi_tag;
			
 
				 	arg->handle = handle;
			
@@ -186,24 +190,29 @@ static void create_task_11_recv(unsigned k)
 
				 	 * 21(k-1)i with i,j >= k */
			
 
				 	unsigned ndeps = 0;
			
 
				 	starpu_tag_t tag_array[2*nblocks];
			
 
				-	
			
 
				+
			
 
				 #ifdef SINGLE_TMP11
			
 
				-	unsigned i, j;
			
 
				 	if (k > 0)
			
 
				-	for (i = (k-1)+1; i < nblocks; i++)
			
 
				 	{
			
 
				-		if (rank == get_block_rank(i, k-1))
			
 
				-			tag_array[ndeps++] = TAG21(k-1, i);
			
 
				+		unsigned i;
			
 
				+		for (i = (k-1)+1; i < nblocks; i++)
			
 
				+		{
			
 
				+			if (rank == get_block_rank(i, k-1))
			
 
				+				tag_array[ndeps++] = TAG21(k-1, i);
			
 
				+		}
			
 
				 	}
			
 
				 
			
 
				 	if (k > 0)
			
 
				-	for (j = (k-1)+1; j < nblocks; j++)
			
 
				 	{
			
 
				-		if (rank == get_block_rank(k-1, j))
			
 
				-			tag_array[ndeps++] = TAG12(k-1, j);
			
 
				+		unsigned j;
			
 
				+		for (j = (k-1)+1; j < nblocks; j++)
			
 
				+		{
			
 
				+			if (rank == get_block_rank(k-1, j))
			
 
				+				tag_array[ndeps++] = TAG12(k-1, j);
			
 
				+		}
			
 
				 	}
			
 
				 #endif
			
 
				-	
			
 
				+
			
 
				 	int source = get_block_rank(k, k);
			
 
				 #ifdef SINGLE_TMP11
			
 
				 	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_11_block_handle)();
			
@@ -254,7 +263,7 @@ static void callback_task_11_real(void *_arg)
 
				 	starpu_tag_t tag = TAG11_SAVE(k);
			
 
				 	int mpi_tag = MPI_TAG11(k);
			
 
				 	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
			
 
				-	
			
 
				+
			
 
				 	free(arg);
			
 
				 }
			
 
				 
			
@@ -280,10 +289,12 @@ static void create_task_11_real(unsigned k)
 
				 		task->priority = STARPU_MAX_PRIO;
			
 
				 
			
 
				 	/* enforce dependencies ... */
			
 
				-	if (k > 0) {
			
 
				+	if (k > 0)
			
 
				+	{
			
 
				 		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
			
 
				 	}
			
 
				-	else {
			
 
				+	else
			
 
				+	{
			
 
				 		starpu_tag_declare_deps(TAG11(k), 1, STARPU_TAG_INIT);
			
 
				 	}
			
 
				 
			
@@ -296,25 +307,27 @@ static void create_task_11(unsigned k)
 
				 	if (get_block_rank(k, k) == rank)
			
 
				 	{
			
 
				 #ifdef VERBOSE_INIT
			
 
				-		fprintf(stderr, "CREATE real task 11(%d) (TAG11_SAVE(%d) = %lx) on node %d\n", k, k, TAG11_SAVE(k), rank);
			
 
				+		fprintf(stderr, "CREATE real task 11(%u) (TAG11_SAVE(%u) = %llx) on node %d\n", k, k, (unsigned long long) TAG11_SAVE(k), rank); /* %llx: tag printed in hex, matching the unsigned long long cast */
			
 
				 #endif
			
 
				 		create_task_11_real(k);
			
 
				 	}
			
 
				-	else {
			
 
				+	else
			
 
				+	{
			
 
				 		/* We don't handle the task, but perhaps we have to generate MPI transfers. */
			
 
				 		int rank_mask[world_size];
			
 
				 		find_nodes_using_11(k, rank_mask);
			
 
				-		
			
 
				+
			
 
				 		if (rank_mask[rank])
			
 
				 		{
			
 
				 #ifdef VERBOSE_INIT
			
 
				-			fprintf(stderr, "create RECV task 11(%d) on node %d\n", k, rank);
			
 
				+			fprintf(stderr, "create RECV task 11(%u) on node %d\n", k, rank);
			
 
				 #endif
			
 
				 			create_task_11_recv(k);
			
 
				 		}
			
 
				-		else {
			
 
				+		else
			
 
				+		{
			
 
				 #ifdef VERBOSE_INIT
			
 
				-			fprintf(stderr, "Node %d needs not 11(%d)\n", rank, k);
			
 
				+			fprintf(stderr, "Node %d needs not 11(%u)\n", rank, k);
			
 
				 #endif
			
 
				 		}
			
 
				 	}
			
@@ -328,8 +341,6 @@ static void create_task_11(unsigned k)
 
				 
			
 
				 static void create_task_12_recv(unsigned k, unsigned j)
			
 
				 {
			
 
				-	unsigned i;
			
 
				-
			
 
				 	/* The current node is not computing that task, so we receive the block
			
 
				 	 * with MPI */
			
 
				 
			
@@ -338,23 +349,32 @@ static void create_task_12_recv(unsigned k, unsigned j)
 
				 	 * i >= k */
			
 
				 	unsigned ndeps = 0;
			
 
				 	starpu_tag_t tag_array[nblocks];
			
 
				-	
			
 
				+
			
 
				+	unsigned start;
			
 
				+	unsigned bound;
			
 
				+
			
 
				 #ifdef SINGLE_TMP1221
			
 
				-	if (k > 0)
			
 
				-	for (i = (k-1)+1; i < nblocks; i++)
			
 
				+	bound = 0;
			
 
				+	start = (k-1)+1;
			
 
				 #else
			
 
				-	if (k > 1)
			
 
				-	for (i = (k-2)+1; i < nblocks; i++)
			
 
				+	bound = 1;
			
 
				+	start = (k-2)+1;
			
 
				 #endif
			
 
				+
			
 
				+	if (k > bound)
			
 
				 	{
			
 
				-		if (rank == get_block_rank(i, j))
			
 
				+		unsigned i;
			
 
				+		for (i = start; i < nblocks; i++)
			
 
				+		{
			
 
				+			if (rank == get_block_rank(i, j))
			
 
				 #ifdef SINGLE_TMP1221
			
 
				-			tag_array[ndeps++] = TAG22(k-1, i, j);
			
 
				+				tag_array[ndeps++] = TAG22(k-1, i, j);
			
 
				 #else
			
 
				-			tag_array[ndeps++] = TAG22(k-2, i, j);
			
 
				+				tag_array[ndeps++] = TAG22(k-2, i, j);
			
 
				 #endif
			
 
				+		}
			
 
				 	}
			
 
				-	
			
 
				+
			
 
				 	int source = get_block_rank(k, j);
			
 
				 #ifdef SINGLE_TMP1221
			
 
				 	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_12_block_handle)(j);
			
@@ -398,15 +418,17 @@ static void callback_task_12_real(void *_arg)
 
				 	starpu_tag_t tag = TAG12_SAVE(k, j);
			
 
				 	int mpi_tag = MPI_TAG12(k, j);
			
 
				 	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
			
 
				-	
			
 
				+
			
 
				 	free(arg);
			
 
				 }
			
 
				 
			
 
				 static void create_task_12_real(unsigned k, unsigned j)
			
 
				 {
			
 
				 	struct starpu_task *task = create_task(TAG12(k, j));
			
 
				-	
			
 
				+
			
 
				+#ifdef STARPU_DEVEL
			
 
				 #warning temporary fix :/
			
 
				+#endif
			
 
				 //	task->cl = &STARPU_PLU(cl12);
			
 
				 	task->cl = &STARPU_PLU(cl21);
			
 
				 
			
@@ -414,7 +436,7 @@ static void create_task_12_real(unsigned k, unsigned j)
 
				 
			
 
				 	unsigned diag_block_is_local = (get_block_rank(k, k) == rank);
			
 
				 
			
 
				-	starpu_tag_t tag_11_dep; 
			
 
				+	starpu_tag_t tag_11_dep;
			
 
				 
			
 
				 	/* which sub-data is manipulated ? */
			
 
				 	starpu_data_handle_t diag_block;
			
@@ -423,7 +445,7 @@ static void create_task_12_real(unsigned k, unsigned j)
 
				 		diag_block = STARPU_PLU(get_block_handle)(k, k);
			
 
				 		tag_11_dep = TAG11(k);
			
 
				 	}
			
 
				-	else 
			
 
				+	else
			
 
				 	{
			
 
				 #ifdef SINGLE_TMP11
			
 
				 		diag_block = STARPU_PLU(get_tmp_11_block_handle)();
			
@@ -433,8 +455,8 @@ static void create_task_12_real(unsigned k, unsigned j)
 
				 		tag_11_dep = TAG11_SAVE(k);
			
 
				 	}
			
 
				 
			
 
				-	task->handles[0] = diag_block; 
			
 
				-	task->handles[1] = STARPU_PLU(get_block_handle)(k, j); 
			
 
				+	task->handles[0] = diag_block;
			
 
				+	task->handles[1] = STARPU_PLU(get_block_handle)(k, j);
			
 
				 
			
 
				 	STARPU_ASSERT(get_block_rank(k, j) == rank);
			
 
				 
			
@@ -448,15 +470,18 @@ static void create_task_12_real(unsigned k, unsigned j)
 
				 	task->callback_func = callback_task_12_real;
			
 
				 	task->callback_arg = arg;
			
 
				 
			
 
				-	if (!no_prio && (j == k+1)) {
			
 
				+	if (!no_prio && (j == k+1))
			
 
				+	{
			
 
				 		task->priority = STARPU_MAX_PRIO;
			
 
				 	}
			
 
				 
			
 
				 	/* enforce dependencies ... */
			
 
				-	if (k > 0) {
			
 
				+	if (k > 0)
			
 
				+	{
			
 
				 		starpu_tag_declare_deps(TAG12(k, j), 2, tag_11_dep, TAG22(k-1, k, j));
			
 
				 	}
			
 
				-	else {
			
 
				+	else
			
 
				+	{
			
 
				 		starpu_tag_declare_deps(TAG12(k, j), 1, tag_11_dep);
			
 
				 	}
			
 
				 
			
@@ -469,25 +494,27 @@ static void create_task_12(unsigned k, unsigned j)
 
				 	if (get_block_rank(k, j) == rank)
			
 
				 	{
			
 
				 #ifdef VERBOSE_INIT
			
 
				-		fprintf(stderr, "CREATE real task 12(k = %d, j = %d) on node %d\n", k, j, rank);
			
 
				+		fprintf(stderr, "CREATE real task 12(k = %u, j = %u) on node %d\n", k, j, rank);
			
 
				 #endif
			
 
				 		create_task_12_real(k, j);
			
 
				 	}
			
 
				-	else {
			
 
				+	else
			
 
				+	{
			
 
				 		/* We don't handle the task, but perhaps we have to generate MPI transfers. */
			
 
				 		int rank_mask[world_size];
			
 
				 		find_nodes_using_12(k, j, rank_mask);
			
 
				-		
			
 
				+
			
 
				 		if (rank_mask[rank])
			
 
				 		{
			
 
				 #ifdef VERBOSE_INIT
			
 
				-			fprintf(stderr, "create RECV task 12(k = %d, j = %d) on node %d\n", k, j, rank);
			
 
				+			fprintf(stderr, "create RECV task 12(k = %u, j = %u) on node %d\n", k, j, rank);
			
 
				 #endif
			
 
				 			create_task_12_recv(k, j);
			
 
				 		}
			
 
				-		else {
			
 
				+		else
			
 
				+		{
			
 
				 #ifdef VERBOSE_INIT
			
 
				-			fprintf(stderr, "Node %d needs not 12(k=%d, i=%d)\n", rank, k, j);
			
 
				+			fprintf(stderr, "Node %d needs not 12(k=%u, i=%u)\n", rank, k, j);
			
 
				 #endif
			
 
				 		}
			
 
				 	}
			
@@ -499,8 +526,6 @@ static void create_task_12(unsigned k, unsigned j)
 
				 
			
 
				 static void create_task_21_recv(unsigned k, unsigned i)
			
 
				 {
			
 
				-	unsigned j;
			
 
				-
			
 
				 	/* The current node is not computing that task, so we receive the block
			
 
				 	 * with MPI */
			
 
				 
			
@@ -509,20 +534,28 @@ static void create_task_21_recv(unsigned k, unsigned i)
 
				 	 * j >= k */
			
 
				 	unsigned ndeps = 0;
			
 
				 	starpu_tag_t tag_array[nblocks];
			
 
				-	
			
 
				+
			
 
				+	unsigned bound;
			
 
				+	unsigned start;
			
 
				+
			
 
				 #ifdef SINGLE_TMP1221
			
 
				-	if (k > 0)
			
 
				-	for (j = (k-1)+1; j < nblocks; j++)
			
 
				+	bound = 0;
			
 
				+	start = (k-1)+1;
			
 
				 #else
			
 
				-	if (k > 1)
			
 
				-	for (j = (k-2)+1; j < nblocks; j++)
			
 
				+	bound = 1;
			
 
				+	start = (k-2)+1;
			
 
				 #endif
			
 
				+	if (k > bound)
			
 
				 	{
			
 
				-		if (rank == get_block_rank(i, j))
			
 
				+		unsigned j;
			
 
				+		for (j = start; j < nblocks; j++)
			
 
				+		{
			
 
				+			if (rank == get_block_rank(i, j))
			
 
				 #ifdef SINGLE_TMP1221
			
 
				-			tag_array[ndeps++] = TAG22(k-1, i, j);
			
 
				+				tag_array[ndeps++] = TAG22(k-1, i, j);
			
 
				 #else
			
 
				-			tag_array[ndeps++] = TAG22(k-2, i, j);
			
 
				+				tag_array[ndeps++] = TAG22(k-2, i, j);
			
 
				+		}
			
 
				 #endif
			
 
				 	}
			
 
				 
			
@@ -570,7 +603,7 @@ static void callback_task_21_real(void *_arg)
 
				 	starpu_tag_t tag = TAG21_SAVE(k, i);
			
 
				 	int mpi_tag = MPI_TAG21(k, i);
			
 
				 	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
			
 
				-	
			
 
				+
			
 
				 	free(arg);
			
 
				 }
			
 
				 
			
@@ -578,7 +611,9 @@ static void create_task_21_real(unsigned k, unsigned i)
 
				 {
			
 
				 	struct starpu_task *task = create_task(TAG21(k, i));
			
 
				 
			
 
				-#warning temporary fix 
			
 
				+#ifdef STARPU_DEVEL
			
 
				+#warning temporary fix
			
 
				+#endif
			
 
				 //	task->cl = &STARPU_PLU(cl21);
			
 
				 	task->cl = &STARPU_PLU(cl12);
			
 
				 
			
@@ -586,8 +621,8 @@ static void create_task_21_real(unsigned k, unsigned i)
 
				 
			
 
				 	unsigned diag_block_is_local = (get_block_rank(k, k) == rank);
			
 
				 
			
 
				-	starpu_tag_t tag_11_dep; 
			
 
				-	
			
 
				+	starpu_tag_t tag_11_dep;
			
 
				+
			
 
				 	/* which sub-data is manipulated ? */
			
 
				 	starpu_data_handle_t diag_block;
			
 
				 	if (diag_block_is_local)
			
@@ -595,7 +630,7 @@ static void create_task_21_real(unsigned k, unsigned i)
 
				 		diag_block = STARPU_PLU(get_block_handle)(k, k);
			
 
				 		tag_11_dep = TAG11(k);
			
 
				 	}
			
 
				-	else 
			
 
				+	else
			
 
				 	{
			
 
				 #ifdef SINGLE_TMP11
			
 
				 		diag_block = STARPU_PLU(get_tmp_11_block_handle)();
			
@@ -605,7 +640,7 @@ static void create_task_21_real(unsigned k, unsigned i)
 
				 		tag_11_dep = TAG11_SAVE(k);
			
 
				 	}
			
 
				 
			
 
				-	task->handles[0] = diag_block; 
			
 
				+	task->handles[0] = diag_block;
			
 
				 	task->handles[1] = STARPU_PLU(get_block_handle)(i, k);
			
 
				 
			
 
				 	STARPU_ASSERT(task->handles[0] != STARPU_POISON_PTR);
			
@@ -618,15 +653,18 @@ static void create_task_21_real(unsigned k, unsigned i)
 
				 	task->callback_func = callback_task_21_real;
			
 
				 	task->callback_arg = arg;
			
 
				 
			
 
				-	if (!no_prio && (i == k+1)) {
			
 
				+	if (!no_prio && (i == k+1))
			
 
				+	{
			
 
				 		task->priority = STARPU_MAX_PRIO;
			
 
				 	}
			
 
				 
			
 
				 	/* enforce dependencies ... */
			
 
				-	if (k > 0) {
			
 
				+	if (k > 0)
			
 
				+	{
			
 
				 		starpu_tag_declare_deps(TAG21(k, i), 2, tag_11_dep, TAG22(k-1, i, k));
			
 
				 	}
			
 
				-	else {
			
 
				+	else
			
 
				+	{
			
 
				 		starpu_tag_declare_deps(TAG21(k, i), 1, tag_11_dep);
			
 
				 	}
			
 
				 
			
@@ -639,25 +677,27 @@ static void create_task_21(unsigned k, unsigned i)
 
				 	if (get_block_rank(i, k) == rank)
			
 
				 	{
			
 
				 #ifdef VERBOSE_INIT
			
 
				-		fprintf(stderr, "CREATE real task 21(k = %d, i = %d) on node %d\n", k, i, rank);
			
 
				+		fprintf(stderr, "CREATE real task 21(k = %u, i = %u) on node %d\n", k, i, rank);
			
 
				 #endif
			
 
				 		create_task_21_real(k, i);
			
 
				 	}
			
 
				-	else {
			
 
				+	else
			
 
				+	{
			
 
				 		/* We don't handle the task, but perhaps we have to generate MPI transfers. */
			
 
				 		int rank_mask[world_size];
			
 
				 		find_nodes_using_21(k, i, rank_mask);
			
 
				-		
			
 
				+
			
 
				 		if (rank_mask[rank])
			
 
				 		{
			
 
				 #ifdef VERBOSE_INIT
			
 
				-			fprintf(stderr, "create RECV task 21(k = %d, i = %d) on node %d\n", k, i, rank);
			
 
				+			fprintf(stderr, "create RECV task 21(k = %u, i = %u) on node %d\n", k, i, rank);
			
 
				 #endif
			
 
				 			create_task_21_recv(k, i);
			
 
				 		}
			
 
				-		else {
			
 
				+		else
			
 
				+		{
			
 
				 #ifdef VERBOSE_INIT
			
 
				-			fprintf(stderr, "Node %d needs not 21(k=%d, i=%d)\n", rank, k,i);
			
 
				+			fprintf(stderr, "Node %d needs not 21(k=%u, i=%u)\n", rank, k,i);
			
 
				 #endif
			
 
				 		}
			
 
				 	}
			
@@ -679,7 +719,7 @@ static void create_task_22_real(unsigned k, unsigned i, unsigned j)
 
				 
			
 
				 	/* which sub-data is manipulated ? */
			
 
				 
			
 
				-	/* produced by TAG21_SAVE(k, i) */ 
			
 
				+	/* produced by TAG21_SAVE(k, i) */
			
 
				 	unsigned block21_is_local = (get_block_rank(i, k) == rank);
			
 
				 	starpu_tag_t tag_21_dep;
			
 
				 
			
@@ -689,7 +729,7 @@ static void create_task_22_real(unsigned k, unsigned i, unsigned j)
 
				 		block21 = STARPU_PLU(get_block_handle)(i, k);
			
 
				 		tag_21_dep = TAG21(k, i);
			
 
				 	}
			
 
				-	else 
			
 
				+	else
			
 
				 	{
			
 
				 #ifdef SINGLE_TMP1221
			
 
				 		block21 = STARPU_PLU(get_tmp_21_block_handle)(i);
			
@@ -710,7 +750,7 @@ static void create_task_22_real(unsigned k, unsigned i, unsigned j)
 
				 		block12 = STARPU_PLU(get_block_handle)(k, j);
			
 
				 		tag_12_dep = TAG12(k, j);
			
 
				 	}
			
 
				-	else 
			
 
				+	else
			
 
				 	{
			
 
				 #ifdef SINGLE_TMP1221
			
 
				 		block12 = STARPU_PLU(get_tmp_12_block_handle)(j);
			
@@ -722,7 +762,9 @@ static void create_task_22_real(unsigned k, unsigned i, unsigned j)
 
				 
			
 
				 
			
 
				 
			
 
				+#ifdef STARPU_DEVEL
			
 
				 #warning temporary fix :/
			
 
				+#endif
			
 
				 	//task->handles[0] = block21;
			
 
				 	task->handles[0] = block12;
			
 
				 
			
@@ -736,15 +778,18 @@ static void create_task_22_real(unsigned k, unsigned i, unsigned j)
 
				 	STARPU_ASSERT(task->handles[1] != STARPU_POISON_PTR);
			
 
				 	STARPU_ASSERT(task->handles[2] != STARPU_POISON_PTR);
			
 
				 
			
 
				-	if (!no_prio && (i == k + 1) && (j == k +1) ) {
			
 
				+	if (!no_prio && (i == k + 1) && (j == k +1) )
			
 
				+	{
			
 
				 		task->priority = STARPU_MAX_PRIO;
			
 
				 	}
			
 
				 
			
 
				 	/* enforce dependencies ... */
			
 
				-	if (k > 0) {
			
 
				+	if (k > 0)
			
 
				+	{
			
 
				 		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), tag_12_dep, tag_21_dep);
			
 
				 	}
			
 
				-	else {
			
 
				+	else
			
 
				+	{
			
 
				 		starpu_tag_declare_deps(TAG22(k, i, j), 2, tag_12_dep, tag_21_dep);
			
 
				 	}
			
 
				 
			
@@ -759,7 +804,8 @@ static void create_task_22(unsigned k, unsigned i, unsigned j)
 
				 	//	fprintf(stderr, "CREATE real task 22(k = %d, i = %d, j = %d) on node %d\n", k, i, j, rank);
			
 
				 		create_task_22_real(k, i, j);
			
 
				 	}
			
 
				-//	else {
			
 
				+//	else
			
 
				+//	{
			
 
				 //		fprintf(stderr, "Node %d needs not 22(k=%d, i=%d, j = %d)\n", rank, k,i,j);
			
 
				 //	}
			
 
				 }
			
@@ -787,7 +833,7 @@ static void wait_termination(void)
 
				 			starpu_data_handle_t diag_block = STARPU_PLU(get_block_handle)(k, k);
			
 
				 			wait_tag_and_fetch_handle(TAG11_SAVE(k), diag_block);
			
 
				 		}
			
 
				-		
			
 
				+
			
 
				 
			
 
				 		for (i = k + 1; i < nblocks; i++)
			
 
				 		{
			
@@ -812,11 +858,11 @@ static void wait_termination(void)
 
				 				wait_tag_and_fetch_handle(TAG12_SAVE(k, j), block12);
			
 
				 			}
			
 
				 		}
			
 
				-	}	
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				 /*
			
 
				- *	code to bootstrap the factorization 
			
 
				+ *	code to bootstrap the factorization
			
 
				  */
			
 
				 
			
 
				 double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
			
@@ -833,6 +879,8 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
 
				 
			
 
				 	for (k = 0; k < nblocks; k++)
			
 
				 	{
			
 
				+		starpu_iteration_push(k);
			
 
				+
			
 
				 		create_task_11(k);
			
 
				 
			
 
				 		for (i = k+1; i<nblocks; i++)
			
@@ -848,6 +896,7 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
 
				 				create_task_22(k, i, j);
			
 
				 			}
			
 
				 		}
			
 
				+		starpu_iteration_pop();
			
 
				 	}
			
 
				 
			
 
				 	int barrier_ret = starpu_mpi_barrier(MPI_COMM_WORLD);
			
@@ -859,12 +908,12 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
 
				 	starpu_tag_notify_from_apps(STARPU_TAG_INIT);
			
 
				 
			
 
				 	wait_termination();
			
 
				-	
			
 
				+
			
 
				 	end = starpu_timing_now();
			
 
				 
			
 
				 	double timing = end - start;
			
 
				-	
			
 
				+
			
 
				 //	fprintf(stderr, "RANK %d -> took %f ms\n", rank, timing/1000);
			
 
				-	
			
 
				+
			
 
				 	return timing;
			
 
				 }
			
--- a/nmad/examples/mpi_lu/pxlu.h
+++ b/nmad/examples/mpi_lu/pxlu.h
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010, 2014  Université de Bordeaux
			
 
				- * Copyright (C) 2010, 2012, 2014  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2012, 2014, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -31,7 +31,8 @@
 
				 //#define SINGLE_TMP11	1
			
 
				 //#define SINGLE_TMP1221	1
			
 
				 
			
 
				-struct debug_info {
			
 
				+struct debug_info
			
 
				+{
			
 
				 	unsigned i;
			
 
				 	unsigned j;
			
 
				 	unsigned k;
			
--- a/nmad/examples/mpi_lu/pxlu_implicit.c
+++ b/nmad/examples/mpi_lu/pxlu_implicit.c
@@ -0,0 +1,184 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010-2011, 2013-2015, 2017  Université de Bordeaux
			
 
				+ * Copyright (C) 2010, 2012, 2013, 2017  CNRS
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "pxlu.h"
			
 
				+#include "pxlu_kernels.h"
			
 
				+#include <sys/time.h>
			
 
				+
			
 
				+//#define VERBOSE_INIT	1
			
 
				+
			
 
				+//#define DEBUG	1
			
 
				+
			
 
				+static unsigned no_prio = 0;
			
 
				+
			
 
				+static unsigned nblocks = 0;
			
 
				+static int rank = -1;
			
 
				+static int world_size = -1;
			
 
				+
			
 
				+struct callback_arg
			
 
				+{
			
 
				+	unsigned i, j, k;
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ *	Task 11 (diagonal factorization)
			
 
				+ */
			
 
				+
			
 
				+static void create_task_11(unsigned k)
			
 
				+{
			
 
				+	starpu_mpi_task_insert(MPI_COMM_WORLD,
			
 
				+			       &STARPU_PLU(cl11),
			
 
				+			       STARPU_VALUE, &k, sizeof(k),
			
 
				+			       STARPU_VALUE, &k, sizeof(k),
			
 
				+			       STARPU_VALUE, &k, sizeof(k),
			
 
				+			       STARPU_RW, STARPU_PLU(get_block_handle)(k, k),
			
 
				+			       STARPU_PRIORITY, !no_prio ?
			
 
				+			       STARPU_MAX_PRIO : STARPU_MIN_PRIO,
			
 
				+			       0);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ *	Task 12 (Update lower left (TRSM))
			
 
				+ */
			
 
				+
			
 
				+static void create_task_12(unsigned k, unsigned j)
			
 
				+{
			
 
				+#ifdef STARPU_DEVEL
			
 
				+#warning temporary fix 
			
 
				+#endif
			
 
				+	starpu_mpi_task_insert(MPI_COMM_WORLD,
			
 
				+			       //&STARPU_PLU(cl12),
			
 
				+			       &STARPU_PLU(cl21),
			
 
				+			       STARPU_VALUE, &j, sizeof(j),
			
 
				+			       STARPU_VALUE, &j, sizeof(j),
			
 
				+			       STARPU_VALUE, &k, sizeof(k),
			
 
				+			       STARPU_R, STARPU_PLU(get_block_handle)(k, k),
			
 
				+			       STARPU_RW, STARPU_PLU(get_block_handle)(k, j),
			
 
				+			       STARPU_PRIORITY, !no_prio && (j == k+1) ?
			
 
				+			       STARPU_MAX_PRIO : STARPU_MIN_PRIO,
			
 
				+			       0);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ *	Task 21 (Update upper right (TRSM))
			
 
				+ */
			
 
				+
			
 
				+static void create_task_21(unsigned k, unsigned i)
			
 
				+{
			
 
				+#ifdef STARPU_DEVEL
			
 
				+#warning temporary fix 
			
 
				+#endif
			
 
				+	starpu_mpi_task_insert(MPI_COMM_WORLD,
			
 
				+			       //&STARPU_PLU(cl21),
			
 
				+			       &STARPU_PLU(cl12),
			
 
				+			       STARPU_VALUE, &i, sizeof(i),
			
 
				+			       STARPU_VALUE, &i, sizeof(i),
			
 
				+			       STARPU_VALUE, &k, sizeof(k),
			
 
				+			       STARPU_R, STARPU_PLU(get_block_handle)(k, k),
			
 
				+			       STARPU_RW, STARPU_PLU(get_block_handle)(i, k),
			
 
				+			       STARPU_PRIORITY, !no_prio && (i == k+1) ?
			
 
				+			       STARPU_MAX_PRIO : STARPU_MIN_PRIO,
			
 
				+			       0);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ *	Task 22 (GEMM)
			
 
				+ */
			
 
				+
			
 
				+static void create_task_22(unsigned k, unsigned i, unsigned j)
			
 
				+{
			
 
				+	starpu_mpi_task_insert(MPI_COMM_WORLD,
			
 
				+			       &STARPU_PLU(cl22),
			
 
				+			       STARPU_VALUE, &i, sizeof(i),
			
 
				+			       STARPU_VALUE, &j, sizeof(j),
			
 
				+			       STARPU_VALUE, &k, sizeof(k),
			
 
				+			       STARPU_R, STARPU_PLU(get_block_handle)(k, j),
			
 
				+			       STARPU_R, STARPU_PLU(get_block_handle)(i, k),
			
 
				+			       STARPU_RW, STARPU_PLU(get_block_handle)(i, j),
			
 
				+			       STARPU_PRIORITY, !no_prio && (i == k + 1) && (j == k +1) ?
			
 
				+			       STARPU_MAX_PRIO : STARPU_MIN_PRIO,
			
 
				+			       0);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ *	code to bootstrap the factorization 
			
 
				+ */
			
 
				+
			
 
				+double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
			
 
				+{
			
 
				+	double start;
			
 
				+	double end;
			
 
				+
			
 
				+	nblocks = _nblocks;
			
 
				+	rank = _rank;
			
 
				+	world_size = _world_size;
			
 
				+
			
 
				+	/* create all the DAG nodes */
			
 
				+	unsigned i,j,k;
			
 
				+
			
 
				+	starpu_mpi_barrier(MPI_COMM_WORLD);
			
 
				+
			
 
				+	start = starpu_timing_now();
			
 
				+
			
 
				+	for (k = 0; k < nblocks; k++)
			
 
				+	{
			
 
				+		starpu_iteration_push(k);
			
 
				+
			
 
				+		create_task_11(k);
			
 
				+
			
 
				+		for (i = k+1; i<nblocks; i++)
			
 
				+		{
			
 
				+			create_task_12(k, i);
			
 
				+			create_task_21(k, i);
			
 
				+		}
			
 
				+
			
 
				+		starpu_mpi_cache_flush(MPI_COMM_WORLD, STARPU_PLU(get_block_handle)(k,k));
			
 
				+		if (get_block_rank(k, k) == _rank)
			
 
				+			starpu_data_wont_use(STARPU_PLU(get_block_handle)(k,k));
			
 
				+
			
 
				+		for (i = k+1; i<nblocks; i++)
			
 
				+		{
			
 
				+			for (j = k+1; j<nblocks; j++)
			
 
				+			{
			
 
				+				create_task_22(k, i, j);
			
 
				+			}
			
 
				+		}
			
 
				+
			
 
				+		for (i = k+1; i<nblocks; i++)
			
 
				+		{
			
 
				+			starpu_mpi_cache_flush(MPI_COMM_WORLD, STARPU_PLU(get_block_handle)(k,i));
			
 
				+			if (get_block_rank(k, i) == _rank)
			
 
				+				starpu_data_wont_use(STARPU_PLU(get_block_handle)(k,i));
			
 
				+			starpu_mpi_cache_flush(MPI_COMM_WORLD, STARPU_PLU(get_block_handle)(i,k));
			
 
				+			if (get_block_rank(i, k) == _rank)
			
 
				+				starpu_data_wont_use(STARPU_PLU(get_block_handle)(i,k));
			
 
				+		}
			
 
				+		starpu_iteration_pop();
			
 
				+	}
			
 
				+
			
 
				+	starpu_task_wait_for_all();
			
 
				+
			
 
				+	starpu_mpi_barrier(MPI_COMM_WORLD);
			
 
				+
			
 
				+	end = starpu_timing_now();
			
 
				+
			
 
				+	double timing = end - start;
			
 
				+	
			
 
				+//	fprintf(stderr, "RANK %d -> took %f ms\n", rank, timing/1000);
			
 
				+	
			
 
				+	return timing;
			
 
				+}
			
--- a/nmad/examples/mpi_lu/pxlu_kernels.c
+++ b/nmad/examples/mpi_lu/pxlu_kernels.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010, 2012  Université de Bordeaux
			
 
				- * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2012, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -44,8 +44,8 @@ static inline void STARPU_PLU(common_u22)(void *descr[],
 
				 	struct debug_info *info = _args;
			
 
				 
			
 
				 	int rank;
			
 
				-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-	fprintf(stderr, "KERNEL 22 %d - k = %d i = %d j = %d\n", rank, info->k, info->i, info->j);
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	fprintf(stderr, "KERNEL 22 %d - k = %u i = %u j = %u\n", rank, info->k, info->i, info->j);
			
 
				 #endif
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
@@ -53,7 +53,8 @@ static inline void STARPU_PLU(common_u22)(void *descr[],
 
				 	cudaError_t cures;
			
 
				 #endif
			
 
				 
			
 
				-	switch (s) {
			
 
				+	switch (s)
			
 
				+	{
			
 
				 		case 0:
			
 
				 			CPU_GEMM("N", "N", dy, dx, dz,
			
 
				 				(TYPE)-1.0, right, ld21, left, ld12,
			
@@ -80,7 +81,7 @@ static inline void STARPU_PLU(common_u22)(void *descr[],
 
				 			break;
			
 
				 	}
			
 
				 #ifdef VERBOSE_KERNELS
			
 
				-	fprintf(stderr, "KERNEL 22 %d - k = %d i = %d j = %d done\n", rank, info->k, info->i, info->j);
			
 
				+	fprintf(stderr, "KERNEL 22 %d - k = %u i = %u j = %u done\n", rank, info->k, info->i, info->j);
			
 
				 #endif
			
 
				 }
			
 
				 
			
@@ -96,7 +97,8 @@ static void STARPU_PLU(cublas_u22)(void *descr[], void *_args)
 
				 }
			
 
				 #endif// STARPU_USE_CUDA
			
 
				 
			
 
				-static struct starpu_perfmodel STARPU_PLU(model_22) = {
			
 
				+static struct starpu_perfmodel STARPU_PLU(model_22) =
			
 
				+{
			
 
				 	.type = STARPU_HISTORY_BASED,
			
 
				 #ifdef STARPU_ATLAS
			
 
				 	.symbol = STARPU_PLU_STR(lu_model_22_atlas)
			
@@ -107,7 +109,8 @@ static struct starpu_perfmodel STARPU_PLU(model_22) = {
 
				 #endif
			
 
				 };
			
 
				 
			
 
				-struct starpu_codelet STARPU_PLU(cl22) = {
			
 
				+struct starpu_codelet STARPU_PLU(cl22) =
			
 
				+{
			
 
				 	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {STARPU_PLU(cpu_u22)},
			
 
				 #ifdef STARPU_USE_CUDA
			
@@ -142,10 +145,10 @@ static inline void STARPU_PLU(common_u12)(void *descr[],
 
				 	struct debug_info *info = _args;
			
 
				 
			
 
				 	int rank;
			
 
				-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				 #warning fixed debugging according to other tweak
			
 
				-	//fprintf(stderr, "KERNEL 12 %d - k = %d i %d\n", rank, info->k, info->i);
			
 
				-	fprintf(stderr, "KERNEL 21 %d - k = %d i %d\n", rank, info->k, info->j);
			
 
				+	//fprintf(stderr, "KERNEL 12 %d - k = %u i %u\n", rank, info->k, info->i);
			
 
				+	fprintf(stderr, "KERNEL 21 %d - k = %u i %u\n", rank, info->k, info->j);
			
 
				 
			
 
				 	//fprintf(stderr, "INPUT 12 U11\n");
			
 
				 	fprintf(stderr, "INPUT 21 U11\n");
			
@@ -161,7 +164,8 @@ static inline void STARPU_PLU(common_u12)(void *descr[],
 
				 #endif
			
 
				 
			
 
				 	/* solve L11 U12 = A12 (find U12) */
			
 
				-	switch (s) {
			
 
				+	switch (s)
			
 
				+	{
			
 
				 		case 0:
			
 
				 			CPU_TRSM("L", "L", "N", "N", nx12, ny12,
			
 
				 					(TYPE)1.0, sub11, ld11, sub12, ld12);
			
@@ -204,7 +208,8 @@ static void STARPU_PLU(cublas_u12)(void *descr[], void *_args)
 
				 }
			
 
				 #endif // STARPU_USE_CUDA
			
 
				 
			
 
				-static struct starpu_perfmodel STARPU_PLU(model_12) = {
			
 
				+static struct starpu_perfmodel STARPU_PLU(model_12) =
			
 
				+{
			
 
				 	.type = STARPU_HISTORY_BASED,
			
 
				 #ifdef STARPU_ATLAS
			
 
				 	.symbol = STARPU_PLU_STR(lu_model_12_atlas)
			
@@ -215,7 +220,8 @@ static struct starpu_perfmodel STARPU_PLU(model_12) = {
 
				 #endif
			
 
				 };
			
 
				 
			
 
				-struct starpu_codelet STARPU_PLU(cl12) = {
			
 
				+struct starpu_codelet STARPU_PLU(cl12) =
			
 
				+{
			
 
				 	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {STARPU_PLU(cpu_u12)},
			
 
				 #ifdef STARPU_USE_CUDA
			
@@ -250,10 +256,10 @@ static inline void STARPU_PLU(common_u21)(void *descr[],
 
				 	struct debug_info *info = _args;
			
 
				 
			
 
				 	int rank;
			
 
				-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				 #warning fixed debugging according to other tweak
			
 
				-	//fprintf(stderr, "KERNEL 21 %d (k = %d, i = %d)\n", rank, info->k, info->i);
			
 
				-	fprintf(stderr, "KERNEL 12 %d (k = %d, j = %d)\n", rank, info->k, info->j);
			
 
				+	//fprintf(stderr, "KERNEL 21 %d (k = %u, i = %u)\n", rank, info->k, info->i);
			
 
				+	fprintf(stderr, "KERNEL 12 %d (k = %u, j = %u)\n", rank, info->k, info->j);
			
 
				 
			
 
				 	//fprintf(stderr, "INPUT 21 U11\n");
			
 
				 	fprintf(stderr, "INPUT 12 U11\n");
			
@@ -268,7 +274,8 @@ static inline void STARPU_PLU(common_u21)(void *descr[],
 
				 #endif
			
 
				 
			
 
				 
			
 
				-	switch (s) {
			
 
				+	switch (s)
			
 
				+	{
			
 
				 		case 0:
			
 
				 			CPU_TRSM("R", "U", "N", "U", nx21, ny21,
			
 
				 					(TYPE)1.0, sub11, ld11, sub21, ld21);
			
@@ -313,7 +320,8 @@ static void STARPU_PLU(cublas_u21)(void *descr[], void *_args)
 
				 }
			
 
				 #endif
			
 
				 
			
 
				-static struct starpu_perfmodel STARPU_PLU(model_21) = {
			
 
				+static struct starpu_perfmodel STARPU_PLU(model_21) =
			
 
				+{
			
 
				 	.type = STARPU_HISTORY_BASED,
			
 
				 #ifdef STARPU_ATLAS
			
 
				 	.symbol = STARPU_PLU_STR(lu_model_21_atlas)
			
@@ -324,7 +332,8 @@ static struct starpu_perfmodel STARPU_PLU(model_21) = {
 
				 #endif
			
 
				 };
			
 
				 
			
 
				-struct starpu_codelet STARPU_PLU(cl21) = {
			
 
				+struct starpu_codelet STARPU_PLU(cl21) =
			
 
				+{
			
 
				 	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {STARPU_PLU(cpu_u21)},
			
 
				 #ifdef STARPU_USE_CUDA
			
@@ -356,11 +365,12 @@ static inline void STARPU_PLU(common_u11)(void *descr[],
 
				 	struct debug_info *info = _args;
			
 
				 
			
 
				 	int rank;
			
 
				-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-	fprintf(stderr, "KERNEL 11 %d - k = %d\n", rank, info->k);
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	fprintf(stderr, "KERNEL 11 %d - k = %u\n", rank, info->k);
			
 
				 #endif
			
 
				 
			
 
				-	switch (s) {
			
 
				+	switch (s)
			
 
				+	{
			
 
				 		case 0:
			
 
				 			for (z = 0; z < nx; z++)
			
 
				 			{
			
@@ -403,7 +413,7 @@ static inline void STARPU_PLU(common_u11)(void *descr[],
 
				 			break;
			
 
				 	}
			
 
				 #ifdef VERBOSE_KERNELS
			
 
				-	fprintf(stderr, "KERNEL 11 %d - k = %d\n", rank, info->k);
			
 
				+	fprintf(stderr, "KERNEL 11 %d - k = %u\n", rank, info->k);
			
 
				 #endif
			
 
				 }
			
 
				 
			
@@ -419,7 +429,8 @@ static void STARPU_PLU(cublas_u11)(void *descr[], void *_args)
 
				 }
			
 
				 #endif// STARPU_USE_CUDA
			
 
				 
			
 
				-static struct starpu_perfmodel STARPU_PLU(model_11) = {
			
 
				+static struct starpu_perfmodel STARPU_PLU(model_11) =
			
 
				+{
			
 
				 	.type = STARPU_HISTORY_BASED,
			
 
				 #ifdef STARPU_ATLAS
			
 
				 	.symbol = STARPU_PLU_STR(lu_model_11_atlas)
			
@@ -430,7 +441,8 @@ static struct starpu_perfmodel STARPU_PLU(model_11) = {
 
				 #endif
			
 
				 };
			
 
				 
			
 
				-struct starpu_codelet STARPU_PLU(cl11) = {
			
 
				+struct starpu_codelet STARPU_PLU(cl11) =
			
 
				+{
			
 
				 	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {STARPU_PLU(cpu_u11)},
			
 
				 #ifdef STARPU_USE_CUDA
			
--- a/nmad/examples/mpi_lu/pxlu_kernels.h
+++ b/nmad/examples/mpi_lu/pxlu_kernels.h
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010, 2012, 2014  Université de Bordeaux
			
 
				- * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2012  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
--- a/nmad/examples/mpi_lu/slu_kernels.c
+++ b/nmad/examples/mpi_lu/slu_kernels.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010  Université de Bordeaux
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
--- a/nmad/examples/native_fortran/nf_basic_ring.f90
+++ b/nmad/examples/native_fortran/nf_basic_ring.f90
@@ -0,0 +1,108 @@
 
				+! StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+!
			
 
				+! Copyright (C) 2016  Inria
			
 
				+!
			
 
				+! StarPU is free software; you can redistribute it and/or modify
			
 
				+! it under the terms of the GNU Lesser General Public License as published by
			
 
				+! the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+! your option) any later version.
			
 
				+!
			
 
				+! StarPU is distributed in the hope that it will be useful, but
			
 
				+! WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+!
			
 
				+! See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+
			
 
				+program nf_basic_ring
			
 
				+        use iso_c_binding       ! C interfacing module
			
 
				+        use fstarpu_mod         ! StarPU interfacing module
			
 
				+        use fstarpu_mpi_mod     ! StarPU-MPI interfacing module
			
 
				+        implicit none
			
 
				+
			
 
				+        integer(c_int) :: ncpu
			
 
				+        integer(c_int) :: ret
			
 
				+        integer(c_int) :: rank,sz
			
 
				+        integer(c_int),target :: token = 42
			
 
				+        integer(c_int) :: nloops = 32
			
 
				+        integer(c_int) :: loop
			
 
				+        integer(c_int) :: tag
			
 
				+        integer(c_int) :: world
			
 
				+        integer(c_int) :: src,dst
			
 
				+        type(c_ptr) :: token_dh, st
			
 
				+
			
 
				+        ret = fstarpu_init(C_NULL_PTR)
			
 
				+        if (ret == -19) then
			
 
				+                stop 77
			
 
				+        else if (ret /= 0) then
			
 
				+                stop 1
			
 
				+        end if
			
 
				+
			
 
				+        ret = fstarpu_mpi_init(1)
			
 
				+        print *,"fstarpu_mpi_init status:", ret
			
 
				+        if (ret /= 0) then
			
 
				+                stop 1
			
 
				+        end if
			
 
				+
			
 
				+        ! stop there if no CPU worker available
			
 
				+        ncpu = fstarpu_cpu_worker_get_count()
			
 
				+        if (ncpu == 0) then
			
 
				+                call fstarpu_shutdown()
			
 
				+                ret = fstarpu_mpi_shutdown()
			
 
				+                stop 77
			
 
				+        end if
			
 
				+
			
 
				+        world = fstarpu_mpi_world_comm()
			
 
				+        rank = fstarpu_mpi_world_rank()
			
 
				+        sz = fstarpu_mpi_world_size()
			
 
				+        write(*,*) "rank=", rank,"size=",sz,"world=",world
			
 
				+        if (sz < 2) then
			
 
				+                call fstarpu_shutdown()
			
 
				+                ret = fstarpu_mpi_shutdown()
			
 
				+                stop 77
			
 
				+        end if
			
 
				+
			
 
				+        call fstarpu_variable_data_register(token_dh, 0, c_loc(token), c_sizeof(token))
			
 
				+
			
 
				+        st = fstarpu_mpi_status_alloc()
			
 
				+        do loop=1,nloops
			
 
				+                tag = loop*sz+rank
			
 
				+                token = 0
			
 
				+                if (loop == 1.and.rank == 0) then
			
 
				+                        write(*,*) "rank=", rank,"token=",token
			
 
				+                else
			
 
				+                        src = modulo((rank+sz-1),sz)
			
 
				+                        write(*,*) "rank=", rank,"recv--> src =", src, "tag =", tag
			
 
				+                        ret = fstarpu_mpi_recv(token_dh, src, tag, world, st)
			
 
				+                        if (ret /= 0) then
			
 
				+                                write(*,*) "fstarpu_mpi_recv failed"
			
 
				+                                stop 1
			
 
				+                        end if
			
 
				+                        write(*,*) "rank=", rank,"recv<--","token=",token
			
 
				+                        token = token+1
			
 
				+                end if
			
 
				+                if (loop == nloops.and.rank == (sz-1)) then
			
 
				+                        call fstarpu_data_acquire(token_dh, FSTARPU_R)
			
 
				+                        write(*,*) "finished: rank=", rank,"token=",token
			
 
				+                        call fstarpu_data_release(token_dh)
			
 
				+                else
			
 
				+                        dst = modulo((rank+1),sz)
			
 
				+                        write(*,*) "rank=", rank,"send--> dst =", dst, "tag =", tag+1
			
 
				+                        ret = fstarpu_mpi_send(token_dh, dst, tag+1, world) ! forward the token to rank dst
			
 
				+                        if (ret /= 0) then ! non-zero status: report the failed send and exit with code 1
			
 
				+                                write(*,*) "fstarpu_mpi_send failed"
			
 
				+                                stop 1
			
 
				+                        end if
			
 
				+                        write(*,*) "rank=", rank,"send<--"
			
 
				+                end if
			
 
				+        end do
			
 
				+        call fstarpu_mpi_status_free(st)
			
 
				+        call fstarpu_data_unregister(token_dh)
			
 
				+        call fstarpu_shutdown()
			
 
				+
			
 
				+        ret = fstarpu_mpi_shutdown()
			
 
				+        print *,"fstarpu_mpi_shutdown status:", ret
			
 
				+        if (ret /= 0) then
			
 
				+                stop 1
			
 
				+        end if
			
 
				+end program nf_basic_ring
			
 
				+
			
--- a/nmad/examples/native_fortran/nf_mm.f90
+++ b/nmad/examples/native_fortran/nf_mm.f90
@@ -0,0 +1,236 @@
 
				+! StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+!
			
 
				+! Copyright (C) 2016  Inria
			
 
				+!
			
 
				+! StarPU is free software; you can redistribute it and/or modify
			
 
				+! it under the terms of the GNU Lesser General Public License as published by
			
 
				+! the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+! your option) any later version.
			
 
				+!
			
 
				+! StarPU is distributed in the hope that it will be useful, but
			
 
				+! WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+!
			
 
				+! See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+
			
 
				+program nf_mm
			
 
				+        use iso_c_binding       ! C interfacing module
			
 
				+        use fstarpu_mod         ! StarPU interfacing module
			
 
				+        use fstarpu_mpi_mod     ! StarPU-MPI interfacing module
			
 
				+        use nf_mm_cl
			
 
				+        implicit none
			
 
				+
			
 
				+        logical, parameter :: verbose = .false.
			
 
				+        integer(c_int) :: comm_rank, comm_size, comm_world
			
 
				+        integer(c_int) :: N = 16, BS = 4, NB
			
 
				+        real(kind=c_double),allocatable,target :: A(:,:), B(:,:), C(:,:)
			
 
				+        type(c_ptr),allocatable :: dh_A(:), dh_B(:), dh_C(:,:)
			
 
				+        type(c_ptr) :: cl_mm
			
 
				+        integer(c_int) :: ncpu
			
 
				+        integer(c_int) :: ret
			
 
				+        integer(c_int) :: row, col
			
 
				+        integer(c_int) :: b_row, b_col
			
 
				+        integer(c_int) :: mr, tag, rank
			
 
				+
			
 
				+        ret = fstarpu_init(C_NULL_PTR)
			
 
				+        if (ret == -19) then
			
 
				+                stop 77
			
 
				+        else if (ret /= 0) then
			
 
				+                stop 1
			
 
				+        end if
			
 
				+
			
 
				+        ret = fstarpu_mpi_init(1)
			
 
				+        print *,"fstarpu_mpi_init status:", ret
			
 
				+        if (ret /= 0) then
			
 
				+                stop 1
			
 
				+        end if
			
 
				+
			
 
				+        ! stop there if no CPU worker available
			
 
				+        ncpu = fstarpu_cpu_worker_get_count()
			
 
				+        if (ncpu == 0) then ! no CPU worker: skip the test (exit code 77)
			
 
				+                call fstarpu_shutdown(); ret = fstarpu_mpi_shutdown() ! StarPU-MPI was initialized above
			
 
				+                stop 77
			
 
				+        end if
			
 
				+
			
 
				+        comm_world = fstarpu_mpi_world_comm()
			
 
				+        comm_size = fstarpu_mpi_world_size()
			
 
				+        comm_rank = fstarpu_mpi_world_rank()
			
 
				+
			
 
				+        if (comm_size < 2) then
			
 
				+                call fstarpu_shutdown()
			
 
				+                ret = fstarpu_mpi_shutdown()
			
 
				+                stop 77
			
 
				+        end if
			
 
				+
			
 
				+        ! TODO: process app's argc/argv
			
 
				+        NB = N/BS
			
 
				+
			
 
				+        ! allocate and initialize codelet
			
 
				+        cl_mm = fstarpu_codelet_allocate()
			
 
				+        call fstarpu_codelet_set_name(cl_mm, c_char_"nf_mm_cl"//c_null_char)
			
 
				+        call fstarpu_codelet_add_cpu_func(cl_mm, C_FUNLOC(cl_cpu_mult))
			
 
				+        call fstarpu_codelet_add_buffer(cl_mm, FSTARPU_R)
			
 
				+        call fstarpu_codelet_add_buffer(cl_mm, FSTARPU_R)
			
 
				+        call fstarpu_codelet_add_buffer(cl_mm, FSTARPU_RW)
			
 
				+
			
 
				+        ! allocate matrices
			
 
				+        if (comm_rank == 0) then
			
 
				+                allocate(A(N,N))
			
 
				+                allocate(B(N,N))
			
 
				+                allocate(C(N,N))
			
 
				+        end if
			
 
				+
			
 
				+        ! init matrices
			
 
				+        if (comm_rank == 0) then
			
 
				+                do col=1,N
			
 
				+                do row=1,N
			
 
				+                if (row == col) then
			
 
				+                        A(row,col) = 2
			
 
				+                else
			
 
				+                        A(row,col) = 0
			
 
				+                end if
			
 
				+                B(row,col) = row*N+col
			
 
				+                C(row,col) = 0
			
 
				+                end do
			
 
				+                end do
			
 
				+
			
 
				+                if (verbose) then
			
 
				+                        print *,"A"
			
 
				+                        call mat_disp(A)
			
 
				+                        print *,"B"
			
 
				+                        call mat_disp(B)
			
 
				+                        print *,"C"
			
 
				+                        call mat_disp(C)
			
 
				+                end if
			
 
				+        end if
			
 
				+
			
 
				+        ! allocate data handles
			
 
				+        allocate(dh_A(NB))
			
 
				+        allocate(dh_B(NB))
			
 
				+        allocate(dh_C(NB,NB))
			
 
				+
			
 
				+        ! register matrices
			
 
				+        if (comm_rank == 0) then
			
 
				+                mr = 0 ! TODO: use STARPU_MAIN_RAM constant
			
 
				+        else
			
 
				+                mr = -1
			
 
				+        end if
			
 
				+        tag = 0
			
 
				+
			
 
				+        do b_row=1,NB
			
 
				+                if (comm_rank == 0) then
			
 
				+                        call fstarpu_matrix_data_register(dh_A(b_row), mr, &
			
 
				+                                c_loc( A(1+(b_row-1)*BS,1) ), N, BS, N, c_sizeof(A(1,1)))
			
 
				+                else
			
 
				+                        call fstarpu_matrix_data_register(dh_A(b_row), mr, &
			
 
				+                                c_null_ptr, N, BS, N, c_sizeof(A(1,1)))
			
 
				+                end if
			
 
				+                call fstarpu_mpi_data_register(dh_A(b_row), tag, 0)
			
 
				+                tag = tag+1
			
 
				+        end do
			
 
				+
			
 
				+        do b_col=1,NB
			
 
				+                if (comm_rank == 0) then
			
 
				+                        call fstarpu_matrix_data_register(dh_B(b_col), mr, &
			
 
				+                                c_loc( B(1,1+(b_col-1)*BS) ), N, N, BS, c_sizeof(B(1,1)))
			
 
				+                else
			
 
				+                        call fstarpu_matrix_data_register(dh_B(b_col), mr, &
			
 
				+                                c_null_ptr, N, N, BS, c_sizeof(B(1,1)))
			
 
				+                end if
			
 
				+                call fstarpu_mpi_data_register(dh_B(b_col), tag, 0)
			
 
				+                tag = tag+1
			
 
				+        end do
			
 
				+
			
 
				+        do b_col=1,NB
			
 
				+        do b_row=1,NB
			
 
				+                if (comm_rank == 0) then
			
 
				+                        call fstarpu_matrix_data_register(dh_C(b_row,b_col), mr, &
			
 
				+                                c_loc( C(1+(b_row-1)*BS,1+(b_col-1)*BS) ), N, BS, BS, c_sizeof(C(1,1)))
			
 
				+                else
			
 
				+                        call fstarpu_matrix_data_register(dh_C(b_row,b_col), mr, &
			
 
				+                                c_null_ptr, N, BS, BS, c_sizeof(C(1,1)))
			
 
				+                end if
			
 
				+                call fstarpu_mpi_data_register(dh_C(b_row,b_col), tag, 0)
			
 
				+                tag = tag+1
			
 
				+        end do
			
 
				+        end do
			
 
				+
			
 
				+        ! distribute matrix C
			
 
				+        do b_col=1,NB
			
 
				+        do b_row=1,NB
			
 
				+        rank = modulo(b_row+b_col, comm_size)
			
 
				+        call fstarpu_mpi_data_migrate(comm_world, dh_c(b_row,b_col), rank)
			
 
				+        end do
			
 
				+        end do
			
 
				+
			
 
				+        do b_col=1,NB
			
 
				+        do b_row=1,NB
			
 
				+                ret = fstarpu_mpi_task_insert(comm_world, (/ cl_mm, &
			
 
				+                        FSTARPU_R,  dh_A(b_row), &
			
 
				+                        FSTARPU_R,  dh_B(b_col), &
			
 
				+                        FSTARPU_RW, dh_C(b_row,b_col), &
			
 
				+                        C_NULL_PTR /))
			
 
				+        end do
			
 
				+        end do
			
 
				+
			
 
				+        call fstarpu_task_wait_for_all()
			
 
				+
			
 
				+        ! undistribute matrix C
			
 
				+        do b_col=1,NB
			
 
				+        do b_row=1,NB
			
 
				+        call fstarpu_mpi_data_migrate(comm_world, dh_c(b_row,b_col), 0)
			
 
				+        end do
			
 
				+        end do
			
 
				+
			
 
				+        ! unregister matrices
			
 
				+        do b_row=1,NB
			
 
				+                call fstarpu_data_unregister(dh_A(b_row))
			
 
				+        end do
			
 
				+
			
 
				+        do b_col=1,NB
			
 
				+                call fstarpu_data_unregister(dh_B(b_col))
			
 
				+        end do
			
 
				+
			
 
				+        do b_col=1,NB
			
 
				+        do b_row=1,NB
			
 
				+                call fstarpu_data_unregister(dh_C(b_row,b_col))
			
 
				+        end do
			
 
				+        end do
			
 
				+
			
 
				+        ! check result
			
 
				+        if (comm_rank == 0) then
			
 
				+                if (verbose) then
			
 
				+                        print *,"final C"
			
 
				+                        call mat_disp(C)
			
 
				+                end if
			
 
				+
			
 
				+                do col=1,N
			
 
				+                do row=1,N
			
 
				+                if (abs(C(row,col) - 2*(row*N+col)) > 1.0) then
			
 
				+                        print *, "check failed"
			
 
				+                        stop 1
			
 
				+                end if
			
 
				+                end do
			
 
				+                end do
			
 
				+        end if
			
 
				+
			
 
				+        ! free handles
			
 
				+        deallocate(dh_A)
			
 
				+        deallocate(dh_B)
			
 
				+        deallocate(dh_C)
			
 
				+
			
 
				+        ! free matrices
			
 
				+        if (comm_rank == 0) then
			
 
				+                deallocate(A)
			
 
				+                deallocate(B)
			
 
				+                deallocate(C)
			
 
				+        end if
			
 
				+        call fstarpu_codelet_free(cl_mm)
			
 
				+        call fstarpu_shutdown()
			
 
				+
			
 
				+        ret = fstarpu_mpi_shutdown()
			
 
				+        print *,"fstarpu_mpi_shutdown status:", ret
			
 
				+        if (ret /= 0) then
			
 
				+                stop 1
			
 
				+        end if
			
 
				+end program nf_mm
			
--- a/nmad/examples/native_fortran/nf_mm_cl.f90
+++ b/nmad/examples/native_fortran/nf_mm_cl.f90
@@ -0,0 +1,90 @@
 
				+! StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+!
			
 
				+! Copyright (C) 2016  Inria
			
 
				+!
			
 
				+! StarPU is free software; you can redistribute it and/or modify
			
 
				+! it under the terms of the GNU Lesser General Public License as published by
			
 
				+! the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+! your option) any later version.
			
 
				+!
			
 
				+! StarPU is distributed in the hope that it will be useful, but
			
 
				+! WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+!
			
 
				+! See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+
			
 
				+module nf_mm_cl
			
 
				+contains
			
 
				+subroutine mat_disp (m) ! print matrix m on standard output, one matrix row per output line
			
 
				+        ! declared here so it can be used both for the
			
 
				+        ! program and for debugging codelet routines
			
 
				+
			
 
				+        use iso_c_binding       ! C interfacing module
			
 
				+        implicit none
			
 
				+        real(kind=c_double) :: m(:,:) ! matrix to display (assumed shape)
			
 
				+        integer i,j ! row and column indices
			
 
				+        ! each row is printed as "| v v ... |", every value using format F6.1
			
 
				+        do i=lbound(m,1),ubound(m,1)
			
 
				+                write(*, fmt="(A2) ",advance="no") "| "
			
 
				+        do j=lbound(m,2),ubound(m,2)
			
 
				+                write(*, fmt="(F6.1,A1) ", advance="no") m(i,j)," "
			
 
				+        end do
			
 
				+                write(*,*) "|" ! close the row and end the output line
			
 
				+        end do
			
 
				+        write(*,*) ! blank line after the matrix
			
 
				+
			
 
				+end subroutine
			
 
				+
			
 
				+recursive subroutine cl_cpu_mult (buffers, cl_args) bind(C) ! CPU kernel: accumulates A*B into C
			
 
				+        use iso_c_binding       ! C interfacing module
			
 
				+        use fstarpu_mod         ! StarPU interfacing module
			
 
				+        implicit none
			
 
				+        ! buffers gives access to three matrix buffers: index 0 is A, 1 is B, 2 is C
			
 
				+        type(c_ptr), value, intent(in) :: buffers, cl_args ! cl_args is unused
			
 
				+        real(kind=c_double),pointer :: A(:,:), B(:,:), C(:,:) ! Fortran array views of the buffers
			
 
				+        integer :: ld_A,nx_A,ny_A
			
 
				+        integer :: ld_B,nx_B,ny_B
			
 
				+        integer :: ld_C,nx_C,ny_C
			
 
				+        integer :: i,j,k
			
 
				+        ! query ld, nx and ny of each matrix buffer
			
 
				+        ld_A = fstarpu_matrix_get_ld(buffers, 0)
			
 
				+        ld_B = fstarpu_matrix_get_ld(buffers, 1)
			
 
				+        ld_C = fstarpu_matrix_get_ld(buffers, 2)
			
 
				+
			
 
				+        nx_A = fstarpu_matrix_get_nx(buffers, 0)
			
 
				+        nx_B = fstarpu_matrix_get_nx(buffers, 1)
			
 
				+        nx_C = fstarpu_matrix_get_nx(buffers, 2)
			
 
				+
			
 
				+        ny_A = fstarpu_matrix_get_ny(buffers, 0)
			
 
				+        ny_B = fstarpu_matrix_get_ny(buffers, 1)
			
 
				+        ny_C = fstarpu_matrix_get_ny(buffers, 2)
			
 
				+        ! the extents must be compatible for C = C + A*B; any mismatch stops with exit code 1
			
 
				+        if (ny_C /= ny_B) then
			
 
				+                write(*,*) "C -- B column mismatch"
			
 
				+                stop 1
			
 
				+        end if
			
 
				+
			
 
				+        if (nx_C /= nx_A) then
			
 
				+                write(*,*) "C -- A row mismatch"
			
 
				+                stop 1
			
 
				+        end if
			
 
				+
			
 
				+        if (ny_A /= nx_B) then
			
 
				+                write(*,*) "A -- B col/row mismatch"
			
 
				+                stop 1
			
 
				+        end if
			
 
				+        ! associate the Fortran pointers with the raw buffers, each with shape (ld, ny)
			
 
				+        call c_f_pointer(fstarpu_matrix_get_ptr(buffers, 0), A, shape=[ld_A,ny_A])
			
 
				+        call c_f_pointer(fstarpu_matrix_get_ptr(buffers, 1), B, shape=[ld_B,ny_B])
			
 
				+        call c_f_pointer(fstarpu_matrix_get_ptr(buffers, 2), C, shape=[ld_C,ny_C])
			
 
				+        ! C(j,k) += A(j,i)*B(i,k), with k in 1..ny_C, j in 1..nx_C and i in 1..nx_B
			
 
				+        do k = 1, ny_C
			
 
				+        do j = 1, nx_C
			
 
				+        do i = 1, nx_B
			
 
				+                C(j,k) = C(j,k) + A(j,i) * B(i,k)
			
 
				+        end do
			
 
				+        end do
			
 
				+        end do
			
 
				+
			
 
				+end subroutine cl_cpu_mult
			
 
				+end module nf_mm_cl
			
--- a/nmad/examples/perf.sh
+++ b/nmad/examples/perf.sh
@@ -3,7 +3,7 @@
 
				 # StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				 # 
			
 
				 # Copyright (C) 2010  Université de Bordeaux
			
 
				-# Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+# Copyright (C) 2010  CNRS
			
 
				 # 
			
 
				 # StarPU is free software; you can redistribute it and/or modify
			
 
				 # it under the terms of the GNU Lesser General Public License as published by
			
--- a/nmad/examples/stencil/stencil5.c
+++ b/nmad/examples/stencil/stencil5.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2015              Université Bordeaux
			
 
				- * Copyright (C) 2011, 2012, 2013, 2014, 2015  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011, 2013, 2015-2017              Université Bordeaux
			
 
				+ * Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -20,7 +20,7 @@
 
				 
			
 
				 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
			
 
				 #define FPRINTF_MPI(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) { \
			
 
				-    						int _disp_rank; MPI_Comm_rank(MPI_COMM_WORLD, &_disp_rank);       \
			
 
				+    						int _disp_rank; starpu_mpi_comm_rank(MPI_COMM_WORLD, &_disp_rank);       \
			
 
				                                                 fprintf(ofile, "[%d][starpu_mpi][%s] " fmt , _disp_rank, __starpu_func__ ,## __VA_ARGS__); \
			
 
				                                                 fflush(ofile); }} while(0);
			
 
				 
			
@@ -37,15 +37,35 @@ void stencil5_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 
				 //	fprintf(stdout, "VALUES: %2.2f %2.2f %2.2f %2.2f %2.2f\n", *xy, *xm1y, *xp1y, *xym1, *xyp1);
			
 
				 }
			
 
				 
			
 
				+/* Dummy performance model for simgrid: returns the same constant cost whatever the task or implementation */
			
 
				+static double stencil5_cost_function(struct starpu_task *task, unsigned nimpl)
			
 
				+{
			
 
				+	(void) task;	/* unused */
			
 
				+	(void) nimpl;	/* unused */
			
 
				+	return 0.000001;
			
 
				+}
			
 
				+
			
 
				+static struct starpu_perfmodel stencil5_model =
			
 
				+{
			
 
				+	.type = STARPU_COMMON,
			
 
				+	.cost_function = stencil5_cost_function,
			
 
				+	.symbol = "stencil5"
			
 
				+};
			
 
				+
			
 
				 struct starpu_codelet stencil5_cl =
			
 
				 {
			
 
				 	.cpu_funcs = {stencil5_cpu},
			
 
				 	.nbuffers = 5,
			
 
				-	.modes = {STARPU_RW, STARPU_R, STARPU_R, STARPU_R, STARPU_R}
			
 
				+	.modes = {STARPU_RW, STARPU_R, STARPU_R, STARPU_R, STARPU_R},
			
 
				+	.model = &stencil5_model
			
 
				 };
			
 
				 
			
 
				 #ifdef STARPU_QUICK_CHECK
			
 
				-#  define NITER_DEF	100
			
 
				+#  define NITER_DEF	10
			
 
				+#  define X         	2
			
 
				+#  define Y         	2
			
 
				+#elif !defined(STARPU_LONG_CHECK)
			
 
				+#  define NITER_DEF	10
			
 
				 #  define X         	5
			
 
				 #  define Y         	5
			
 
				 #else
			
@@ -96,9 +116,18 @@ int main(int argc, char **argv)
 
				 
			
 
				 	int ret = starpu_init(NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				-	starpu_mpi_init(&argc, &argv, 1);
			
 
				-	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
			
 
				-	MPI_Comm_size(MPI_COMM_WORLD, &size);
			
 
				+	ret = starpu_mpi_init(&argc, &argv, 1);
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &my_rank);
			
 
				+	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+
			
 
				+	if (starpu_cpu_worker_get_count() == 0)
			
 
				+	{
			
 
				+		FPRINTF(stderr, "We need at least 1 CPU worker.\n");
			
 
				+		starpu_mpi_shutdown();
			
 
				+		starpu_shutdown();
			
 
				+		return 77;
			
 
				+	}
			
 
				 
			
 
				 	parse_args(argc, argv);
			
 
				 
			
@@ -136,14 +165,14 @@ int main(int argc, char **argv)
 
				 			int mpi_rank = my_distrib(x, y, size);
			
 
				 			if (mpi_rank == my_rank)
			
 
				 			{
			
 
				-				//fprintf(stderr, "[%d] Owning data[%d][%d]\n", my_rank, x, y);
			
 
				+				//FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", my_rank, x, y);
			
 
				 				starpu_variable_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[x][y]), sizeof(float));
			
 
				 			}
			
 
				 			else if (my_rank == my_distrib(x+1, y, size) || my_rank == my_distrib(x-1, y, size)
			
 
				 				 || my_rank == my_distrib(x, y+1, size) || my_rank == my_distrib(x, y-1, size))
			
 
				 			{
			
 
				 				/* I don't own that index, but will need it for my computations */
			
 
				-				//fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", my_rank, x, y);
			
 
				+				//FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", my_rank, x, y);
			
 
				 				starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(float));
			
 
				 			}
			
 
				 			else
			
@@ -153,6 +182,7 @@ int main(int argc, char **argv)
 
				 			}
			
 
				 			if (data_handles[x][y])
			
 
				 			{
			
 
				+				starpu_data_set_coordinates(data_handles[x][y], 2, x, y);
			
 
				 				starpu_mpi_data_register(data_handles[x][y], (y*X)+x, mpi_rank);
			
 
				 			}
			
 
				 		}
			
@@ -161,18 +191,21 @@ int main(int argc, char **argv)
 
				 	/* First computation with initial distribution */
			
 
				 	for(loop=0 ; loop<niter; loop++)
			
 
				 	{
			
 
				+		starpu_iteration_push(loop);
			
 
				+
			
 
				 		for (x = 1; x < X-1; x++)
			
 
				 		{
			
 
				 			for (y = 1; y < Y-1; y++)
			
 
				 			{
			
 
				-				starpu_mpi_insert_task(MPI_COMM_WORLD, &stencil5_cl, STARPU_RW, data_handles[x][y],
			
 
				+				starpu_mpi_task_insert(MPI_COMM_WORLD, &stencil5_cl, STARPU_RW, data_handles[x][y],
			
 
				 						       STARPU_R, data_handles[x-1][y], STARPU_R, data_handles[x+1][y],
			
 
				 						       STARPU_R, data_handles[x][y-1], STARPU_R, data_handles[x][y+1],
			
 
				 						       0);
			
 
				 			}
			
 
				 		}
			
 
				+		starpu_iteration_pop();
			
 
				 	}
			
 
				-	fprintf(stderr, "Waiting ...\n");
			
 
				+	FPRINTF(stderr, "Waiting ...\n");
			
 
				 	starpu_task_wait_for_all();
			
 
				 
			
 
				 	/* Now migrate data to a new distribution */
			
@@ -192,33 +225,31 @@ int main(int argc, char **argv)
 
				 				starpu_mpi_data_register(data_handles[x][y], (y*X)+x, mpi_rank);
			
 
				 			}
			
 
				 			if (data_handles[x][y] && mpi_rank != starpu_mpi_data_get_rank(data_handles[x][y]))
			
 
				-			{
			
 
				 				/* Migrate the data */
			
 
				-				starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, data_handles[x][y], mpi_rank, NULL, NULL);
			
 
				-				/* And register new rank of the matrix */
			
 
				-				starpu_mpi_data_set_rank(data_handles[x][y], mpi_rank);
			
 
				-			}
			
 
				+				starpu_mpi_data_migrate(MPI_COMM_WORLD, data_handles[x][y], mpi_rank);
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				 	/* Second computation with new distribution */
			
 
				 	for(loop=0 ; loop<niter; loop++)
			
 
				 	{
			
 
				+		starpu_iteration_push(niter + loop);
			
 
				+
			
 
				 		for (x = 1; x < X-1; x++)
			
 
				 		{
			
 
				 			for (y = 1; y < Y-1; y++)
			
 
				 			{
			
 
				-				starpu_mpi_insert_task(MPI_COMM_WORLD, &stencil5_cl, STARPU_RW, data_handles[x][y],
			
 
				+				starpu_mpi_task_insert(MPI_COMM_WORLD, &stencil5_cl, STARPU_RW, data_handles[x][y],
			
 
				 						       STARPU_R, data_handles[x-1][y], STARPU_R, data_handles[x+1][y],
			
 
				 						       STARPU_R, data_handles[x][y-1], STARPU_R, data_handles[x][y+1],
			
 
				 						       0);
			
 
				 			}
			
 
				 		}
			
 
				+		starpu_iteration_pop();
			
 
				 	}
			
 
				-	fprintf(stderr, "Waiting ...\n");
			
 
				+	FPRINTF(stderr, "Waiting ...\n");
			
 
				 	starpu_task_wait_for_all();
			
 
				 
			
 
				-
			
 
				 	/* Unregister data */
			
 
				 	for(x = 0; x < X; x++)
			
 
				 	{
			
@@ -228,9 +259,7 @@ int main(int argc, char **argv)
 
				 			{
			
 
				 				int mpi_rank = my_distrib(x, y, size);
			
 
				 				/* Get back data to original place where the user-provided buffer is. */
			
 
				-				starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, data_handles[x][y], mpi_rank, NULL, NULL);
			
 
				-				/* Register original rank of the matrix (although useless) */
			
 
				-				starpu_mpi_data_set_rank(data_handles[x][y], mpi_rank);
			
 
				+				starpu_mpi_data_migrate(MPI_COMM_WORLD, data_handles[x][y], mpi_rank);
			
 
				 				/* And unregister it */
			
 
				 				starpu_data_unregister(data_handles[x][y]);
			
 
				 			}
			
@@ -242,15 +271,15 @@ int main(int argc, char **argv)
 
				 
			
 
				 	if (display)
			
 
				 	{
			
 
				-		fprintf(stdout, "[%d] mean=%2.2f\n", my_rank, mean);
			
 
				+		FPRINTF(stdout, "[%d] mean=%2.2f\n", my_rank, mean);
			
 
				 		for(x = 0; x < X; x++)
			
 
				 		{
			
 
				-			fprintf(stdout, "[%d] ", my_rank);
			
 
				+			FPRINTF(stdout, "[%d] ", my_rank);
			
 
				 			for (y = 0; y < Y; y++)
			
 
				 			{
			
 
				-				fprintf(stdout, "%2.2f ", matrix[x][y]);
			
 
				+				FPRINTF(stdout, "%2.2f ", matrix[x][y]);
			
 
				 			}
			
 
				-			fprintf(stdout, "\n");
			
 
				+			FPRINTF(stdout, "\n");
			
 
				 		}
			
 
				 	}
			
 
				 
			
--- a/nmad/examples/user_datatype/my_interface.c
+++ b/nmad/examples/user_datatype/my_interface.c
@@ -0,0 +1,229 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2015, 2016, 2017  CNRS
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+
			
 
				+#include "my_interface.h"
			
 
				+
			
 
				+/* CPU codelet: print the character and the integer held by the first
				+ * buffer on stderr, prefixed by the label packed in the codelet arguments.
				+ *
				+ * descr[0] is read through the STARPU_MY_INTERFACE_GET_* accessors.
				+ * _args may be NULL, in which case no label is unpacked. */
				+void starpu_my_interface_display_codelet_cpu(void *descr[], void *_args)
				+{
				+	char c = STARPU_MY_INTERFACE_GET_CHAR(descr[0]);
				+	int d = STARPU_MY_INTERFACE_GET_INT(descr[0]);
				+	/* Zero-initialised so the buffer is always a valid C string. */
				+	char msg[100] = "";
				+
				+	if (_args)
				+		starpu_codelet_unpack_args(_args, &msg);
				+
				+	/* Passing NULL to %s is undefined behaviour, so print an explicit
				+	 * placeholder when there is no label. */
				+	fprintf(stderr, "[%s] My value = '%c' %d\n", _args ? msg : "(null)", c, d);
				+}
			
 
				+
			
 
				+/* CPU codelet: compare the two buffers field by field and store 1 through
				+ * the int pointer packed in the codelet arguments when both the integer
				+ * and the character match, 0 otherwise. */
				+void starpu_my_interface_compare_codelet_cpu(void *descr[], void *_args)
				+{
				+	int *result;
				+	int same_int;
				+	int same_char;
				+
				+	starpu_codelet_unpack_args(_args, &result);
				+
				+	same_int = STARPU_MY_INTERFACE_GET_INT(descr[0]) == STARPU_MY_INTERFACE_GET_INT(descr[1]);
				+	same_char = STARPU_MY_INTERFACE_GET_CHAR(descr[0]) == STARPU_MY_INTERFACE_GET_CHAR(descr[1]);
				+
				+	*result = (same_int && same_char);
				+}
			
 
				+
			
 
				+static struct starpu_my_interface *myinterface = NULL;
			
 
				+
			
 
				+/* Build and commit the MPI struct datatype describing
				+ * struct starpu_my_interface: one MPI_INT at offset 0 followed by one
				+ * MPI_CHAR at the offset of the `c' member.
				+ *
				+ * mpi_datatype receives the committed datatype; the caller releases it
				+ * with starpu_my_interface_datatype_free(). */
				+void _starpu_my_interface_datatype_allocate(MPI_Datatype *mpi_datatype)
				+{
				+	int ret;
				+
				+	int blocklengths[2] = {1, 1};
				+	MPI_Aint displacements[2];
				+	MPI_Datatype types[2] = {MPI_INT, MPI_CHAR};
				+	/* Only used to measure member offsets: a local object avoids an
				+	 * unchecked heap allocation that leaked whenever this function was
				+	 * called more than once. */
				+	struct starpu_my_interface probe;
				+
				+	/* MPI_Get_address replaces MPI_Address, which is deprecated and was
				+	 * removed in MPI-3. */
				+	MPI_Get_address(&probe, displacements);
				+	MPI_Get_address(&probe.c, displacements+1);
				+	displacements[1] -= displacements[0];
				+	displacements[0] = 0;
				+
				+	ret = MPI_Type_create_struct(2, blocklengths, displacements, types, mpi_datatype);
				+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_create_struct failed");
				+
				+	ret = MPI_Type_commit(mpi_datatype);
				+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_commit failed");
				+}
			
 
				+
			
 
				+/* Handle-taking wrapper around _starpu_my_interface_datatype_allocate():
				+ * the datatype does not depend on the handle, which is ignored. */
				+void starpu_my_interface_datatype_allocate(starpu_data_handle_t handle, MPI_Datatype *mpi_datatype)
				+{
				+	_starpu_my_interface_datatype_allocate(mpi_datatype);
				+	(void)handle;
				+}
			
 
				+
			
 
				+/* Release an MPI datatype created by the allocate functions above,
				+ * together with the scratch object referenced by `myinterface'. */
				+void starpu_my_interface_datatype_free(MPI_Datatype *mpi_datatype)
				+{
				+	MPI_Type_free(mpi_datatype);
				+	free(myinterface);
				+	/* This function may run several times (it is both called directly
				+	 * and registered as a callback): clear the pointer so a later call
				+	 * does not free the same block twice. */
				+	myinterface = NULL;
				+}
			
 
				+
			
 
				+/* Return the integer field of the handle's interface on STARPU_MAIN_RAM. */
				+int starpu_my_interface_get_int(starpu_data_handle_t handle)
				+{
				+	struct starpu_my_interface *ram_copy;
				+
				+	ram_copy = (struct starpu_my_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
				+	return ram_copy->d;
				+}
			
 
				+
			
 
				+/* Return the character field of the handle's interface on STARPU_MAIN_RAM. */
				+char starpu_my_interface_get_char(starpu_data_handle_t handle)
				+{
				+	struct starpu_my_interface *ram_copy;
				+
				+	ram_copy = (struct starpu_my_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
				+	return ram_copy->c;
				+}
			
 
				+
			
 
				+/* Initialise the per-node interfaces of a freshly registered handle:
				+ * the home node receives the values supplied in data_interface, every
				+ * other node is zeroed. */
				+static void data_register_data_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface)
				+{
				+	struct starpu_my_interface *initial = (struct starpu_my_interface *) data_interface;
				+	unsigned n;
				+
				+	for (n = 0; n < STARPU_MAXNODES; n++)
				+	{
				+		struct starpu_my_interface *replica = (struct starpu_my_interface *)
				+			starpu_data_get_interface_on_node(handle, n);
				+		int is_home = (n == home_node);
				+
				+		replica->d = is_home ? initial->d : 0;
				+		replica->c = is_home ? initial->c : 0;
				+	}
				+}
			
 
				+
			
 
				+/* Allocation hook: performs no allocation and always returns 0. */
				+static starpu_ssize_t data_allocate_data_on_node(void *data_interface, unsigned node)
				+{
				+	(void)node;
				+	(void)data_interface;
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/* Release hook: intentionally empty, both arguments are ignored. */
				+static void data_free_data_on_node(void *data_interface, unsigned node)
				+{
				+	(void)node;
				+	(void)data_interface;
				+}
			
 
				+
			
 
				+/* Size reported for a piece of data: one int plus one char, without
				+ * any structure padding. The handle is not consulted. */
				+static size_t data_get_size(starpu_data_handle_t handle)
				+{
				+	size_t payload = sizeof(int) + sizeof(char);
				+
				+	(void)handle;
				+	return payload;
				+}
			
 
				+
			
 
				+/* Footprint of a handle: CRC32c of its integer field, seeded with 0. */
				+static uint32_t data_footprint(starpu_data_handle_t handle)
				+{
				+	int key = starpu_my_interface_get_int(handle);
				+
				+	return starpu_hash_crc32c_be(key, 0);
				+}
			
 
				+
			
 
				+/* Packing hook that must never run: it aborts through an always-false
				+ * assertion. All arguments are ignored. */
				+static int data_pack_data(starpu_data_handle_t handle, unsigned node, void **ptr, starpu_ssize_t *count)
				+{
				+	(void)count;
				+	(void)ptr;
				+	(void)node;
				+	(void)handle;
				+
				+	STARPU_ASSERT_MSG(0, "The data interface has been registered with starpu_mpi_datatype_register(). Calling the pack_data function should not happen\n");
				+	return 0;
				+}
			
 
				+
			
 
				+/* Unpacking hook that must never run: it aborts through an always-false
				+ * assertion. All arguments are ignored. */
				+static int data_unpack_data(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count)
				+{
				+	(void)count;
				+	(void)ptr;
				+	(void)node;
				+	(void)handle;
				+
				+	STARPU_ASSERT_MSG(0, "The data interface has been registered with starpu_mpi_datatype_register(). Calling the unpack_data function should not happen\n");
				+	return 0;
				+}
			
 
				+
			
 
				+/* Write a short "Data<int>-<char>" description of the interface into buf
				+ * (at most size bytes) and return what snprintf() returned. */
				+static starpu_ssize_t data_describe(void *data_interface, char *buf, size_t size)
				+{
				+	struct starpu_my_interface *described = (struct starpu_my_interface *) data_interface;
				+	int written = snprintf(buf, size, "Data%d-%c", described->d, described->c);
				+
				+	return written;
				+}
			
 
				+
			
 
				+/* Return the address of the integer field of the handle's interface on
				+ * the given node; asserts that the data is allocated there. */
				+static void *data_handle_to_pointer(starpu_data_handle_t handle, unsigned node)
				+{
				+	struct starpu_my_interface *local;
				+
				+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
				+
				+	local = (struct starpu_my_interface *) starpu_data_get_interface_on_node(handle, node);
				+	return (void*) &local->d;
				+}
			
 
				+
			
 
				+/* Generic copy method: transfer both fields of the source interface to
				+ * the destination interface with starpu_interface_copy().
				+ *
				+ * Returns 0 when both copies report 0, -EAGAIN when at least one of them
				+ * returns non-zero. */
				+static int copy_any_to_any(void *src_interface, unsigned src_node,
				+			   void *dst_interface, unsigned dst_node,
				+			   void *async_data)
				+{
				+	struct starpu_my_interface *src_data = src_interface;
				+	struct starpu_my_interface *dst_data = dst_interface;
				+	int ret = 0;
				+
				+	/* starpu_interface_copy() expects the addresses of the buffers; the
				+	 * field values themselves must not be cast to pointers.
				+	 * NOTE(review): these are addresses inside the interface structures,
				+	 * presumably only meaningful for main-RAM nodes -- confirm. */
				+	if (starpu_interface_copy((uintptr_t) &src_data->d, 0, src_node,
				+				  (uintptr_t) &dst_data->d, 0, dst_node,
				+				  sizeof(src_data->d), async_data))
				+		ret = -EAGAIN;
				+	if (starpu_interface_copy((uintptr_t) &src_data->c, 0, src_node,
				+				  (uintptr_t) &dst_data->c, 0, dst_node,
				+				  sizeof(src_data->c),
				+				  async_data))
				+		ret = -EAGAIN;
				+	return ret;
				+}
			
 
				+
			
 
				+/* Copy methods of the interface: only the generic any-to-any path is set. */
				+static const struct starpu_data_copy_methods data_copy_methods =
				+{
				+	.any_to_any = copy_any_to_any,
				+};
			
 
				+
			
 
				+/* Operation table of the interface. The interface id starts as
				+ * STARPU_UNKNOWN_INTERFACE_ID and is assigned by
				+ * starpu_my_interface_data_register(). */
				+static struct starpu_data_interface_ops interface_data_ops =
				+{
				+	/* identification */
				+	.interfaceid = STARPU_UNKNOWN_INTERFACE_ID,
				+	.interface_size = sizeof(struct starpu_my_interface),
				+
				+	/* registration and per-node storage */
				+	.register_data_handle = data_register_data_handle,
				+	.allocate_data_on_node = data_allocate_data_on_node,
				+	.free_data_on_node = data_free_data_on_node,
				+	.handle_to_pointer = data_handle_to_pointer,
				+
				+	/* transfers */
				+	.copy_methods = &data_copy_methods,
				+	.pack_data = data_pack_data,
				+	.unpack_data = data_unpack_data,
				+
				+	/* introspection */
				+	.get_size = data_get_size,
				+	.footprint = data_footprint,
				+	.describe = data_describe
				+};
			
 
				+
			
 
				+/* Register xc as a piece of data of this interface, homed on home_node,
				+ * and store the resulting handle in *handleptr. */
				+void starpu_my_interface_data_register(starpu_data_handle_t *handleptr, unsigned home_node, struct starpu_my_interface *xc)
				+{
				+	/* The interface id is requested once, on the first registration. */
				+	if (interface_data_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)
				+		interface_data_ops.interfaceid = starpu_data_interface_get_next_id();
				+
				+	starpu_data_register(handleptr, home_node, xc, &interface_data_ops);
				+}
			
--- a/nmad/examples/user_datatype/my_interface.h
+++ b/nmad/examples/user_datatype/my_interface.h
@@ -0,0 +1,62 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2015  CNRS
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include <mpi.h>
			
 
				+
			
 
				+#ifndef __DATA_INTERFACE_H
			
 
				+#define __DATA_INTERFACE_H
			
 
				+
			
 
				+/* Example user-defined data interface: one integer and one character. */
				+struct starpu_my_interface
				+{
				+	int d;	/* integer payload */
				+	char c;	/* character payload */
				+};
			
 
				+
			
 
				+void starpu_my_interface_data_register(starpu_data_handle_t *handle, unsigned home_node, struct starpu_my_interface *xc);
			
 
				+
			
 
				+char starpu_my_interface_get_char(starpu_data_handle_t handle);
			
 
				+int starpu_my_interface_get_int(starpu_data_handle_t handle);
			
 
				+
			
 
				+#define STARPU_MY_INTERFACE_GET_CHAR(interface)	(((struct starpu_my_interface *)(interface))->c)
			
 
				+#define STARPU_MY_INTERFACE_GET_INT(interface)	(((struct starpu_my_interface *)(interface))->d)
			
 
				+
			
 
				+void _starpu_my_interface_datatype_allocate(MPI_Datatype *mpi_datatype);
			
 
				+void starpu_my_interface_datatype_allocate(starpu_data_handle_t handle, MPI_Datatype *mpi_datatype);
			
 
				+void starpu_my_interface_datatype_free(MPI_Datatype *mpi_datatype);
			
 
				+
			
 
				+void starpu_my_interface_display_codelet_cpu(void *descr[], void *_args);
			
 
				+void starpu_my_interface_compare_codelet_cpu(void *descr[], void *_args);
			
 
				+
			
 
				+/* Codelet displaying one piece of data, accessed read-only on a CPU. */
				+static struct starpu_codelet starpu_my_interface_display_codelet =
				+{
				+	.name = "starpu_my_interface_display_codelet",
				+	.cpu_funcs = {starpu_my_interface_display_codelet_cpu},
				+	.cpu_funcs_name = {"starpu_my_interface_display_codelet_cpu"},
				+	.nbuffers = 1,
				+	.modes = {STARPU_R}
				+};
			
 
				+
			
 
				+/* Codelet comparing two pieces of data, both accessed read-only on a CPU. */
				+static struct starpu_codelet starpu_my_interface_compare_codelet =
				+{
				+	.name = "starpu_my_interface_compare_codelet",
				+	.cpu_funcs = {starpu_my_interface_compare_codelet_cpu},
				+	.cpu_funcs_name = {"starpu_my_interface_compare_codelet_cpu"},
				+	.nbuffers = 2,
				+	.modes = {STARPU_R, STARPU_R}
				+};
			
 
				+
			
 
				+#endif /* __DATA_INTERFACE_H */
			
--- a/nmad/examples/user_datatype/user_datatype.c
+++ b/nmad/examples/user_datatype/user_datatype.c
@@ -0,0 +1,113 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2015, 2016, 2017  CNRS
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu_mpi.h>
			
 
				+#include "my_interface.h"
			
 
				+
			
 
				+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
			
 
				+
			
 
				+/* Exchange a user-defined data type between ranks 0 and 1, first with
				+ * plain MPI calls and then through StarPU-MPI detached communications.
				+ *
				+ * Returns 77 when fewer than 2 processes or no CPU worker are available;
				+ * otherwise rank 0 returns 0 when the value it received back compares
				+ * equal to the one it sent (non-zero if not), and every other rank
				+ * returns 0. */
				+int main(int argc, char **argv)
				+{
				+	int rank, nodes;
				+	int ret=0;
				+	int compare=0;
				+
				+	struct starpu_my_interface my1 = {.d = 98 , .c = 'z'};
				+	struct starpu_my_interface my0 = {.d = 42 , .c = 'n'};
				+
				+	starpu_data_handle_t handle0;
				+	starpu_data_handle_t handle1;
				+
				+	ret = starpu_init(NULL);
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
				+	ret = starpu_mpi_init(&argc, &argv, 1);
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
				+	starpu_mpi_comm_size(MPI_COMM_WORLD, &nodes);
				+
				+	if (nodes < 2 || (starpu_cpu_worker_get_count() == 0))
				+	{
				+		if (rank == 0)
				+		{
				+			/* Use FPRINTF like the rest of the file so that the
				+			 * diagnostics honour STARPU_SSILENT. */
				+			if (nodes < 2)
				+			{
				+				FPRINTF(stderr, "We need at least 2 processes.\n");
				+			}
				+			else
				+			{
				+				FPRINTF(stderr, "We need at least 1 CPU.\n");
				+			}
				+		}
				+		starpu_mpi_shutdown();
				+		starpu_shutdown();
				+		return 77;
				+	}
				+
				+	/* Rank 1 starts from different values so that a successful receive
				+	 * is observable. */
				+	if (rank == 1)
				+	{
				+		my0.d = 0;
				+		my0.c = 'z';
				+	}
				+	starpu_my_interface_data_register(&handle0, STARPU_MAIN_RAM, &my0);
				+	starpu_my_interface_data_register(&handle1, -1, &my1);
				+	starpu_mpi_datatype_register(handle1, starpu_my_interface_datatype_allocate, starpu_my_interface_datatype_free);
				+
				+	starpu_mpi_barrier(MPI_COMM_WORLD);
				+
				+	/* First exchange: plain MPI send/receive with the struct datatype. */
				+	if (rank == 0)
				+	{
				+		MPI_Datatype mpi_datatype;
				+		_starpu_my_interface_datatype_allocate(&mpi_datatype);
				+		MPI_Send(&my0, 1, mpi_datatype, 1, 42, MPI_COMM_WORLD);
				+		starpu_my_interface_datatype_free(&mpi_datatype);
				+	}
				+	else if (rank == 1)
				+	{
				+		MPI_Datatype mpi_datatype;
				+		MPI_Status status;
				+		_starpu_my_interface_datatype_allocate(&mpi_datatype);
				+		MPI_Recv(&my0, 1, mpi_datatype, 0, 42, MPI_COMM_WORLD, &status);
				+		FPRINTF(stderr, "Received value: '%c' %d\n", my0.c, my0.d);
				+		starpu_my_interface_datatype_free(&mpi_datatype);
				+	}
				+
				+	/* Second exchange: rank 0 sends handle0 (tag 10) and receives it
				+	 * back into handle1 (tag 20), then compares both. */
				+	if (rank == 0)
				+	{
				+		int *compare_ptr = &compare;
				+
				+		starpu_task_insert(&starpu_my_interface_display_codelet, STARPU_VALUE, "node0 initial value", strlen("node0 initial value")+1, STARPU_R, handle0, 0);
				+		starpu_mpi_isend_detached(handle0, 1, 10, MPI_COMM_WORLD, NULL, NULL);
				+		starpu_mpi_irecv_detached(handle1, 1, 20, MPI_COMM_WORLD, NULL, NULL);
				+
				+		starpu_task_insert(&starpu_my_interface_display_codelet, STARPU_VALUE, "node0 received value", strlen("node0 received value")+1, STARPU_R, handle1, 0);
				+		starpu_task_insert(&starpu_my_interface_compare_codelet, STARPU_R, handle0, STARPU_R, handle1, STARPU_VALUE, &compare_ptr, sizeof(compare_ptr), 0);
				+	}
				+	else if (rank == 1)
				+	{
				+		starpu_task_insert(&starpu_my_interface_display_codelet, STARPU_VALUE, "node1 initial value", strlen("node1 initial value")+1, STARPU_R, handle0, 0);
				+		starpu_mpi_irecv_detached(handle0, 0, 10, MPI_COMM_WORLD, NULL, NULL);
				+		starpu_task_insert(&starpu_my_interface_display_codelet, STARPU_VALUE, "node1 received value", strlen("node1 received value")+1, STARPU_R, handle0, 0);
				+		starpu_mpi_isend_detached(handle0, 0, 20, MPI_COMM_WORLD, NULL, NULL);
				+	}
				+
				+	starpu_mpi_barrier(MPI_COMM_WORLD);
				+	starpu_mpi_wait_for_all(MPI_COMM_WORLD);
				+
				+	starpu_mpi_datatype_unregister(handle0);
				+	starpu_data_unregister(handle0);
				+	starpu_data_unregister(handle1);
				+
				+	starpu_mpi_shutdown();
				+	starpu_shutdown();
				+
				+	return (rank == 0) ? !compare : 0;
				+}