浏览代码

nmad: update examples from mpi directory

Nathalie Furmento 7 年之前
父节点
当前提交
7dad72f05a
共有 55 个文件被更改，包括 3456 次插入 和 350 次删除
  1. 228 31
      nmad/examples/Makefile.am
  2. 150 0
      nmad/examples/comm/comm.c
  3. 185 0
      nmad/examples/comm/mix_comm.c
  4. 70 46
      nmad/examples/complex/mpi_complex.c
  5. 27 0
      nmad/examples/helper.h
  6. 26 11
      nmad/examples/matrix_decomposition/mpi_cholesky.c
  7. 31 0
      nmad/examples/matrix_decomposition/mpi_cholesky.h
  8. 41 34
      nmad/examples/matrix_decomposition/mpi_cholesky_codelets.c
  9. 1 1
      nmad/examples/matrix_decomposition/mpi_cholesky_codelets.h
  10. 6 10
      nmad/examples/matrix_decomposition/mpi_cholesky_distributed.c
  11. 2 3
      nmad/examples/matrix_decomposition/mpi_cholesky_kernels.c
  12. 1 1
      nmad/examples/matrix_decomposition/mpi_cholesky_kernels.h
  13. 2 2
      nmad/examples/matrix_decomposition/mpi_cholesky_models.c
  14. 1 3
      nmad/examples/matrix_decomposition/mpi_cholesky_models.h
  15. 8 7
      nmad/examples/matrix_decomposition/mpi_decomposition_matrix.c
  16. 1 1
      nmad/examples/matrix_decomposition/mpi_decomposition_matrix.h
  17. 18 5
      nmad/examples/matrix_decomposition/mpi_decomposition_params.c
  18. 3 3
      nmad/examples/matrix_decomposition/mpi_decomposition_params.h
  19. 30 0
      nmad/examples/matrix_mult/Makefile
  20. 25 0
      nmad/examples/matrix_mult/environment
  21. 390 0
      nmad/examples/matrix_mult/mm.c
  22. 1 1
      nmad/examples/mpi_lu/mpi_lu-double.h
  23. 1 1
      nmad/examples/mpi_lu/mpi_lu-float.h
  24. 1 1
      nmad/examples/mpi_lu/pdlu.c
  25. 19 0
      nmad/examples/mpi_lu/pdlu_implicit.c
  26. 1 1
      nmad/examples/mpi_lu/pdlu_kernels.c
  27. 45 29
      nmad/examples/mpi_lu/plu_example.c
  28. 1 1
      nmad/examples/mpi_lu/plu_example_double.c
  29. 1 1
      nmad/examples/mpi_lu/plu_example_float.c
  30. 369 0
      nmad/examples/mpi_lu/plu_implicit_example.c
  31. 19 0
      nmad/examples/mpi_lu/plu_implicit_example_double.c
  32. 19 0
      nmad/examples/mpi_lu/plu_implicit_example_float.c
  33. 402 0
      nmad/examples/mpi_lu/plu_outofcore_example.c
  34. 19 0
      nmad/examples/mpi_lu/plu_outofcore_example_double.c
  35. 19 0
      nmad/examples/mpi_lu/plu_outofcore_example_float.c
  36. 11 7
      nmad/examples/mpi_lu/plu_solve.c
  37. 1 1
      nmad/examples/mpi_lu/plu_solve_double.c
  38. 1 1
      nmad/examples/mpi_lu/plu_solve_float.c
  39. 1 1
      nmad/examples/mpi_lu/pslu.c
  40. 19 0
      nmad/examples/mpi_lu/pslu_implicit.c
  41. 1 1
      nmad/examples/mpi_lu/pslu_kernels.c
  42. 138 89
      nmad/examples/mpi_lu/pxlu.c
  43. 3 2
      nmad/examples/mpi_lu/pxlu.h
  44. 184 0
      nmad/examples/mpi_lu/pxlu_implicit.c
  45. 37 25
      nmad/examples/mpi_lu/pxlu_kernels.c
  46. 1 1
      nmad/examples/mpi_lu/pxlu_kernels.h
  47. 1 1
      nmad/examples/mpi_lu/slu_kernels.c
  48. 108 0
      nmad/examples/native_fortran/nf_basic_ring.f90
  49. 236 0
      nmad/examples/native_fortran/nf_mm.f90
  50. 90 0
      nmad/examples/native_fortran/nf_mm_cl.f90
  51. 1 1
      nmad/examples/perf.sh
  52. 56 27
      nmad/examples/stencil/stencil5.c
  53. 229 0
      nmad/examples/user_datatype/my_interface.c
  54. 62 0
      nmad/examples/user_datatype/my_interface.h
  55. 113 0
      nmad/examples/user_datatype/user_datatype.c

+ 228 - 31
nmad/examples/Makefile.am

@@ -1,7 +1,8 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009-2013, 2016  Université de Bordeaux
-# Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+# Copyright (C) 2009-2013, 2015-2017  Université de Bordeaux
+# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
+# Copyright (C) 2016  Inria
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -16,59 +17,89 @@
 
 include $(top_srcdir)/starpu.mk
 
+if STARPU_SIMGRID
+STARPU_PERF_MODEL_DIR=$(abs_top_srcdir)/tools/perfmodels/sampling
+STARPU_HOSTNAME=mirage
+MALLOC_PERTURB_=0
+export STARPU_PERF_MODEL_DIR
+export STARPU_HOSTNAME
+export MALLOC_PERTURB_
+endif
+
 CC=$(MPICC)
 CCLD=$(MPICC)
+FC=$(MPIFORT)
+FCLD=$(MPIFORT)
 
 if STARPU_HAVE_WINDOWS
 LOADER_BIN		=
 else
 loader_CPPFLAGS 	= 	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
+if !STARPU_SIMGRID
 LOADER			=	loader
-LOADER_BIN		=	$(abs_top_builddir)/nmad/tests/$(LOADER)
+LOADER_BIN		=	$(abs_top_builddir)/nmad/examples/$(LOADER)
+endif
 loader_SOURCES		=	../../tests/loader.c
 endif
 
+if STARPU_SIMGRID
+MPI			=	$(abs_top_builddir)/tools/starpu_smpirun -np 4 -platform $(abs_top_srcdir)/tools/perfmodels/cluster.xml -hostfile $(abs_top_srcdir)/tools/perfmodels/hostfile
+else
+# we always test on 4 processes; the execution time is not much longer
+if STARPU_QUICK_CHECK
+MPI			=	$(MPIEXEC) $(MPIEXEC_ARGS) -np 4
+else
+MPI			=	$(MPIEXEC) $(MPIEXEC_ARGS) -np 4
+endif
+endif
+
 if STARPU_HAVE_AM111
-TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
-LOG_COMPILER	 	=	$(MPIEXEC) -np 2 $(LOADER_BIN)
+TESTS_ENVIRONMENT	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
+LOG_COMPILER	 	=	$(MPI) $(LOADER_BIN)
 else
-TESTS_ENVIRONMENT 	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPIEXEC) -np 4
+TESTS_ENVIRONMENT 	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
 endif
 
-if !STARPU_SIMGRID
-if STARPU_MPI_CHECK
+#if STARPU_MPI_CHECK
 TESTS			=	$(starpu_mpi_EXAMPLES)
-endif
-endif
+#endif
 
 check_PROGRAMS = $(LOADER) $(starpu_mpi_EXAMPLES)
 starpu_mpi_EXAMPLES =
 
 BUILT_SOURCES =
 
-CLEANFILES = *.gcno *.gcda *.linkinfo
+CLEANFILES = *.gcno *.gcda *.linkinfo starpu_idle_microsec.log *.mod
 
-EXTRA_DIST = 					\
+EXTRA_DIST = 				\
 	mpi_lu/mpi_lu-float.h		\
 	mpi_lu/mpi_lu-double.h		\
 	mpi_lu/plu_example.c		\
+	mpi_lu/plu_implicit_example.c	\
+	mpi_lu/plu_outofcore_example.c	\
 	mpi_lu/plu_solve.c		\
 	mpi_lu/pxlu.h			\
 	mpi_lu/pxlu.c			\
+	mpi_lu/pxlu_implicit.c		\
 	mpi_lu/pxlu_kernels.h		\
 	mpi_lu/pxlu_kernels.c		\
+	matrix_decomposition/mpi_cholesky.h 		\
 	matrix_decomposition/mpi_cholesky_codelets.h 	\
 	matrix_decomposition/mpi_cholesky_kernels.h	\
 	matrix_decomposition/mpi_cholesky_models.h 	\
 	matrix_decomposition/mpi_decomposition_params.h	\
 	matrix_decomposition/mpi_decomposition_matrix.h	\
-	../tests/helper.h
+	user_datatype/my_interface.h			\
+	helper.h
 
 examplebindir = $(libdir)/starpu/mpi
 
 examplebin_PROGRAMS =
 
 if STARPU_USE_CUDA
+if STARPU_COVERITY
+include $(top_srcdir)/starpu-mynvcc.mk
+else
 NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -I$(top_builddir)/include/ $(HWLOC_CFLAGS)
 
 .cu.cubin:
@@ -78,11 +109,12 @@ NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -
 .cu.o:
 	$(NVCC) $< -c -o $@ $(NVCCFLAGS)
 endif
+endif
 
 AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
-LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
+LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ ../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la -lm @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include -I$(top_srcdir)/mpi/include
-AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS)
+AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
 
 ###################
 # Stencil example #
@@ -91,24 +123,27 @@ if BUILD_EXAMPLES
 examplebin_PROGRAMS +=				\
 	stencil/stencil5
 
-stencil_stencil5_LDADD =		\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la -lm
-
 starpu_mpi_EXAMPLES	+=	\
 	stencil/stencil5
 
+endif
+
 ##################
 # MPI LU example #
 ##################
 
+if BUILD_EXAMPLES
 if !NO_BLAS_LIB
 
 examplebin_PROGRAMS += 			\
 	mpi_lu/plu_example_float	\
-	mpi_lu/plu_example_double
+	mpi_lu/plu_example_double	\
+	mpi_lu/plu_implicit_example_float	\
+	mpi_lu/plu_implicit_example_double	\
+	mpi_lu/plu_outofcore_example_float	\
+	mpi_lu/plu_outofcore_example_double
 
 mpi_lu_plu_example_float_LDADD =	\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	$(STARPU_LIBNUMA_LDFLAGS)				\
 	$(STARPU_BLAS_LDFLAGS) -lm
 
@@ -117,10 +152,9 @@ mpi_lu_plu_example_float_SOURCES =	\
 	mpi_lu/plu_solve_float.c	\
 	mpi_lu/pslu_kernels.c		\
 	mpi_lu/pslu.c			\
-	$(top_srcdir)/examples/common/blas.c
+	../../examples/common/blas.c
 
 mpi_lu_plu_example_double_LDADD =	\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	$(STARPU_LIBNUMA_LDFLAGS)				\
 	$(STARPU_BLAS_LDFLAGS) -lm
 
@@ -129,13 +163,59 @@ mpi_lu_plu_example_double_SOURCES =	\
 	mpi_lu/plu_solve_double.c  	\
 	mpi_lu/pdlu_kernels.c	    	\
 	mpi_lu/pdlu.c		    	\
-	$(top_srcdir)/examples/common/blas.c
+	../../examples/common/blas.c
+
+mpi_lu_plu_implicit_example_float_LDADD =	\
+	$(STARPU_LIBNUMA_LDFLAGS)				\
+	$(STARPU_BLAS_LDFLAGS) -lm
+
+mpi_lu_plu_implicit_example_float_SOURCES =	\
+	mpi_lu/plu_implicit_example_float.c	\
+	mpi_lu/plu_solve_float.c		\
+	mpi_lu/pslu_kernels.c			\
+	mpi_lu/pslu_implicit.c			\
+	../../examples/common/blas.c
+
+mpi_lu_plu_implicit_example_double_LDADD =	\
+	$(STARPU_LIBNUMA_LDFLAGS)				\
+	$(STARPU_BLAS_LDFLAGS) -lm
+
+mpi_lu_plu_implicit_example_double_SOURCES =	\
+	mpi_lu/plu_implicit_example_double.c	\
+	mpi_lu/plu_solve_double.c		\
+	mpi_lu/pdlu_kernels.c			\
+	mpi_lu/pdlu_implicit.c			\
+	../../examples/common/blas.c
+
+mpi_lu_plu_outofcore_example_float_LDADD =	\
+	$(STARPU_LIBNUMA_LDFLAGS)				\
+	$(STARPU_BLAS_LDFLAGS) -lm
+
+mpi_lu_plu_outofcore_example_float_SOURCES =	\
+	mpi_lu/plu_outofcore_example_float.c	\
+	mpi_lu/plu_solve_float.c		\
+	mpi_lu/pslu_kernels.c			\
+	mpi_lu/pslu_implicit.c			\
+	../../examples/common/blas.c
+
+mpi_lu_plu_outofcore_example_double_LDADD =	\
+	$(STARPU_LIBNUMA_LDFLAGS)				\
+	$(STARPU_BLAS_LDFLAGS) -lm
+
+mpi_lu_plu_outofcore_example_double_SOURCES =	\
+	mpi_lu/plu_outofcore_example_double.c	\
+	mpi_lu/plu_solve_double.c		\
+	mpi_lu/pdlu_kernels.c			\
+	mpi_lu/pdlu_implicit.c			\
+	../../examples/common/blas.c
+endif
 endif
 
 ########################
 # MPI Cholesky example #
 ########################
 
+if BUILD_EXAMPLES
 if !NO_BLAS_LIB
 examplebin_PROGRAMS +=		\
 	matrix_decomposition/mpi_cholesky			\
@@ -148,10 +228,9 @@ matrix_decomposition_mpi_cholesky_SOURCES	=		\
 	matrix_decomposition/mpi_cholesky_codelets.c	\
 	matrix_decomposition/mpi_decomposition_params.c	\
 	matrix_decomposition/mpi_decomposition_matrix.c	\
-	$(top_srcdir)/examples/common/blas.c
+	../../examples/common/blas.c
 
 matrix_decomposition_mpi_cholesky_LDADD =			\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	$(STARPU_BLAS_LDFLAGS) -lm
 
 matrix_decomposition_mpi_cholesky_distributed_SOURCES =	\
@@ -161,33 +240,151 @@ matrix_decomposition_mpi_cholesky_distributed_SOURCES =	\
 	matrix_decomposition/mpi_cholesky_codelets.c	\
 	matrix_decomposition/mpi_decomposition_params.c	\
 	matrix_decomposition/mpi_decomposition_matrix.c	\
-	$(top_srcdir)/examples/common/blas.c
+	../../examples/common/blas.c
 
 matrix_decomposition_mpi_cholesky_distributed_LDADD =	\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	$(STARPU_BLAS_LDFLAGS) -lm
 
+if !STARPU_SIMGRID
 starpu_mpi_EXAMPLES +=				\
 	matrix_decomposition/mpi_cholesky			\
 	matrix_decomposition/mpi_cholesky_distributed
 endif
+endif
+endif
+
+###########################
+# MPI Matrix mult example #
+###########################
+
+if BUILD_EXAMPLES
+examplebin_PROGRAMS +=		\
+	matrix_mult/mm
+
+matrix_mult_mm_SOURCES	=		\
+	matrix_mult/mm.c
+
+matrix_mult_mm_LDADD =			\
+	-lm
+
+if !STARPU_SIMGRID
+starpu_mpi_EXAMPLES +=				\
+	matrix_mult/mm
+endif
+endif
+
+##########################################
+# Native Fortran MPI Matrix mult example #
+##########################################
+
+if STARPU_HAVE_MPIFORT
+if BUILD_EXAMPLES
+if !STARPU_SANITIZE
+examplebin_PROGRAMS +=		\
+	native_fortran/nf_mm	\
+	native_fortran/nf_basic_ring
+
+native_fortran_nf_mm_SOURCES	=			\
+	native_fortran/nf_mm_cl.f90			\
+	$(top_srcdir)/mpi/include/fstarpu_mpi_mod.f90	\
+	$(top_srcdir)/include/fstarpu_mod.f90		\
+	native_fortran/nf_mm.f90
+
+native_fortran_nf_mm_LDADD =					\
+	-lm
+
+native_fortran_nf_basic_ring_SOURCES	=			\
+	$(top_srcdir)/mpi/include/fstarpu_mpi_mod.f90	\
+	$(top_srcdir)/include/fstarpu_mod.f90		\
+	native_fortran/nf_basic_ring.f90
+
+native_fortran_nf_basic_ring_LDADD =					\
+	-lm
+
+if !STARPU_SIMGRID
+starpu_mpi_EXAMPLES +=				\
+	native_fortran/nf_mm			\
+	native_fortran/nf_basic_ring
+endif
+endif
+endif
+endif
 
 ###################
 # complex example #
 ###################
 
+if BUILD_EXAMPLES
 examplebin_PROGRAMS +=			\
 	complex/mpi_complex
 
 complex_mpi_complex_SOURCES =		\
 	complex/mpi_complex.c		\
-	$(top_srcdir)/examples/interface/complex_interface.c
-
-complex_mpi_complex_LDADD =		\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+	../../examples/interface/complex_interface.c
 
 starpu_mpi_EXAMPLES	+=			\
 	complex/mpi_complex
 endif
 
+#########################
+# user_datatype example #
+#########################
+
+if BUILD_EXAMPLES
+examplebin_PROGRAMS +=				\
+	user_datatype/user_datatype
+
+user_datatype_user_datatype_SOURCES =		\
+	user_datatype/user_datatype.c		\
+	user_datatype/my_interface.c
 
+if !STARPU_SIMGRID
+starpu_mpi_EXAMPLES	+=			\
+	user_datatype/user_datatype
+endif
+endif
+
+################
+# comm example #
+################
+
+if BUILD_EXAMPLES
+examplebin_PROGRAMS +=			\
+	comm/comm			\
+	comm/mix_comm
+
+if !STARPU_SIMGRID
+starpu_mpi_EXAMPLES	+=			\
+	comm/comm				\
+	comm/mix_comm
+endif
+endif
+
+if STARPU_HAVE_MPIFORT
+if BUILD_EXAMPLES
+if !STARPU_SANITIZE
+# Native Fortran example
+# - list explicit dependences to control proper module files generation
+# - the overriding rule fully disables the corresponding default rule, thus
+#   the default rule body must be copied entirely
+fstarpu_mod.mod: fstarpu_mod.o
+fstarpu_mpi_mod.mod: fstarpu_mpi_mod.o
+nf_mm_cl.mod: nf_mm_cl.o
+
+fstarpu_mod.o: $(top_srcdir)/include/fstarpu_mod.f90
+	$(AM_V_FC)$(FC) $(native_fortran_nf_mm_FCFLAGS) $(FCFLAGS) -c -o $@ '$(top_srcdir)/'include/fstarpu_mod.f90
+
+fstarpu_mpi_mod.o: $(top_srcdir)/mpi/include/fstarpu_mpi_mod.f90 fstarpu_mod.mod
+	$(AM_V_FC)$(FC) $(native_fortran_nf_mm_FCFLAGS) $(FCFLAGS) -c -o $@ '$(top_srcdir)/'mpi/include/fstarpu_mpi_mod.f90
+
+nf_mm_cl.o: $(top_srcdir)/mpi/examples/native_fortran/nf_mm_cl.f90 fstarpu_mpi_mod.mod fstarpu_mod.mod
+	$(AM_V_FC)$(FC) $(native_fortran_nf_mm_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_mm_cl.f90' || echo '$(srcdir)/'`native_fortran/nf_mm_cl.f90
+
+nf_mm.o: $(top_srcdir)/mpi/examples/native_fortran/nf_mm.f90 nf_mm_cl.mod fstarpu_mpi_mod.mod fstarpu_mod.mod
+	$(AM_V_FC)$(FC) $(native_fortran_nf_mm_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_mm.f90' || echo '$(srcdir)/'`native_fortran/nf_mm.f90
+
+nf_basic_ring.o: $(top_srcdir)/mpi/examples/native_fortran/nf_basic_ring.f90 fstarpu_mpi_mod.mod fstarpu_mod.mod
+	$(AM_V_FC)$(FC) $(native_fortran_nf_basic_ring_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_basic_ring.f90' || echo '$(srcdir)/'`native_fortran/nf_basic_ring.f90
+endif
+endif
+endif

+ 150 - 0
nmad/examples/comm/comm.c

@@ -0,0 +1,150 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2015, 2016, 2017  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This example splits MPI_COMM_WORLD into sub-communicators;
+ * all communications take place within each sub-communicator.
+ */
+
+#include <starpu_mpi.h>
+#include "../helper.h"
+
+/*
+ * CPU implementation of the codelet: checks that the integer held by the
+ * first buffer is equal to the integer packed in the codelet arguments.
+ */
+void func_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	int expected;
+	int *received;
+
+	/* The only packed argument is the value the buffer must contain */
+	starpu_codelet_unpack_args(_args, &expected);
+	received = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+
+	FPRINTF_MPI(stderr, "Executing codelet with value %d and rank %d\n", *received, expected);
+	STARPU_ASSERT_MSG(*received == expected, "Received value %d is not the expected value %d\n", *received, expected);
+}
+
+/* Codelet with a single CPU implementation, working on one buffer accessed in read-write mode */
+struct starpu_codelet mycodelet =
+{
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
+	.cpu_funcs = {func_cpu}
+};
+
+/*
+ * Entry point of the example.
+ *
+ * MPI_COMM_WORLD is split in two sub-communicators (even and odd world
+ * ranks). Inside each of them, node 0 sends its world rank to node 1, first
+ * with plain MPI calls, then through StarPU-MPI, and a codelet finally checks
+ * the value held by data[0].
+ *
+ * Returns 0 on success, or STARPU_TEST_SKIPPED when fewer than 4 processes
+ * are available.
+ */
+int main(int argc, char **argv)
+{
+	int size, x=789;
+	int color;
+	MPI_Comm newcomm;
+	int rank, newrank;
+	int ret;
+	starpu_data_handle_t data[2];
+	int thread_support;
+
+	/* MPI is initialized here, by the application itself */
+	if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS)
+	{
+		fprintf(stderr,"MPI_Init_thread failed\n");
+		exit(1);
+	}
+	if (thread_support == MPI_THREAD_FUNNELED)
+		fprintf(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n");
+	if (thread_support < MPI_THREAD_FUNNELED)
+		fprintf(stderr,"Warning: MPI does not have thread support!\n");
+
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size < 4)
+	{
+		FPRINTF(stderr, "We need at least 4 processes.\n");
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	/* Even and odd world ranks end up in two distinct communicators */
+	color = rank%2;
+	MPI_Comm_split(MPI_COMM_WORLD, color, rank, &newcomm);
+	MPI_Comm_rank(newcomm, &newrank);
+	FPRINTF(stderr, "[%d][%d] color %d\n", rank, newrank, color);
+
+	/* Plain MPI exchange: node 0 of the sub-communicator sends its world
+	 * rank to node 1, which keeps it in x as the reference value */
+	if (newrank == 0)
+	{
+		FPRINTF(stderr, "[%d][%d] sending %d\n", rank, newrank, rank);
+		MPI_Send(&rank, 1, MPI_INT, 1, 10, newcomm);
+	}
+	else if (newrank == 1)
+	{
+		MPI_Recv(&x, 1, MPI_INT, 0, 10, newcomm, MPI_STATUS_IGNORE);
+		FPRINTF(stderr, "[%d][%d] received %d\n", rank, newrank, x);
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init_comm(NULL, NULL, 0, newcomm);
+	/* Report the failure under the name of the function actually called */
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_comm");
+
+	/* Node 0 registers two handles on its world rank, the other nodes
+	 * register data[0] without any local buffer */
+	if (newrank == 0)
+	{
+		starpu_variable_data_register(&data[0], STARPU_MAIN_RAM, (uintptr_t)&rank, sizeof(int));
+		starpu_variable_data_register(&data[1], STARPU_MAIN_RAM, (uintptr_t)&rank, sizeof(int));
+		starpu_mpi_data_register_comm(data[1], 22, 0, newcomm);
+	}
+	else
+		starpu_variable_data_register(&data[0], -1, (uintptr_t)NULL, sizeof(int));
+	starpu_mpi_data_register_comm(data[0], 12, 0, newcomm);
+
+	if (newrank == 0)
+	{
+		starpu_mpi_req req[2];
+		starpu_mpi_issend(data[1], &req[0], 1, 22, newcomm);
+		starpu_mpi_isend(data[0], &req[1], 1, 12, newcomm);
+		starpu_mpi_wait(&req[0], MPI_STATUS_IGNORE);
+		starpu_mpi_wait(&req[1], MPI_STATUS_IGNORE);
+	}
+	else if (newrank == 1)
+	{
+		int received;
+
+		starpu_mpi_recv(data[0], 0, 12, newcomm, MPI_STATUS_IGNORE);
+		/* Copy the value while the handle is acquired, the buffer is
+		 * not dereferenced any more once the handle is released */
+		starpu_data_acquire(data[0], STARPU_RW);
+		received = *(int *)starpu_variable_get_local_ptr(data[0]);
+		starpu_data_release(data[0]);
+		FPRINTF(stderr, "[%d][%d] received %d\n", rank, newrank, received);
+		STARPU_ASSERT_MSG(x==received, "Received value %d is incorrect (should be %d)\n", received, x);
+
+		starpu_variable_data_register(&data[1], -1, (uintptr_t)NULL, sizeof(int));
+		starpu_mpi_data_register_comm(data[1], 22, 0, newcomm);
+		/* NOTE(review): the message tagged 22 is received into data[0]
+		 * although data[1] was just registered with tag 22 - confirm
+		 * this is intended */
+		starpu_mpi_recv(data[0], 0, 22, newcomm, MPI_STATUS_IGNORE);
+		starpu_data_acquire(data[0], STARPU_RW);
+		received = *(int *)starpu_variable_get_local_ptr(data[0]);
+		starpu_data_release(data[0]);
+		FPRINTF(stderr, "[%d][%d] received %d\n", rank, newrank, received);
+		STARPU_ASSERT_MSG(x==received, "Received value %d is incorrect (should be %d)\n", received, x);
+	}
+
+	if (newrank == 0 || newrank == 1)
+	{
+		/* The codelet is executed on node 1 of the sub-communicator and
+		 * compares the content of data[0] with the packed value x */
+		starpu_mpi_task_insert(newcomm, &mycodelet,
+				       STARPU_RW, data[0],
+				       STARPU_VALUE, &x, sizeof(x),
+				       STARPU_EXECUTE_ON_NODE, 1,
+				       0);
+
+		starpu_task_wait_for_all();
+		starpu_data_unregister(data[0]);
+		starpu_data_unregister(data[1]);
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+	MPI_Comm_free(&newcomm);
+	MPI_Finalize();
+	return 0;
+}

+ 185 - 0
nmad/examples/comm/mix_comm.c

@@ -0,0 +1,185 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2015, 2017  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This example splits MPI_COMM_WORLD into sub-communicators;
+ * communications take place both within each sub-communicator and within MPI_COMM_WORLD.
+ */
+
+#include <starpu_mpi.h>
+#include "../helper.h"
+
+/*
+ * CPU implementation of the codelet: checks that the integer held by the
+ * first buffer is equal to the integer packed in the codelet arguments.
+ */
+void func_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	int expected;
+	int *received;
+
+	/* The only packed argument is the value the buffer must contain */
+	starpu_codelet_unpack_args(_args, &expected);
+	received = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+
+	FPRINTF_MPI(stderr, "Executing codelet with value %d and rank %d\n", *received, expected);
+	STARPU_ASSERT_MSG(*received == expected, "Received value %d is not the expected value %d\n", *received, expected);
+}
+
+/* Codelet with a single CPU implementation, working on one buffer accessed in read-write mode */
+struct starpu_codelet mycodelet =
+{
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
+	.cpu_funcs = {func_cpu}
+};
+
+/*
+ * Entry point of the example.
+ *
+ * MPI_COMM_WORLD is split in two sub-communicators (even and odd world
+ * ranks). Inside each of them, node 0 sends its world rank to node 1, first
+ * with plain MPI calls, then through StarPU-MPI. In addition, a value is
+ * passed around all the processes of MPI_COMM_WORLD, each process but rank 0
+ * adding 2 to it. A codelet finally checks the value held by data[0].
+ *
+ * Returns 0 on success, or STARPU_TEST_SKIPPED when fewer than 4 processes
+ * are available.
+ */
+int main(int argc, char **argv)
+{
+	/* x is only received on node 1 of each sub-communicator, but it is
+	 * packed as a codelet argument on node 0 too: give it a defined value */
+	int size, x=789;
+	int color;
+	MPI_Comm newcomm;
+	int rank, newrank;
+	int ret;
+	starpu_data_handle_t data[3];
+	int value = 90;
+	int thread_support;
+
+	/* MPI is initialized here, by the application itself */
+	if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS)
+	{
+		fprintf(stderr,"MPI_Init_thread failed\n");
+		exit(1);
+	}
+	if (thread_support == MPI_THREAD_FUNNELED)
+		fprintf(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n");
+	if (thread_support < MPI_THREAD_FUNNELED)
+		fprintf(stderr,"Warning: MPI does not have thread support!\n");
+
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size < 4)
+	{
+		FPRINTF(stderr, "We need at least 4 processes.\n");
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	/* Even and odd world ranks end up in two distinct communicators */
+	color = rank%2;
+	MPI_Comm_split(MPI_COMM_WORLD, color, rank, &newcomm);
+	MPI_Comm_rank(newcomm, &newrank);
+	FPRINTF(stderr, "[%d][%d] color %d\n", rank, newrank, color);
+
+	/* Plain MPI exchange: node 0 of the sub-communicator sends its world
+	 * rank to node 1, which keeps it in x as the reference value */
+	if (newrank == 0)
+	{
+		FPRINTF(stderr, "[%d][%d] sending %d\n", rank, newrank, rank);
+		MPI_Send(&rank, 1, MPI_INT, 1, 10, newcomm);
+	}
+	else if (newrank == 1)
+	{
+		MPI_Recv(&x, 1, MPI_INT, 0, 10, newcomm, MPI_STATUS_IGNORE);
+		FPRINTF(stderr, "[%d][%d] received %d\n", rank, newrank, x);
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	/* data[2] travels over MPI_COMM_WORLD, only world rank 0 provides a buffer */
+	if (rank == 0)
+	{
+		starpu_variable_data_register(&data[2], STARPU_MAIN_RAM, (uintptr_t)&value, sizeof(int));
+	}
+	else
+		starpu_variable_data_register(&data[2], -1, (uintptr_t)NULL, sizeof(int));
+	starpu_mpi_data_register_comm(data[2], 44, 0, MPI_COMM_WORLD);
+
+	/* data[0] and data[1] travel over the sub-communicator */
+	if (newrank == 0)
+	{
+		starpu_variable_data_register(&data[0], STARPU_MAIN_RAM, (uintptr_t)&rank, sizeof(int));
+		starpu_variable_data_register(&data[1], STARPU_MAIN_RAM, (uintptr_t)&rank, sizeof(int));
+		starpu_mpi_data_register_comm(data[1], 22, 0, newcomm);
+	}
+	else
+		starpu_variable_data_register(&data[0], -1, (uintptr_t)NULL, sizeof(int));
+	starpu_mpi_data_register_comm(data[0], 12, 0, newcomm);
+
+	if (newrank == 0)
+	{
+		starpu_mpi_req req[2];
+		starpu_mpi_issend(data[1], &req[0], 1, 22, newcomm);
+		starpu_mpi_isend(data[0], &req[1], 1, 12, newcomm);
+		starpu_mpi_wait(&req[0], MPI_STATUS_IGNORE);
+		starpu_mpi_wait(&req[1], MPI_STATUS_IGNORE);
+	}
+	else if (newrank == 1)
+	{
+		int received;
+
+		starpu_mpi_recv(data[0], 0, 12, newcomm, MPI_STATUS_IGNORE);
+		/* Copy the value while the handle is acquired, the buffer is
+		 * not dereferenced any more once the handle is released */
+		starpu_data_acquire(data[0], STARPU_RW);
+		received = *(int *)starpu_variable_get_local_ptr(data[0]);
+		starpu_data_release(data[0]);
+		FPRINTF(stderr, "[%d][%d] received %d\n", rank, newrank, received);
+		STARPU_ASSERT_MSG(x==received, "Received value %d is incorrect (should be %d)\n", received, x);
+
+		starpu_variable_data_register(&data[1], -1, (uintptr_t)NULL, sizeof(int));
+		starpu_mpi_data_register_comm(data[1], 22, 0, newcomm);
+		/* NOTE(review): the message tagged 22 is received into data[0]
+		 * although data[1] was just registered with tag 22 - confirm
+		 * this is intended */
+		starpu_mpi_recv(data[0], 0, 22, newcomm, MPI_STATUS_IGNORE);
+		starpu_data_acquire(data[0], STARPU_RW);
+		received = *(int *)starpu_variable_get_local_ptr(data[0]);
+		starpu_data_release(data[0]);
+		FPRINTF(stderr, "[%d][%d] received %d\n", rank, newrank, received);
+		STARPU_ASSERT_MSG(x==received, "Received value %d is incorrect (should be %d)\n", received, x);
+	}
+
+	if (rank == 0)
+	{
+		int rvalue, back;
+
+		starpu_data_acquire(data[2], STARPU_RW);
+		rvalue = *((int *)starpu_variable_get_local_ptr(data[2]));
+		starpu_data_release(data[2]);
+		FPRINTF_MPI(stderr, "sending value %d to %d and receiving from %d\n", rvalue, 1, size-1);
+		starpu_mpi_send(data[2], 1, 44, MPI_COMM_WORLD);
+		starpu_mpi_recv(data[2], size-1, 44, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+		/* Copy the value while the handle is acquired */
+		starpu_data_acquire(data[2], STARPU_RW);
+		back = *(int *)starpu_variable_get_local_ptr(data[2]);
+		starpu_data_release(data[2]);
+		FPRINTF_MPI(stderr, "Value back is %d\n", back);
+		/* Each of the size-1 other processes added 2 to the value */
+		STARPU_ASSERT_MSG(back == rvalue + (2*(size-1)), "Received value %d is incorrect (should be %d)\n", back, rvalue + (2*(size-1)));
+	}
+	else
+	{
+		/* The last process sends the value back to rank 0 */
+		int next = (rank == size-1) ? 0 : rank+1;
+		starpu_mpi_recv(data[2], rank-1, 44, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+		starpu_data_acquire(data[2], STARPU_RW);
+		int *xx = (int *)starpu_variable_get_local_ptr(data[2]);
+		FPRINTF_MPI(stderr, "receiving %d from %d and sending %d to %d\n", *xx, rank-1, *xx+2, next);
+		*xx = *xx + 2;
+		starpu_data_release(data[2]);
+		starpu_mpi_send(data[2], next, 44, MPI_COMM_WORLD);
+	}
+
+	if (newrank == 0 || newrank == 1)
+	{
+		/* The codelet is executed on node 1 of the sub-communicator and
+		 * compares the content of data[0] with the packed value x */
+		starpu_mpi_task_insert(newcomm, &mycodelet,
+				       STARPU_RW, data[0],
+				       STARPU_VALUE, &x, sizeof(x),
+				       STARPU_EXECUTE_ON_NODE, 1,
+				       0);
+
+		starpu_task_wait_for_all();
+		starpu_data_unregister(data[0]);
+		starpu_data_unregister(data[1]);
+	}
+	starpu_data_unregister(data[2]);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+	MPI_Comm_free(&newcomm);
+	MPI_Finalize();
+	return 0;
+}

+ 70 - 46
nmad/examples/complex/mpi_complex.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2012, 2013, 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -18,85 +18,109 @@
 #include <interface/complex_interface.h>
 #include <interface/complex_codelet.h>
 
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
 void display_foo_codelet(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 {
 	int *foo = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
-	fprintf(stderr, "foo = %d\n", *foo);
+	FPRINTF(stderr, "foo = %d\n", *foo);
+}
+
+/* Dumb performance model for simgrid */
+static double display_cost_function(struct starpu_task *task, unsigned nimpl)
+{
+	(void) task;
+	(void) nimpl;
+	return 0.000001;
 }
 
+static struct starpu_perfmodel display_model =
+{
+	.type = STARPU_COMMON,
+	.cost_function = display_cost_function,
+	.symbol = "display"
+};
+
 struct starpu_codelet foo_display =
 {
 	.cpu_funcs = {display_foo_codelet},
 	.nbuffers = 1,
-	.modes = {STARPU_R}
+	.modes = {STARPU_R},
+	.model = &display_model
 };
 
 int main(int argc, char **argv)
 {
 	int rank, nodes;
 	int ret;
-	int compare;
+	int compare=0;
 
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	ret = starpu_mpi_init(&argc, &argv, 1);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &nodes);
 
-	if (nodes < 2)
+	if (nodes < 2 || (starpu_cpu_worker_get_count() == 0))
 	{
-		fprintf(stderr, "This program needs at least 2 nodes (%d available)\n", nodes);
-		ret = 77;
+		if (rank == 0)
+		{
+			if (nodes < 2)
+				fprintf(stderr, "We need at least 2 processes.\n");
+			else
+				fprintf(stderr, "We need at least 1 CPU.\n");
+		}
+		starpu_mpi_shutdown();
+		starpu_shutdown();
+		return 77;
 	}
-	else
-	{
-		starpu_data_handle_t handle;
-		starpu_data_handle_t handle2;
-
-		double real[2] = {4.0, 2.0};
-		double imaginary[2] = {7.0, 9.0};
 
-		double real2[2] = {14.0, 12.0};
-		double imaginary2[2] = {17.0, 19.0};
+	starpu_data_handle_t handle;
+	starpu_data_handle_t handle2;
 
-		if (rank == 1)
-		{
-			real[0] = 0.0;
-			real[1] = 0.0;
-			imaginary[0] = 0.0;
-			imaginary[1] = 0.0;
-		}
+	double real[2] = {4.0, 2.0};
+	double imaginary[2] = {7.0, 9.0};
 
-		starpu_complex_data_register(&handle, 0, real, imaginary, 2);
-		starpu_complex_data_register(&handle2, -1, real2, imaginary2, 2);
+	double real2[2] = {14.0, 12.0};
+	double imaginary2[2] = {17.0, 19.0};
 
-		if (rank == 0)
-		{
-			int *compare_ptr = &compare;
+	if (rank == 1)
+	{
+		real[0] = 0.0;
+		real[1] = 0.0;
+		imaginary[0] = 0.0;
+		imaginary[1] = 0.0;
+	}
 
-			starpu_insert_task(&cl_display, STARPU_VALUE, "node0 initial value", strlen("node0 initial value")+1, STARPU_R, handle, 0);
-			starpu_mpi_isend_detached(handle, 1, 10, MPI_COMM_WORLD, NULL, NULL);
-			starpu_mpi_irecv_detached(handle2, 1, 20, MPI_COMM_WORLD, NULL, NULL);
+	starpu_complex_data_register(&handle, STARPU_MAIN_RAM, real, imaginary, 2);
+	starpu_complex_data_register(&handle2, -1, real2, imaginary2, 2);
 
-			starpu_insert_task(&cl_display, STARPU_VALUE, "node0 received value", strlen("node0 received value")+1, STARPU_R, handle2, 0);
-			starpu_insert_task(&cl_compare, STARPU_R, handle, STARPU_R, handle2, STARPU_VALUE, &compare_ptr, sizeof(compare_ptr), 0);
-		}
-		else if (rank == 1)
-		{
-			starpu_mpi_irecv_detached(handle, 0, 10, MPI_COMM_WORLD, NULL, NULL);
-			starpu_insert_task(&cl_display, STARPU_VALUE, "node1 received value", strlen("node1 received value")+1, STARPU_R, handle, 0);
-			starpu_mpi_isend_detached(handle, 0, 20, MPI_COMM_WORLD, NULL, NULL);
-		}
+	if (rank == 0)
+	{
+		int *compare_ptr = &compare;
 
-		starpu_task_wait_for_all();
+		starpu_task_insert(&cl_display, STARPU_VALUE, "node0 initial value", strlen("node0 initial value")+1, STARPU_R, handle, 0);
+		starpu_mpi_isend_detached(handle, 1, 10, MPI_COMM_WORLD, NULL, NULL);
+		starpu_mpi_irecv_detached(handle2, 1, 20, MPI_COMM_WORLD, NULL, NULL);
 
-		starpu_data_unregister(handle);
-		starpu_data_unregister(handle2);
+		starpu_task_insert(&cl_display, STARPU_VALUE, "node0 received value", strlen("node0 received value")+1, STARPU_R, handle2, 0);
+		starpu_task_insert(&cl_compare, STARPU_R, handle, STARPU_R, handle2, STARPU_VALUE, &compare_ptr, sizeof(compare_ptr), 0);
+	}
+	else if (rank == 1)
+	{
+		starpu_mpi_irecv_detached(handle, 0, 10, MPI_COMM_WORLD, NULL, NULL);
+		starpu_task_insert(&cl_display, STARPU_VALUE, "node1 received value", strlen("node1 received value")+1, STARPU_R, handle, 0);
+		starpu_mpi_isend_detached(handle, 0, 20, MPI_COMM_WORLD, NULL, NULL);
 	}
 
+	starpu_task_wait_for_all();
+
+	starpu_data_unregister(handle);
+	starpu_data_unregister(handle2);
+
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 
-	if (rank == 0) return !compare; else return ret;
+	return (rank == 0) ? !compare : 0;
 }

+ 27 - 0
nmad/examples/helper.h

@@ -0,0 +1,27 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012, 2013, 2015  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <errno.h>
+#include <starpu_mpi.h>
+
+#define STARPU_TEST_SKIPPED 77
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+#define FPRINTF_MPI(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) { \
+    						int _disp_rank; starpu_mpi_comm_rank(MPI_COMM_WORLD, &_disp_rank);       \
+                                                fprintf(ofile, "[%d][starpu_mpi][%s] " fmt , _disp_rank, __starpu_func__ ,## __VA_ARGS__); \
+                                                fflush(ofile); }} while(0) /* no trailing ';' here: the caller supplies it, so the macro stays usable in if/else */
+

+ 26 - 11
nmad/examples/matrix_decomposition/mpi_cholesky.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2012  Université de Bordeaux
+ * Copyright (C) 2009-2012, 2015  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,11 +16,8 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include <starpu_mpi.h>
-#include "mpi_cholesky_models.h"
-#include "mpi_cholesky_codelets.h"
-#include "mpi_decomposition_matrix.h"
-#include "mpi_decomposition_params.h"
+#include "mpi_cholesky.h"
+#include "helper.h"
 
 int main(int argc, char **argv)
 {
@@ -32,16 +29,30 @@ int main(int argc, char **argv)
 	float ***bmat;
 	int rank, nodes, ret;
 	double timing, flops;
+#ifndef STARPU_SIMGRID
 	int correctness;
+#endif
+
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	ret = starpu_mpi_init(&argc, &argv, 1);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &nodes);
 	starpu_cublas_init();
 
+	if (starpu_cpu_worker_get_count() + starpu_cuda_worker_get_count() == 0)
+	{
+		if (rank == 0)
+		{
+			FPRINTF(stderr, "We need at least 1 CPU or CUDA worker.\n");
+		}
+		starpu_mpi_shutdown();
+		starpu_shutdown();
+		return STARPU_TEST_SKIPPED;
+	}
+
 	parse_args(argc, argv, nodes);
 
 	matrix_init(&bmat, rank, nodes, 1);
@@ -51,20 +62,24 @@ int main(int argc, char **argv)
 
 	starpu_mpi_shutdown();
 
+#ifndef STARPU_SIMGRID
 	matrix_display(bmat, rank);
 
 	dw_cholesky_check_computation(bmat, rank, nodes, &correctness, &flops);
+#endif
 
 	matrix_free(&bmat, rank, nodes, 1);
 	starpu_cublas_shutdown();
 	starpu_shutdown();
 
+#ifndef STARPU_SIMGRID
 	assert(correctness);
+#endif
 
 	if (rank == 0)
 	{
-		fprintf(stdout, "Computation time (in ms): %2.2f\n", timing/1000);
-		fprintf(stdout, "Synthetic GFlops : %2.2f\n", (flops/timing/1000.0f));
+		FPRINTF(stdout, "Computation time (in ms): %2.2f\n", timing/1000);
+		FPRINTF(stdout, "Synthetic GFlops : %2.2f\n", (flops/timing/1000.0f));
 	}
 
 	return 0;

+ 31 - 0
nmad/examples/matrix_decomposition/mpi_cholesky.h

@@ -0,0 +1,31 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2013, 2015  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __MPI_CHOLESKY_H__
+#define __MPI_CHOLESKY_H__
+
+#include <starpu.h>
+#include <starpu_mpi.h>
+#include "mpi_cholesky_codelets.h"
+#include "mpi_cholesky_kernels.h"
+#include "mpi_cholesky_models.h"
+#include "mpi_decomposition_matrix.h"
+#include "mpi_decomposition_params.h"
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+#endif // __MPI_CHOLESKY_H__

+ 41 - 34
nmad/examples/matrix_decomposition/mpi_cholesky_codelets.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2014  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2015  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009, 2010, 2014-2015, 2017  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -15,14 +15,10 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include <starpu_mpi.h>
+#include "mpi_cholesky.h"
 #include <common/blas.h>
-#include "mpi_decomposition_params.h"
-#include "mpi_decomposition_matrix.h"
-#include "mpi_cholesky_models.h"
-#include "mpi_cholesky_codelets.h"
-#include "mpi_cholesky_kernels.h"
 #include <sys/time.h>
+#include <limits.h>
 
 /*
  *	Create the codelets
@@ -33,6 +29,8 @@ static struct starpu_codelet cl11 =
 	.cpu_funcs = {chol_cpu_codelet_update_u11},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {chol_cublas_codelet_update_u11},
+#elif defined(STARPU_SIMGRID)
+	.cuda_funcs = {(void*)1},
 #endif
 	.nbuffers = 1,
 	.modes = {STARPU_RW},
@@ -44,6 +42,8 @@ static struct starpu_codelet cl21 =
 	.cpu_funcs = {chol_cpu_codelet_update_u21},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {chol_cublas_codelet_update_u21},
+#elif defined(STARPU_SIMGRID)
+	.cuda_funcs = {(void*)1},
 #endif
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_RW},
@@ -55,9 +55,11 @@ static struct starpu_codelet cl22 =
 	.cpu_funcs = {chol_cpu_codelet_update_u22},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {chol_cublas_codelet_update_u22},
+#elif defined(STARPU_SIMGRID)
+	.cuda_funcs = {(void*)1},
 #endif
 	.nbuffers = 3,
-	.modes = {STARPU_R, STARPU_R, STARPU_RW},
+	.modes = {STARPU_R, STARPU_R, STARPU_RW | STARPU_COMMUTE},
 	.model = &chol_model_22
 };
 
@@ -72,6 +74,8 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 	starpu_data_handle_t **data_handles;
 	unsigned x,y,i,j,k;
 
+	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
+
 	/* create all the DAG nodes */
 
 	data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
@@ -85,10 +89,12 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 			if (mpi_rank == rank)
 			{
 				//fprintf(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
-				starpu_matrix_data_register(&data_handles[x][y], 0, (uintptr_t)matA[x][y],
+				starpu_matrix_data_register(&data_handles[x][y], STARPU_MAIN_RAM, (uintptr_t)matA[x][y],
 						ld, size/nblocks, size/nblocks, sizeof(float));
 			}
+#ifdef STARPU_DEVEL
 #warning TODO: make better test to only register what is needed
+#endif
 			else
 			{
 				/* I don't own that index, but will need it for my computations */
@@ -98,6 +104,7 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 			}
 			if (data_handles[x][y])
 			{
+				starpu_data_set_coordinates(data_handles[x][y], 2, x, y);
 				starpu_mpi_data_register(data_handles[x][y], (y*nblocks)+x, mpi_rank);
 			}
 		}
@@ -108,43 +115,43 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 
 	for (k = 0; k < nblocks; k++)
 	{
-		int prio = STARPU_DEFAULT_PRIO;
-		if (!noprio) prio = STARPU_MAX_PRIO;
+		starpu_iteration_push(k);
 
-		starpu_mpi_insert_task(MPI_COMM_WORLD, &cl11,
-				STARPU_PRIORITY, prio,
-				STARPU_RW, data_handles[k][k],
-				0);
+		starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
+				       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
+				       STARPU_RW, data_handles[k][k],
+				       0);
 
 		for (j = k+1; j<nblocks; j++)
 		{
-			prio = STARPU_DEFAULT_PRIO;
-			if (!noprio&& (j == k+1)) prio = STARPU_MAX_PRIO;
-			starpu_mpi_insert_task(MPI_COMM_WORLD, &cl21,
-					STARPU_PRIORITY, prio,
-					STARPU_R, data_handles[k][k],
-					STARPU_RW, data_handles[k][j],
-					0);
+			starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
+					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - j) : (j == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+					       STARPU_R, data_handles[k][k],
+					       STARPU_RW, data_handles[k][j],
+					       0);
 
 			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][k]);
+			if (my_distrib(k, k, nodes) == rank)
+				starpu_data_wont_use(data_handles[k][k]);
 
 			for (i = k+1; i<nblocks; i++)
 			{
 				if (i <= j)
 				{
-					prio = STARPU_DEFAULT_PRIO;
-					if (!noprio && (i == k + 1) && (j == k +1) ) prio = STARPU_MAX_PRIO;
-					starpu_mpi_insert_task(MPI_COMM_WORLD, &cl22,
-							STARPU_PRIORITY, prio,
-							STARPU_R, data_handles[k][i],
-							STARPU_R, data_handles[k][j],
-							STARPU_RW, data_handles[i][j],
-							0);
+					starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
+							       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - j - i) : ((i == k+1) && (j == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+							       STARPU_R, data_handles[k][i],
+							       STARPU_R, data_handles[k][j],
+							       STARPU_RW | STARPU_COMMUTE, data_handles[i][j],
+							       0);
 				}
 			}
 
 			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][j]);
+			if (my_distrib(k, j, nodes) == rank)
+				starpu_data_wont_use(data_handles[k][j]);
 		}
+		starpu_iteration_pop();
 	}
 
 	starpu_task_wait_for_all();
@@ -189,7 +196,7 @@ void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *corr
 		}
 	}
 
-	fprintf(stderr, "[%d] compute explicit LLt ...\n", rank);
+	FPRINTF(stderr, "[%d] compute explicit LLt ...\n", rank);
 	for (j = 0; j < size; j++)
 	{
 		for (i = 0; i < size; i++)
@@ -206,7 +213,7 @@ void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *corr
 	STARPU_SSYRK("L", "N", size, size, 1.0f,
 			rmat, size, 0.0f, test_mat, size);
 
-	fprintf(stderr, "[%d] comparing results ...\n", rank);
+	FPRINTF(stderr, "[%d] comparing results ...\n", rank);
 	if (display)
 	{
 		for (j = 0; j < size; j++)
@@ -244,7 +251,7 @@ void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *corr
 							float err = abs(test_mat[j +i*size] - orig);
 							if (err > 0.00001)
 							{
-								fprintf(stderr, "[%d] Error[%u, %u] --> %2.2f != %2.2f (err %2.2f)\n", rank, i, j, test_mat[j +i*size], orig, err);
+								FPRINTF(stderr, "[%d] Error[%u, %u] --> %2.2f != %2.2f (err %2.2f)\n", rank, i, j, test_mat[j +i*size], orig, err);
 								*correctness = 0;
 								*flops = 0;
 								break;

+ 1 - 1
nmad/examples/matrix_decomposition/mpi_cholesky_codelets.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 6 - 10
nmad/examples/matrix_decomposition/mpi_cholesky_distributed.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009-2011  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2015  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,11 +16,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include <starpu_mpi.h>
-#include "mpi_cholesky_models.h"
-#include "mpi_cholesky_codelets.h"
-#include "mpi_decomposition_matrix.h"
-#include "mpi_decomposition_params.h"
+#include "mpi_cholesky.h"
 
 int main(int argc, char **argv)
 {
@@ -38,8 +34,8 @@ int main(int argc, char **argv)
 
 	ret = starpu_mpi_init(&argc, &argv, 1);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &nodes);
 	starpu_cublas_init();
 
 	parse_args(argc, argv, nodes);
@@ -56,8 +52,8 @@ int main(int argc, char **argv)
 
 	if (rank == 0)
 	{
-		fprintf(stdout, "Computation time (in ms): %2.2f\n", timing/1000);
-		fprintf(stdout, "Synthetic GFlops : %2.2f\n", (flops/timing/1000.0f));
+		FPRINTF(stdout, "Computation time (in ms): %2.2f\n", timing/1000);
+		FPRINTF(stdout, "Synthetic GFlops : %2.2f\n", (flops/timing/1000.0f));
 	}
 
 	return 0;

+ 2 - 3
nmad/examples/matrix_decomposition/mpi_cholesky_kernels.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010, 2012-2014  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2015  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -15,9 +15,8 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include <starpu.h>
+#include "mpi_cholesky.h"
 #include <math.h>
-#include "mpi_decomposition_params.h"
 #include "common/blas.h"
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>

+ 1 - 1
nmad/examples/matrix_decomposition/mpi_cholesky_kernels.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 2 - 2
nmad/examples/matrix_decomposition/mpi_cholesky_models.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2015  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -15,7 +15,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "mpi_cholesky_models.h"
+#include "mpi_cholesky.h"
 
 /*
  *	Number of flops of Gemm

+ 1 - 3
nmad/examples/matrix_decomposition/mpi_cholesky_models.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2013, 2015  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -18,8 +18,6 @@
 #ifndef __DW_CHOLESKY_MODELS_H__
 #define __DW_CHOLESKY_MODELS_H__
 
-#include <starpu.h>
-
 extern struct starpu_perfmodel chol_model_11;
 extern struct starpu_perfmodel chol_model_21;
 extern struct starpu_perfmodel chol_model_22;

+ 8 - 7
nmad/examples/matrix_decomposition/mpi_decomposition_matrix.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009-2012, 2015  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,10 +16,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include <starpu.h>
-#include "mpi_decomposition_matrix.h"
-#include "mpi_decomposition_params.h"
-#include "mpi_cholesky_codelets.h"
+#include "mpi_cholesky.h"
 
 /* Returns the MPI node number where the data at indexes (x, y) is located */
 int my_distrib(int x, int y, int nb_nodes)
@@ -31,19 +28,21 @@ int my_distrib(int x, int y, int nb_nodes)
 
 void matrix_display(float ***bmat, int rank)
 {
-	unsigned i,j,x,y;
-
 	if (display)
 	{
+		unsigned y;
 		printf("[%d] Input :\n", rank);
 
 		for(y=0 ; y<nblocks ; y++)
 		{
+			unsigned x;
 			for(x=0 ; x<nblocks ; x++)
 			{
+				unsigned j;
 				printf("Block %u,%u :\n", x, y);
 				for (j = 0; j < BLOCKSIZE; j++)
 				{
+					unsigned i;
 					for (i = 0; i < BLOCKSIZE; i++)
 					{
 						if (i <= j)
@@ -80,8 +79,10 @@ void matrix_init(float ****bmat, int rank, int nodes, int alloc_everywhere)
 				{
 					for (j = 0; j < BLOCKSIZE; j++)
 					{
+#ifndef STARPU_SIMGRID
 						(*bmat)[x][y][j +i*BLOCKSIZE] = (1.0f/(1.0f+(i+(x*BLOCKSIZE)+j+(y*BLOCKSIZE)))) + ((i+(x*BLOCKSIZE) == j+(y*BLOCKSIZE))?1.0f*size:0.0f);
 						//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
+#endif
 					}
 				}
 			}

+ 1 - 1
nmad/examples/matrix_decomposition/mpi_decomposition_matrix.h

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009-2012  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 18 - 5
nmad/examples/matrix_decomposition/mpi_decomposition_params.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2015  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009, 2010, 2015-2017  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -15,14 +15,25 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include "mpi_cholesky.h"
 #include <string.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
 
-unsigned size = 4*960;
+#ifdef STARPU_QUICK_CHECK
+unsigned size = 4*64;
+unsigned nblocks = 2;
+unsigned nbigblocks = 2;
+#elif !defined(STARPU_LONG_CHECK)
+unsigned size = 4*320;
+unsigned nblocks = 4;
+unsigned nbigblocks = 2;
+#else
+unsigned size = 16*320;
 unsigned nblocks = 16;
 unsigned nbigblocks = 2;
+#endif
 unsigned noprio = 0;
 unsigned display = 0;
 int dblockx = -1;
@@ -73,13 +84,14 @@ void parse_args(int argc, char **argv, int nodes)
                         display = 1;
                 }
 
-                if (strcmp(argv[i], "-h") == 0)
+                if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
                 {
                         printf("usage : %s [-display] [-size size] [-nblocks nblocks]\n", argv[0]);
                 }
         }
 
-        if (nblocks > size) nblocks = size;
+        if (nblocks > size)
+		nblocks = size;
 
 	if (dblockx == -1 || dblocky == -1)
 	{
@@ -96,5 +108,6 @@ void parse_args(int argc, char **argv, int nodes)
 			}
 		}
 	}
+	FPRINTF(stdout, "size: %u - nblocks: %u - dblocksx: %d - dblocksy: %d\n", size, nblocks, dblockx, dblocky);
 }
 

+ 3 - 3
nmad/examples/matrix_decomposition/mpi_decomposition_params.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2015  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -25,8 +25,8 @@ extern unsigned nblocks;
 extern unsigned nbigblocks;
 extern unsigned noprio;
 extern unsigned display;
-extern unsigned dblockx;
-extern unsigned dblocky;
+extern int dblockx;
+extern int dblocky;
 
 void parse_args(int argc, char **argv, int nodes);
 

+ 30 - 0
nmad/examples/matrix_mult/Makefile

@@ -0,0 +1,30 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2016  Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+
+# This makefile gives an example of how to build the testcase outside StarPU
+
+PRG	= mm
+
+CC	= mpicc
+CFLAGS	= $(shell pkg-config --cflags starpumpi-1.3) -g -Wall
+LDFLAGS	= $(shell pkg-config --libs starpumpi-1.3) -lm
+
+.PHONY: all clean
+
+all: $(PRG)
+
+clean:
+	rm -f $(PRG) *.o starpu*.log

+ 25 - 0
nmad/examples/matrix_mult/environment

@@ -0,0 +1,25 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2016  Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+
+# This script gives an example of how to set environment variables to build and run the testcase outside StarPU
+
+STARPU_INSTALL_DIR=/usr # set this to StarPU's installation directory
+
+PATH=$STARPU_INSTALL_DIR/bin:$PATH
+PKG_CONFIG_PATH=$STARPU_INSTALL_DIR/lib/pkgconfig:$PKG_CONFIG_PATH
+LD_LIBRARY_PATH=$STARPU_INSTALL_DIR/lib:$LD_LIBRARY_PATH
+
+export PATH PKG_CONFIG_PATH LD_LIBRARY_PATH

+ 390 - 0
nmad/examples/matrix_mult/mm.c

@@ -0,0 +1,390 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2016  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This example illustrates how to distribute a pre-existing data structure to
+ * a set of computing nodes using StarPU-MPI routines.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <math.h>
+#include <starpu.h>
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#define VERBOSE 0
+
+static int N  = 16; /* Matrix size */
+static int BS =  4; /* Block size */
+
+#define NB ((N)/(BS)) /* Number of blocks */
+
+/* Matrices. Will be allocated as regular, linearized C arrays */
+static double *A = NULL; /* A will be partitioned as BS rows x N  cols blocks */
+static double *B = NULL; /* B will be partitioned as N  rows x BS cols blocks */
+static double *C = NULL; /* C will be partitioned as BS rows x BS cols blocks */
+
+/* Arrays of data handles for managing matrix blocks */
+static starpu_data_handle_t *A_h;
+static starpu_data_handle_t *B_h;
+static starpu_data_handle_t *C_h;
+
+static int comm_rank; /* mpi rank of the process */
+static int comm_size; /* size of the mpi session */
+
+static void alloc_matrices(void)
+{
+	/* Regular 'malloc' can also be used instead; however, starpu_malloc makes sure that
+	 * the area is allocated in suitably pinned memory to improve data transfers, especially
+	 * with CUDA */
+	starpu_malloc((void **)&A, N*N*sizeof(double));
+	starpu_malloc((void **)&B, N*N*sizeof(double));
+	starpu_malloc((void **)&C, N*N*sizeof(double));
+}
+
+static void free_matrices(void)
+{
+	starpu_free(A);
+	starpu_free(B);
+	starpu_free(C);
+}
+
+static void init_matrices(void)
+{
+	int row,col;
+	for (row = 0; row < N; row++)
+	{
+		for (col = 0; col < N; col++)
+		{
+			A[row*N+col] = (row==col)?2:0;
+			B[row*N+col] = row*N+col;
+			C[row*N+col] = 0;
+		}
+	}
+}
+
+#if VERBOSE
+static void disp_matrix(double *m)
+{
+	int row,col;
+	for (row = 0; row < N; row++)
+	{
+		for (col = 0; col < N; col++)
+		{
+			printf("\t%.2lf", m[row*N+col]);
+		}
+		printf("\n");
+	}
+}
+#endif
+
+static void check_result(void)
+{
+	int row,col;
+	for (row = 0; row < N; row++)
+	{
+		for (col = 0; col < N; col++)
+		{
+			if (fabs(C[row*N+col] - 2*(row*N+col)) > 1.0)
+			{
+				fprintf(stderr, "check failed\n");
+				exit(1);
+			}
+		}
+	}
+#if VERBOSE
+	printf("success\n");
+#endif
+}
+
+
+/* Register the matrix blocks to StarPU and to StarPU-MPI */
+static void register_matrices()
+{
+	A_h = calloc(NB, sizeof(starpu_data_handle_t));
+	B_h = calloc(NB, sizeof(starpu_data_handle_t));
+	C_h = calloc(NB*NB, sizeof(starpu_data_handle_t));
+
+	/* Memory region, where the data being registered resides.
+	 * In this example, all blocks are allocated by node 0, thus
+	 * - node 0 specifies STARPU_MAIN_RAM to indicate that it owns the block in its main memory
+	 * - nodes !0 specify -1 to indicate that they don't have a copy of the block initially
+	 */
+	int mr = (comm_rank == 0) ? STARPU_MAIN_RAM : -1;
+
+	/* mpi tag used for the block */
+	int tag = 0;
+
+	int b_row,b_col;
+
+	for (b_row = 0; b_row < NB; b_row++)
+	{
+		/* Register a block to StarPU */
+		starpu_matrix_data_register(&A_h[b_row],
+				mr,
+				(comm_rank == 0)?(uintptr_t)(A+b_row*BS*N):0, N, N, BS,
+				sizeof(double));
+
+		/* Register a block to StarPU-MPI, specifying the mpi tag to use for transferring the block
+		 * and the rank of the owner node.
+		 *
+		 * Note: StarPU-MPI is an autonomous layer built on top of StarPU, hence the two separate
+		 * registration steps.
+		 */
+		starpu_data_set_coordinates(A_h[b_row], 2, 0, b_row);
+		starpu_mpi_data_register(A_h[b_row], tag++, 0);
+	}
+
+	for (b_col = 0; b_col < NB; b_col++)
+	{
+		starpu_matrix_data_register(&B_h[b_col],
+				mr,
+				(comm_rank == 0)?(uintptr_t)(B+b_col*BS):0, N, BS, N,
+				sizeof(double));
+		starpu_data_set_coordinates(B_h[b_col], 2, b_col, 0);
+		starpu_mpi_data_register(B_h[b_col], tag++, 0);
+	}
+
+	for (b_row = 0; b_row < NB; b_row++)
+	{
+		for (b_col = 0; b_col < NB; b_col++)
+		{
+			starpu_matrix_data_register(&C_h[b_row*NB+b_col],
+					mr,
+					(comm_rank == 0)?(uintptr_t)(C+b_row*BS*N+b_col*BS):0, N, BS, BS,
+					sizeof(double));
+			starpu_data_set_coordinates(C_h[b_row*NB+b_col], 2, b_col, b_row);
+			starpu_mpi_data_register(C_h[b_row*NB+b_col], tag++, 0);
+		}
+	}
+}
+
+/* Transfer ownership of the C matrix blocks following some user-defined distribution over the nodes.
+ * Note: since C will be Write-accessed, it will implicitly define which node performs the task
+ * associated with a given block. */
+static void distribute_matrix_C(void)
+{
+	int b_row,b_col;
+	for (b_row = 0; b_row < NB; b_row++)
+	{
+		for (b_col = 0; b_col < NB; b_col++)
+		{
+			starpu_data_handle_t h = C_h[b_row*NB+b_col]; 
+
+			/* Select the node where the block should be computed. */
+			int target_rank = (b_row+b_col)%comm_size;
+
+			/* Move the block on to its new owner. */
+			starpu_mpi_data_migrate(MPI_COMM_WORLD, h, target_rank);
+		}
+	}
+}
+
+/* Transfer ownership of the C matrix blocks back to node 0, for display purposes. This is not mandatory. */
+static void undistribute_matrix_C(void)
+{
+	int b_row,b_col;
+	for (b_row = 0; b_row < NB; b_row++)
+	{
+		for (b_col = 0; b_col < NB; b_col++)
+		{
+			starpu_data_handle_t h = C_h[b_row*NB+b_col]; 
+			starpu_mpi_data_migrate(MPI_COMM_WORLD, h, 0);
+		}
+	}
+}
+
+/* Unregister matrices from the StarPU management. */
+static void unregister_matrices()
+{
+	int b_row,b_col;
+
+	for (b_row = 0; b_row < NB; b_row++)
+	{
+		starpu_data_unregister(A_h[b_row]);
+	}
+
+	for (b_col = 0; b_col < NB; b_col++)
+	{
+		starpu_data_unregister(B_h[b_col]);
+	}
+
+	for (b_row = 0; b_row < NB; b_row++)
+	{
+		for (b_col = 0; b_col < NB; b_col++)
+		{
+			starpu_data_unregister(C_h[b_row*NB+b_col]);
+		}
+	}
+
+	free(A_h);
+	free(B_h);
+	free(C_h);
+}
+
+/* Perform the actual computation. In a real-life case, this would call a BLAS 'gemm' routine
+ * instead. */
+static void cpu_mult(void *handles[], STARPU_ATTRIBUTE_UNUSED void *arg)
+{
+	double *block_A = (double *)STARPU_MATRIX_GET_PTR(handles[0]);
+	double *block_B = (double *)STARPU_MATRIX_GET_PTR(handles[1]);
+	double *block_C = (double *)STARPU_MATRIX_GET_PTR(handles[2]);
+
+	unsigned n_col_A = STARPU_MATRIX_GET_NX(handles[0]);
+	unsigned n_col_B = STARPU_MATRIX_GET_NX(handles[1]);
+	unsigned n_col_C = STARPU_MATRIX_GET_NX(handles[2]);
+
+	unsigned n_row_A = STARPU_MATRIX_GET_NY(handles[0]);
+	unsigned n_row_B = STARPU_MATRIX_GET_NY(handles[1]);
+	unsigned n_row_C = STARPU_MATRIX_GET_NY(handles[2]);
+
+	unsigned ld_A = STARPU_MATRIX_GET_LD(handles[0]);
+	unsigned ld_B = STARPU_MATRIX_GET_LD(handles[1]);
+	unsigned ld_C = STARPU_MATRIX_GET_LD(handles[2]);
+
+	/* Sanity check, not needed in a real-life case */
+	assert(n_col_C == n_col_B);
+	assert(n_row_C == n_row_A);
+	assert(n_col_A == n_row_B);
+
+	unsigned i,j,k;
+	for (k = 0; k < n_row_C; k++)
+	{
+		for (j = 0; j < n_col_C; j++)
+		{
+			for (i = 0; i < n_col_A; i++)
+			{
+				block_C[k*ld_C+j] += block_A[k*ld_A+i] * block_B[i*ld_B+j]; 
+			}
+
+#if VERBOSE
+			/* For illustration purposes, show which node computed
+			 * the block in the decimal part of the cell */
+			block_C[k*ld_C+j] += comm_rank / 100.0;
+#endif
+		}
+	}
+}
+
+/* Define a StarPU 'codelet' structure for the matrix multiply kernel above.
+ * This structure allows specifying multiple implementations for the kernel (such as CUDA or OpenCL versions)
+ */
+static struct starpu_codelet gemm_cl =
+{
+	.cpu_funcs = {cpu_mult}, /* cpu implementation(s) of the routine */
+	.nbuffers = 3, /* number of data handles referenced by this routine */
+	.modes = {STARPU_R, STARPU_R, STARPU_RW} /* access modes for each data handle */
+};
+
+int main(int argc, char *argv[])
+{
+	/* Initializes the StarPU core */
+	int ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	/* Initializes the StarPU-MPI layer */
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	if (starpu_cpu_worker_get_count() == 0)
+	{
+		FPRINTF(stderr, "We need at least 1 CPU worker.\n");
+		starpu_mpi_shutdown();
+		starpu_shutdown();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	/* Parse the matrix size and block size optional args */
+	if (argc > 1)
+	{
+		N = atoi(argv[1]);
+		if (N < 1)
+		{
+			fprintf(stderr, "invalid matrix size\n");
+			exit(1);
+		}
+		if (argc > 2)
+		{
+			BS = atoi(argv[2]);
+		}
+		if (BS < 1 || N % BS != 0)
+		{
+			fprintf(stderr, "invalid block size\n");
+			exit(1);
+		}
+	}
+
+	/* Get the process rank and session size */
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &comm_rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &comm_size);
+
+	if (comm_rank == 0)
+	{
+#if VERBOSE
+		printf("N = %d\n", N);
+		printf("BS = %d\n", BS);
+		printf("NB = %d\n", NB);
+		printf("comm_size = %d\n", comm_size);
+#endif
+		/* In this example, node rank 0 performs all the memory allocations and initializations,
+		 * and the blocks are later distributed on the other nodes.
+		 * This is not mandatory however, and blocks could be allocated on other nodes right
+		 * from the beginning, depending on the application needs (in particular for the case
+		 * where the session-wide data footprint is larger than a single node's available memory). */
+		alloc_matrices();
+		init_matrices();
+	}
+
+	/* Register matrices to StarPU and StarPU-MPI */
+	register_matrices();
+	/* Distribute C blocks */
+	distribute_matrix_C();
+
+	int b_row,b_col;
+
+	for (b_row = 0; b_row < NB; b_row++)
+	{
+		for (b_col = 0; b_col < NB; b_col++)
+		{
+			starpu_mpi_task_insert(MPI_COMM_WORLD, &gemm_cl,
+					STARPU_R,  A_h[b_row],
+					STARPU_R,  B_h[b_col],
+					STARPU_RW, C_h[b_row*NB+b_col],
+					0);
+		}
+	}
+
+	starpu_task_wait_for_all();
+
+	undistribute_matrix_C();
+	unregister_matrices();
+
+	if (comm_rank == 0)
+	{
+#if VERBOSE
+		disp_matrix(C);
+#endif
+		check_result();
+		free_matrices();
+	}
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+	return 0;
+}
+

+ 1 - 1
nmad/examples/mpi_lu/mpi_lu-double.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
nmad/examples/mpi_lu/mpi_lu-float.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
nmad/examples/mpi_lu/pdlu.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 19 - 0
nmad/examples/mpi_lu/pdlu_implicit.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2013  Université de Bordeaux
+ * Copyright (C) 2010  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-double.h"
+#include "pxlu_implicit.c"

+ 1 - 1
nmad/examples/mpi_lu/pdlu_kernels.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 45 - 29
nmad/examples/mpi_lu/plu_example.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011, 2013  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2011, 2013, 2015, 2017  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -67,31 +67,35 @@ static starpu_data_handle_t *(tmp_21_block_handles[2]);
 static TYPE **(tmp_21_block[2]);
 #endif
 
-int get_block_rank(unsigned i, unsigned j);
-
 static void parse_args(int rank, int argc, char **argv)
 {
 	int i;
-	for (i = 1; i < argc; i++) {
-		if (strcmp(argv[i], "-size") == 0) {
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-size") == 0)
+		{
 			char *argptr;
 			size = strtol(argv[++i], &argptr, 10);
 		}
 
-		if (strcmp(argv[i], "-nblocks") == 0) {
+		if (strcmp(argv[i], "-nblocks") == 0)
+		{
 			char *argptr;
 			nblocks = strtol(argv[++i], &argptr, 10);
 		}
 
-		if (strcmp(argv[i], "-check") == 0) {
+		if (strcmp(argv[i], "-check") == 0)
+		{
 			check = 1;
 		}
 
-		if (strcmp(argv[i], "-display") == 0) {
+		if (strcmp(argv[i], "-display") == 0)
+		{
 			display = 1;
 		}
 
-		if (strcmp(argv[i], "-numa") == 0) {
+		if (strcmp(argv[i], "-numa") == 0)
+		{
 #ifdef STARPU_HAVE_LIBNUMA
 			numa = 1;
 #else
@@ -100,17 +104,20 @@ static void parse_args(int rank, int argc, char **argv)
 #endif
 		}
 
-		if (strcmp(argv[i], "-p") == 0) {
+		if (strcmp(argv[i], "-p") == 0)
+		{
 			char *argptr;
 			p = strtol(argv[++i], &argptr, 10);
 		}
 
-		if (strcmp(argv[i], "-q") == 0) {
+		if (strcmp(argv[i], "-q") == 0)
+		{
 			char *argptr;
 			q = strtol(argv[++i], &argptr, 10);
 		}
 
-		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0) {
+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0)
+		{
 			fprintf(stderr,"usage: %s [-size n] [-nblocks b] [-check] [-display] [-numa] [-p p] [-q q]\n", argv[0]);
 			fprintf(stderr,"\np * q must be equal to the number of MPI nodes\n");
 			exit(0);
@@ -248,11 +255,13 @@ static void init_matrix(int rank)
 				}
 
 				/* Register it to StarPU */
-				starpu_matrix_data_register(handleptr, 0,
+				starpu_matrix_data_register(handleptr, STARPU_MAIN_RAM,
 					(uintptr_t)*blockptr, size/nblocks,
 					size/nblocks, size/nblocks, sizeof(TYPE));
+				starpu_data_set_coordinates(*handleptr, 2, j, i);
 			}
-			else {
+			else
+			{
 				*blockptr = STARPU_POISON_PTR;
 				*handleptr = STARPU_POISON_PTR;
 			}
@@ -267,7 +276,7 @@ static void init_matrix(int rank)
 #ifdef SINGLE_TMP11
 	starpu_malloc((void **)&tmp_11_block, blocksize);
 	allocated_memory_extra += blocksize;
-	starpu_matrix_data_register(&tmp_11_block_handle, 0, (uintptr_t)tmp_11_block,
+	starpu_matrix_data_register(&tmp_11_block_handle, STARPU_MAIN_RAM, (uintptr_t)tmp_11_block,
 			size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
 #else
 	tmp_11_block_handles = calloc(nblocks, sizeof(starpu_data_handle_t));
@@ -282,7 +291,7 @@ static void init_matrix(int rank)
 			allocated_memory_extra += blocksize;
 			STARPU_ASSERT(tmp_11_block[k]);
 
-			starpu_matrix_data_register(&tmp_11_block_handles[k], 0,
+			starpu_matrix_data_register(&tmp_11_block_handles[k], STARPU_MAIN_RAM,
 				(uintptr_t)tmp_11_block[k],
 				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
 		}
@@ -298,7 +307,8 @@ static void init_matrix(int rank)
 
 	allocated_memory_extra += 2*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *));
 #else
-	for (i = 0; i < 2; i++) {
+	for (i = 0; i < 2; i++)
+	{
 		tmp_12_block_handles[i] = calloc(nblocks, sizeof(starpu_data_handle_t));
 		tmp_21_block_handles[i] = calloc(nblocks, sizeof(starpu_data_handle_t));
 		tmp_12_block[i] = calloc(nblocks, sizeof(TYPE *));
@@ -317,7 +327,7 @@ static void init_matrix(int rank)
 			allocated_memory_extra += blocksize;
 			STARPU_ASSERT(tmp_12_block[k]);
 
-			starpu_matrix_data_register(&tmp_12_block_handles[k], 0,
+			starpu_matrix_data_register(&tmp_12_block_handles[k], STARPU_MAIN_RAM,
 				(uintptr_t)tmp_12_block[k],
 				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
 		}
@@ -328,19 +338,20 @@ static void init_matrix(int rank)
 			allocated_memory_extra += blocksize;
 			STARPU_ASSERT(tmp_21_block[k]);
 
-			starpu_matrix_data_register(&tmp_21_block_handles[k], 0,
+			starpu_matrix_data_register(&tmp_21_block_handles[k], STARPU_MAIN_RAM,
 				(uintptr_t)tmp_21_block[k],
 				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
 		}
 #else
-	for (i = 0; i < 2; i++) {
+	for (i = 0; i < 2; i++)
+	{
 		if (tmp_12_block_is_needed(rank, nblocks, k))
 		{
 			starpu_malloc((void **)&tmp_12_block[i][k], blocksize);
 			allocated_memory_extra += blocksize;
 			STARPU_ASSERT(tmp_12_block[i][k]);
 
-			starpu_matrix_data_register(&tmp_12_block_handles[i][k], 0,
+			starpu_matrix_data_register(&tmp_12_block_handles[i][k], STARPU_MAIN_RAM,
 				(uintptr_t)tmp_12_block[i][k],
 				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
 		}
@@ -351,7 +362,7 @@ static void init_matrix(int rank)
 			allocated_memory_extra += blocksize;
 			STARPU_ASSERT(tmp_21_block[i][k]);
 
-			starpu_matrix_data_register(&tmp_21_block_handles[i][k], 0,
+			starpu_matrix_data_register(&tmp_21_block_handles[i][k], STARPU_MAIN_RAM,
 				(uintptr_t)tmp_21_block[i][k],
 				size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
 		}
@@ -412,7 +423,8 @@ int main(int argc, char **argv)
 	 *	Initialization
 	 */
 	int thread_support;
-	if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS) {
+	if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS)
+	{
 		fprintf(stderr,"MPI_Init_thread failed\n");
 		exit(1);
 	}
@@ -421,8 +433,8 @@ int main(int argc, char **argv)
 	if (thread_support < MPI_THREAD_FUNNELED)
 		fprintf(stderr,"Warning: MPI does not have thread support!\n");
 
-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-	MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &world_size);
 
 	starpu_srand48((long int)time(NULL));
 
@@ -434,7 +446,8 @@ int main(int argc, char **argv)
 	/* We disable sequential consistency in this example */
 	starpu_data_set_default_sequential_consistency_flag(0);
 
-	starpu_mpi_init(NULL, NULL, 0);
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
 	STARPU_ASSERT(p*q == world_size);
 
@@ -459,10 +472,10 @@ int main(int argc, char **argv)
 	TYPE *a_r = NULL;
 //	STARPU_PLU(display_data_content)(a_r, size);
 
-	TYPE *x, *y;
-
 	if (check)
 	{
+		TYPE *x, *y;
+
 		x = calloc(size, sizeof(TYPE));
 		STARPU_ASSERT(x);
 
@@ -482,6 +495,9 @@ int main(int argc, char **argv)
 			STARPU_PLU(display_data_content)(a_r, size);
 
 //		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
+
+		free(x);
+		free(y);
 	}
 
 	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);

+ 1 - 1
nmad/examples/mpi_lu/plu_example_double.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
nmad/examples/mpi_lu/plu_example_float.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 369 - 0
nmad/examples/mpi_lu/plu_implicit_example.c

@@ -0,0 +1,369 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2011, 2013, 2017  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include <math.h>
+#include <starpu.h>
+
+#include "pxlu.h"
+//#include "pxlu_kernels.h"
+
+#ifdef STARPU_HAVE_LIBNUMA
+#include <numaif.h>
+#endif
+
+/* Order of the (square) matrix; default 4096, overridden by -size */
+static unsigned long size = 4096;
+/* Number of blocks per matrix dimension; default 16, overridden by -nblocks */
+static unsigned nblocks = 16;
+/* Set to 1 by -check: main() then reconstructs the matrix and calls compute_lu_matrix */
+static unsigned check = 0;
+/* Dimensions of the process grid (-p / -q); main() asserts that p*q equals the MPI world size */
+static int p = 1;
+static int q = 1;
+/* Set to 1 by -display; returned by display_flag() and tested by display_grid() */
+static unsigned display = 0;
+
+#ifdef STARPU_HAVE_LIBNUMA
+/* Set to 1 by -numa: init_matrix() then requests an interleaved memory policy */
+static unsigned numa = 0;
+#endif
+
+/* Bytes allocated by this rank for matrix blocks */
+static size_t allocated_memory = 0;
+/* Bytes allocated by this rank for the handle and pointer grids */
+static size_t allocated_memory_extra = 0;
+
+/* nblocks*nblocks grids of StarPU handles and block pointers, filled by init_matrix() */
+static starpu_data_handle_t *dataA_handles;
+static TYPE **dataA;
+
+/* Forward declaration: defined below, used by init_matrix() and display_grid() */
+int get_block_rank(unsigned i, unsigned j);
+
+/* Return the value that follows option argv[*i] and advance *i onto it.
+ * Exits with an error message when the option is the last argument, so that
+ * argv[argc] (which is NULL) is never handed to strtol() or strcmp(). */
+static char *next_option_value(int argc, char **argv, int *i)
+{
+	if (*i + 1 >= argc)
+	{
+		fprintf(stderr, "%s: option %s requires a value\n", argv[0], argv[*i]);
+		exit(1);
+	}
+
+	*i += 1;
+	return argv[*i];
+}
+
+/* Parse the command line and update the file-scope settings
+ * (size, nblocks, check, display, numa, p, q).
+ *
+ * Recognised options: -size n, -nblocks b, -check, -display, -numa, -p p,
+ * -q q, and -h / -help / --help which prints the usage and exits.
+ * Unknown arguments are silently ignored.
+ *
+ * The tests are deliberately independent "if"s (not an else-if chain): once
+ * an option has consumed its value, argv[i] designates that value for the
+ * remaining tests of the same iteration, exactly as before. */
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-size") == 0)
+		{
+			size = strtol(next_option_value(argc, argv, &i), NULL, 10);
+		}
+
+		if (strcmp(argv[i], "-nblocks") == 0)
+		{
+			nblocks = strtol(next_option_value(argc, argv, &i), NULL, 10);
+		}
+
+		if (strcmp(argv[i], "-check") == 0)
+		{
+			check = 1;
+		}
+
+		if (strcmp(argv[i], "-display") == 0)
+		{
+			display = 1;
+		}
+
+		if (strcmp(argv[i], "-numa") == 0)
+		{
+#ifdef STARPU_HAVE_LIBNUMA
+			numa = 1;
+#else
+			fprintf(stderr, "Warning: libnuma is not available\n");
+#endif
+		}
+
+		if (strcmp(argv[i], "-p") == 0)
+		{
+			p = strtol(next_option_value(argc, argv, &i), NULL, 10);
+		}
+
+		if (strcmp(argv[i], "-q") == 0)
+		{
+			q = strtol(next_option_value(argc, argv, &i), NULL, 10);
+		}
+
+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0)
+		{
+			fprintf(stderr,"usage: %s [-size n] [-nblocks b] [-check] [-display] [-numa] [-p p] [-q q]\n", argv[0]);
+			fprintf(stderr,"\np * q must be equal to the number of MPI nodes\n");
+			exit(0);
+		}
+	}
+}
+
+/* Accessor for the file-scope `display` flag (set to 1 by the -display option). */
+unsigned STARPU_PLU(display_flag)(void)
+{
+	return display;
+}
+
+/* Fill a square block of (psize/pnblocks) x (psize/pnblocks) elements with
+ * values drawn from starpu_drand48(), in increasing index order. */
+static void fill_block_with_random(TYPE *blockptr, unsigned psize, unsigned pnblocks)
+{
+	const unsigned edge = psize/pnblocks;
+	unsigned row;
+
+	for (row = 0; row < edge; row++)
+	{
+		/* Start of the current line of the block */
+		TYPE *line = &blockptr[row*edge];
+		unsigned col = 0;
+
+		while (col < edge)
+		{
+			line[col] = (TYPE)starpu_drand48();
+			col++;
+		}
+	}
+}
+
+/* Build the distributed matrix for MPI process `rank`.
+ *
+ * Two nblocks*nblocks grids are allocated: dataA_handles (StarPU handles) and
+ * dataA (block pointers). For every block (i, j):
+ *  - if get_block_rank(i, j) == rank, the block is allocated with
+ *    starpu_malloc(), filled with random values and registered in main RAM;
+ *    blocks with i == j get 10*nblocks added to each diagonal element
+ *    (presumably to keep the matrix well conditioned for the factorisation);
+ *  - otherwise a handle without local storage is registered (home node -1)
+ *    and the block pointer is set to STARPU_POISON_PTR.
+ * Every handle is then given its coordinates and registered to StarPU-MPI
+ * with tag j+i*nblocks and its owner rank.
+ *
+ * When built with libnuma and -numa was given, an interleaved memory policy
+ * is requested first. */
+static void init_matrix(int rank)
+{
+#ifdef STARPU_HAVE_LIBNUMA
+	if (numa)
+	{
+		fprintf(stderr, "Using INTERLEAVE policy\n");
+		unsigned long nodemask = ((1<<0)|(1<<1));
+		int ret = set_mempolicy(MPOL_INTERLEAVE, &nodemask, 3);
+		if (ret)
+			perror("set_mempolicy failed");
+	}
+#endif
+
+	/* Allocate a grid of data handles, not all of them have to be allocated later on */
+	dataA_handles = calloc(nblocks*nblocks, sizeof(starpu_data_handle_t));
+	STARPU_ASSERT(dataA_handles);
+	dataA = calloc(nblocks*nblocks, sizeof(TYPE *));
+	STARPU_ASSERT(dataA);
+	allocated_memory_extra += nblocks*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *));
+
+	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
+
+	/* Allocate all the blocks that belong to this mpi node.
+	 * The indices are plain unsigned, like the parameters of get_block_rank(),
+	 * because they are also passed through the variadic
+	 * starpu_data_set_coordinates() (presumably read there as int-sized
+	 * values -- confirm against the StarPU API). */
+	unsigned i, j;
+	for (j = 0; j < nblocks; j++)
+	{
+		for (i = 0; i < nblocks; i++)
+		{
+			int block_rank = get_block_rank(i, j);
+			TYPE **blockptr = &dataA[j+i*nblocks];
+			starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i];
+
+			if (block_rank == rank)
+			{
+				/* This block should be treated by the current MPI process */
+				/* Allocate and fill it */
+				starpu_malloc((void **)blockptr, blocksize);
+				STARPU_ASSERT(*blockptr);
+				allocated_memory += blocksize;
+
+				fill_block_with_random(*blockptr, size, nblocks);
+				if (i == j)
+				{
+					/* Reinforce the diagonal of the diagonal blocks */
+					unsigned tmp;
+					for (tmp = 0; tmp < size/nblocks; tmp++)
+					{
+						(*blockptr)[tmp*((size/nblocks)+1)] += (TYPE)10*nblocks;
+					}
+				}
+
+				/* Register it to StarPU */
+				starpu_matrix_data_register(handleptr, STARPU_MAIN_RAM,
+					(uintptr_t)*blockptr, size/nblocks,
+					size/nblocks, size/nblocks, sizeof(TYPE));
+			}
+			else
+			{
+				/* Block owned by another process: no local storage */
+				starpu_matrix_data_register(handleptr, -1,
+					0, size/nblocks,
+					size/nblocks, size/nblocks, sizeof(TYPE));
+				*blockptr = STARPU_POISON_PTR;
+			}
+			starpu_data_set_coordinates(*handleptr, 2, j, i);
+			starpu_mpi_data_register(*handleptr, j+i*nblocks, block_rank);
+		}
+	}
+
+	//display_all_blocks(nblocks, size/nblocks);
+}
+
+/* Return the pointer stored for block (i, j) in the dataA grid (index j+i*nblocks).
+ * init_matrix() stores STARPU_POISON_PTR there for blocks owned by another rank. */
+TYPE *STARPU_PLU(get_block)(unsigned i, unsigned j)
+{
+	return dataA[j+i*nblocks];
+}
+
+/* Return the MPI rank owning block (i, j), using a 2D block cyclic
+ * distribution over the p x q process grid
+ * (p is for "direction" i, q for "direction" j). */
+int get_block_rank(unsigned i, unsigned j)
+{
+	const unsigned pos_i = i % p;
+	const unsigned pos_j = j % q;
+
+	return pos_j * p + pos_i;
+}
+
+/* Return the StarPU handle stored for block (i, j) in the dataA_handles grid (index j+i*nblocks). */
+starpu_data_handle_t STARPU_PLU(get_block_handle)(unsigned i, unsigned j)
+{
+	return dataA_handles[j+i*nblocks];
+}
+
+/* When the display flag is set, print on stderr, for each block of the
+ * pnblocks x pnblocks grid, its owner rank, its data pointer and its handle.
+ * One line is printed per value of j. Does nothing when display is 0. */
+static void display_grid(int rank, unsigned pnblocks)
+{
+	unsigned bi, bj;
+
+	if (display)
+	{
+		fprintf(stderr, "2D grid layout (Rank %d): \n", rank);
+
+		for (bj = 0; bj < pnblocks; bj++)
+		{
+			for (bi = 0; bi < pnblocks; bi++)
+			{
+				TYPE *data = STARPU_PLU(get_block)(bi, bj);
+				starpu_data_handle_t data_handle = STARPU_PLU(get_block_handle)(bi, bj);
+				int owner = get_block_rank(bi, bj);
+
+				fprintf(stderr, "%d (data %p handle %p)", owner, data, data_handle);
+			}
+			fprintf(stderr, "\n");
+		}
+	}
+}
+
+/* Entry point of the implicit (task-insert based) distributed LU example.
+ *
+ * Steps: parse the options, initialise StarPU and StarPU-MPI, check that the
+ * p x q process grid matches the MPI world size, build the distributed matrix,
+ * optionally keep a reconstructed copy of it (-check), run plu_main(), report
+ * the timing on rank 0, optionally call compute_lu_matrix (-check), then
+ * release the data and shut everything down.
+ *
+ * Returns 0; initialisation failures abort through STARPU_CHECK_RETURN_VALUE
+ * and STARPU_ASSERT. */
+int main(int argc, char **argv)
+{
+	int rank;
+	int world_size;
+
+	starpu_srand48((long int)time(NULL));
+
+	parse_args(argc, argv);
+
+	int ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &world_size);
+
+	STARPU_ASSERT(p*q == world_size);
+
+	starpu_cublas_init();
+
+	/*
+	 * 	Problem Init
+	 */
+
+	init_matrix(rank);
+
+	fprintf(stderr, "Rank %d: allocated (%d + %d) MB = %d MB\n", rank,
+			(int)(allocated_memory/(1024*1024)),
+			(int)(allocated_memory_extra/(1024*1024)),
+			(int)((allocated_memory+allocated_memory_extra)/(1024*1024)));
+
+	display_grid(rank, nblocks);
+
+	/* Copy of the initial matrix, only built when -check is given */
+	TYPE *a_r = NULL;
+
+	if (check)
+	{
+		TYPE *x, *y;
+
+		x = calloc(size, sizeof(TYPE));
+		STARPU_ASSERT(x);
+
+		y = calloc(size, sizeof(TYPE));
+		STARPU_ASSERT(y);
+
+		if (rank == 0)
+		{
+			unsigned ind;
+			for (ind = 0; ind < size; ind++)
+				x[ind] = (TYPE)starpu_drand48();
+		}
+
+		a_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);
+
+		if (rank == 0)
+			STARPU_PLU(display_data_content)(a_r, size);
+
+//		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
+
+		free(x);
+		free(y);
+	}
+
+	double timing = STARPU_PLU(plu_main)(nblocks, rank, world_size);
+
+	/*
+	 * 	Report performance
+	 */
+
+	if (rank == 0)
+	{
+		fprintf(stderr, "Computation took: %f ms\n", timing/1000);
+
+		unsigned n = size;
+		double flop = (2.0f*n*n*n)/3.0f;
+		fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+	}
+
+	/*
+	 *	Test Result Correctness
+	 */
+
+	if (check)
+	{
+		/*
+		 *	Compute || A - LU ||
+		 */
+
+		STARPU_PLU(compute_lu_matrix)(size, nblocks, a_r);
+
+#if 0
+		/*
+		 *	Compute || Ax - LUx ||
+		 */
+
+		unsigned ind;
+
+		y2 = calloc(size, sizeof(TYPE));
+		STARPU_ASSERT(y);
+
+		if (rank == 0)
+		{
+			for (ind = 0; ind < size; ind++)
+			{
+				y2[ind] = (TYPE)0.0;
+			}
+		}
+
+		STARPU_PLU(compute_lux)(size, x, y2, nblocks, rank);
+
+		/* Compute y2 = y2 - y */
+		CPU_AXPY(size, -1.0, y, 1, y2, 1);
+
+		TYPE err = CPU_ASUM(size, y2, 1);
+		int max = CPU_IAMAX(size, y2, 1);
+
+		fprintf(stderr, "(A - LU)X Avg error : %e\n", err/(size*size));
+		fprintf(stderr, "(A - LU)X Max error : %e\n", y2[max]);
+#endif
+	}
+
+	/*
+	 * 	Termination
+	 */
+
+	/* Release every handle registered by init_matrix(), then the storage of
+	 * the blocks owned by this rank (the other entries hold STARPU_POISON_PTR)
+	 * and finally the two grids themselves. */
+	unsigned bi, bj;
+	for (bj = 0; bj < nblocks; bj++)
+	{
+		for (bi = 0; bi < nblocks; bi++)
+		{
+			unsigned idx = bj+nblocks*bi;
+
+			starpu_data_unregister(dataA_handles[idx]);
+			if (dataA[idx] != STARPU_POISON_PTR)
+				starpu_free(dataA[idx]);
+		}
+	}
+	free(dataA_handles);
+	free(dataA);
+
+	starpu_cublas_shutdown();
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	return 0;
+}

+ 19 - 0
nmad/examples/mpi_lu/plu_implicit_example_double.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2013  Université de Bordeaux
+ * Copyright (C) 2010  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-double.h"
+#include "plu_implicit_example.c"

+ 19 - 0
nmad/examples/mpi_lu/plu_implicit_example_float.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2013  Université de Bordeaux
+ * Copyright (C) 2010  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-float.h"
+#include "plu_implicit_example.c"

+ 402 - 0
nmad/examples/mpi_lu/plu_outofcore_example.c

@@ -0,0 +1,402 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2011, 2013-2014, 2017  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <time.h>
+#include <math.h>
+#include <starpu.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+
+#include "pxlu.h"
+//#include "pxlu_kernels.h"
+
+#ifdef STARPU_HAVE_LIBNUMA
+#include <numaif.h>
+#endif
+
+/* Order of the (square) matrix; default 4096, overridden by -size */
+static unsigned long size = 4096;
+/* Number of blocks per matrix dimension; default 16, overridden by -nblocks */
+static unsigned nblocks = 16;
+/* Set to 1 by -check: main() then reconstructs the matrix and calls compute_lu_matrix */
+static unsigned check = 0;
+/* Dimensions of the process grid (-p / -q); main() asserts that p*q equals the MPI world size */
+static int p = 1;
+static int q = 1;
+/* Set to 1 by -display; returned by display_flag() */
+static unsigned display = 0;
+/* Directory holding one file per block; overridden by -path */
+static char *path = "./starpu-ooc-files";
+
+#ifdef STARPU_HAVE_LIBNUMA
+/* Set to 1 by -numa */
+static unsigned numa = 0;
+#endif
+
+/* Bytes written to disk by create_matrix() */
+static size_t allocated_memory = 0;
+
+/* nblocks*nblocks grid of StarPU handles, filled by init_matrix() */
+static starpu_data_handle_t *dataA_handles;
+
+/* Forward declaration: defined below, used by init_matrix() */
+int get_block_rank(unsigned i, unsigned j);
+
+/* Return the value that follows option argv[*i] and advance *i onto it.
+ * Exits with an error message when the option is the last argument, so that
+ * argv[argc] (which is NULL) is never used as a number or as a path. */
+static char *next_option_value(int argc, char **argv, int *i)
+{
+	if (*i + 1 >= argc)
+	{
+		fprintf(stderr, "%s: option %s requires a value\n", argv[0], argv[*i]);
+		exit(1);
+	}
+
+	*i += 1;
+	return argv[*i];
+}
+
+/* Parse the command line and update the file-scope settings
+ * (size, nblocks, check, display, numa, p, q, path).
+ *
+ * Recognised options: -size n, -nblocks b, -check, -display, -numa, -p p,
+ * -q q, -path PATH, and -h / -help / --help which prints the usage and exits.
+ * Unknown arguments are silently ignored.
+ *
+ * The tests are deliberately independent "if"s (not an else-if chain): once
+ * an option has consumed its value, argv[i] designates that value for the
+ * remaining tests of the same iteration, exactly as before. */
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-size") == 0)
+		{
+			size = strtol(next_option_value(argc, argv, &i), NULL, 10);
+		}
+
+		if (strcmp(argv[i], "-nblocks") == 0)
+		{
+			nblocks = strtol(next_option_value(argc, argv, &i), NULL, 10);
+		}
+
+		if (strcmp(argv[i], "-check") == 0)
+		{
+			check = 1;
+		}
+
+		if (strcmp(argv[i], "-display") == 0)
+		{
+			display = 1;
+		}
+
+		if (strcmp(argv[i], "-numa") == 0)
+		{
+#ifdef STARPU_HAVE_LIBNUMA
+			numa = 1;
+#else
+			fprintf(stderr, "Warning: libnuma is not available\n");
+#endif
+		}
+
+		if (strcmp(argv[i], "-p") == 0)
+		{
+			p = strtol(next_option_value(argc, argv, &i), NULL, 10);
+		}
+
+		if (strcmp(argv[i], "-q") == 0)
+		{
+			q = strtol(next_option_value(argc, argv, &i), NULL, 10);
+		}
+
+		if (strcmp(argv[i], "-path") == 0)
+		{
+			path = next_option_value(argc, argv, &i);
+		}
+
+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0)
+		{
+			fprintf(stderr,"usage: %s [-size n] [-nblocks b] [-check] [-display] [-numa] [-p p] [-q q] [-path PATH]\n", argv[0]);
+			fprintf(stderr,"\np * q must be equal to the number of MPI nodes\n");
+			exit(0);
+		}
+	}
+}
+
+/* Accessor for the file-scope `display` flag (set to 1 by the -display option). */
+unsigned STARPU_PLU(display_flag)(void)
+{
+	return display;
+}
+
+/* Fill a square block of (psize/pnblocks) x (psize/pnblocks) elements with
+ * values drawn from starpu_drand48(), in increasing index order. */
+static void fill_block_with_random(TYPE *blockptr, unsigned psize, unsigned pnblocks)
+{
+	const unsigned edge = psize/pnblocks;
+	unsigned row;
+
+	for (row = 0; row < edge; row++)
+	{
+		/* Start of the current line of the block */
+		TYPE *line = &blockptr[row*edge];
+		unsigned col = 0;
+
+		while (col < edge)
+		{
+			line[col] = (TYPE)starpu_drand48();
+			col++;
+		}
+	}
+}
+
+/* Create the whole matrix on disk, one file "<path>/<i>,<j>" per block.
+ *
+ * Each block is filled with random values; blocks with i == j get 10*nblocks
+ * added to each of their diagonal elements. The total number of bytes written
+ * is added to allocated_memory. Any allocation, open, write or close failure
+ * terminates the program with exit(1). */
+static void create_matrix(void)
+{
+	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
+	/* "<path>" + '/' + "<i>" + ',' + "<j>" + NUL; three characters per byte
+	 * of the index type are enough for its decimal representation */
+	unsigned filename_length = strlen(path) + 1 + sizeof(nblocks)*3 + 1 + sizeof(nblocks)*3 + 1;
+	TYPE *blockptr = malloc(blocksize);
+	char *filename = malloc(filename_length);
+	int fd;
+
+	/* malloc(0) may legitimately return NULL, hence the blocksize test */
+	if ((!blockptr && blocksize > 0) || !filename)
+	{
+		perror("malloc");
+		exit(1);
+	}
+
+	allocated_memory += nblocks*nblocks*blocksize;
+
+	/* Create the whole matrix on the disk */
+	unsigned i,j;
+	for (j = 0; j < nblocks; j++)
+	{
+		for (i = 0; i < nblocks; i++)
+		{
+			fill_block_with_random(blockptr, size, nblocks);
+			if (i == j)
+			{
+				/* Reinforce the diagonal of the diagonal blocks */
+				unsigned tmp;
+				for (tmp = 0; tmp < size/nblocks; tmp++)
+				{
+					blockptr[tmp*((size/nblocks)+1)] += (TYPE)10*nblocks;
+				}
+			}
+			snprintf(filename, filename_length, "%s/%u,%u", path, i, j);
+			fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, 0777);
+			if (fd < 0)
+			{
+				perror("open");
+				exit(1);
+			}
+			if (write(fd, blockptr, blocksize) != (starpu_ssize_t) blocksize)
+			{
+				fprintf(stderr,"short write");
+				exit(1);
+			}
+			if (close(fd) < 0)
+			{
+				perror("close");
+				exit(1);
+			}
+		}
+	}
+
+	free(blockptr);
+	free(filename);
+}
+
+/* Register the on-disk matrix to StarPU and StarPU-MPI for MPI process `rank`.
+ *
+ * The directory `path` is registered as a StarPU disk node. For every block
+ * (i, j):
+ *  - if get_block_rank(i, j) == rank, the file "<i>,<j>" is opened on the
+ *    disk node and registered as a matrix located on that node (the program
+ *    exits if the file cannot be opened);
+ *  - otherwise a handle without local storage is registered (home node -1).
+ * Every handle is then given its coordinates and registered to StarPU-MPI
+ * with tag j+i*nblocks and its owner rank. */
+static void init_matrix(int rank)
+{
+	/* Allocate a grid of data handles, not all of them have to be allocated later on */
+	dataA_handles = calloc(nblocks*nblocks, sizeof(starpu_data_handle_t));
+	STARPU_ASSERT(dataA_handles);
+
+	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
+
+	int disk_node = starpu_disk_register(&starpu_disk_unistd_ops, path, STARPU_MAX(1024*1024, size*size*sizeof(TYPE)));
+	assert(disk_node >= 0);
+
+	/* "<i>,<j>" + NUL, relative to the disk node directory */
+	char filename[sizeof(nblocks)*3 + 1 + sizeof(nblocks)*3 + 1];
+
+	/* Allocate all the blocks that belong to this mpi node */
+	unsigned i,j;
+	for (j = 0; j < nblocks; j++)
+	{
+		for (i = 0; i < nblocks; i++)
+		{
+			int block_rank = get_block_rank(i, j);
+			starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i];
+
+			if (block_rank == rank)
+			{
+				void *disk_obj;
+				snprintf(filename, sizeof(filename), "%u,%u", i, j);
+				/* Register it to StarPU */
+				disk_obj = starpu_disk_open(disk_node, filename, blocksize);
+				if (!disk_obj)
+				{
+					fprintf(stderr,"could not open %s\n", filename);
+					exit(1);
+				}
+				starpu_matrix_data_register(handleptr, disk_node,
+					(uintptr_t) disk_obj, size/nblocks,
+					size/nblocks, size/nblocks, sizeof(TYPE));
+			}
+			else
+			{
+				/* Block owned by another process: no local storage */
+				starpu_matrix_data_register(handleptr, -1,
+					0, size/nblocks,
+					size/nblocks, size/nblocks, sizeof(TYPE));
+			}
+			starpu_data_set_coordinates(*handleptr, 2, j, i);
+			starpu_mpi_data_register(*handleptr, j+i*nblocks, block_rank);
+		}
+	}
+
+	//display_all_blocks(nblocks, size/nblocks);
+}
+
+/* Block accessor required by the common PLU code. This example keeps no
+ * in-memory grid of block pointers (blocks are registered from disk objects),
+ * so the function must never be called: it asserts unconditionally. */
+TYPE *STARPU_PLU(get_block)(unsigned i, unsigned j)
+{
+	(void) i;
+	(void) j;
+
+	/* This does not really make sense in out of core */
+	assert(0);
+
+	/* Keep the behaviour defined when assertions are compiled out (NDEBUG) */
+	return NULL;
+}
+
+/* Return the MPI rank owning block (i, j), using a 2D block cyclic
+ * distribution over the p x q process grid
+ * (p is for "direction" i, q for "direction" j). */
+int get_block_rank(unsigned i, unsigned j)
+{
+	const unsigned pos_i = i % p;
+	const unsigned pos_j = j % q;
+
+	return pos_j * p + pos_i;
+}
+
+/* Return the StarPU handle stored for block (i, j) in the dataA_handles grid (index j+i*nblocks). */
+starpu_data_handle_t STARPU_PLU(get_block_handle)(unsigned i, unsigned j)
+{
+	return dataA_handles[j+i*nblocks];
+}
+
+/* Entry point of the out-of-core distributed LU example.
+ *
+ * Steps: parse the options, make sure the directory `path` exists,
+ * initialise StarPU and StarPU-MPI, check that the p x q process grid matches
+ * the MPI world size, let rank 0 write the matrix to disk, wait on a barrier,
+ * register the blocks, optionally keep a reconstructed copy of the matrix
+ * (-check), run plu_main(), report the timing on rank 0, optionally call
+ * compute_lu_matrix (-check), then unregister the data and shut down.
+ *
+ * Returns 0; exits with status 1 when the directory cannot be created. */
+int main(int argc, char **argv)
+{
+	int rank;
+	int world_size;
+	int ret;
+	unsigned i, j;
+
+	starpu_srand48((long int)time(NULL));
+
+	parse_args(argc, argv);
+
+	ret = mkdir(path, 0777);
+	if (ret != 0 && errno != EEXIST)
+	{
+		/* Report the actual cause (permissions, missing parent directory, ...) */
+		fprintf(stderr,"could not create directory %s: %s\n", path, strerror(errno));
+		exit(1);
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &world_size);
+
+	STARPU_ASSERT(p*q == world_size);
+
+	starpu_cublas_init();
+
+	/*
+	 * 	Problem Init
+	 */
+
+	/* Only rank 0 writes the block files; the others wait for it */
+	if (rank == 0)
+		create_matrix();
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+
+	init_matrix(rank);
+
+	if (rank == 0)
+		fprintf(stderr, "%dMB on disk\n", (int)(allocated_memory/(1024*1024)));
+
+	/* Copy of the initial matrix, only built when -check is given */
+	TYPE *a_r = NULL;
+
+	if (check)
+	{
+		TYPE *x, *y;
+
+		x = calloc(size, sizeof(TYPE));
+		STARPU_ASSERT(x);
+
+		y = calloc(size, sizeof(TYPE));
+		STARPU_ASSERT(y);
+
+		if (rank == 0)
+		{
+			unsigned ind;
+			for (ind = 0; ind < size; ind++)
+				x[ind] = (TYPE)starpu_drand48();
+		}
+
+		a_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);
+
+		if (rank == 0)
+			STARPU_PLU(display_data_content)(a_r, size);
+
+//		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
+
+		free(x);
+		free(y);
+	}
+
+	double timing = STARPU_PLU(plu_main)(nblocks, rank, world_size);
+
+	/*
+	 * 	Report performance
+	 */
+
+	if (rank == 0)
+	{
+		fprintf(stderr, "Computation took: %f ms\n", timing/1000);
+
+		unsigned n = size;
+		double flop = (2.0f*n*n*n)/3.0f;
+		fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+	}
+
+	/*
+	 *	Test Result Correctness
+	 */
+
+	if (check)
+	{
+		/*
+		 *	Compute || A - LU ||
+		 */
+
+		STARPU_PLU(compute_lu_matrix)(size, nblocks, a_r);
+
+#if 0
+		/*
+		 *	Compute || Ax - LUx ||
+		 */
+
+		unsigned ind;
+
+		y2 = calloc(size, sizeof(TYPE));
+		STARPU_ASSERT(y);
+
+		if (rank == 0)
+		{
+			for (ind = 0; ind < size; ind++)
+			{
+				y2[ind] = (TYPE)0.0;
+			}
+		}
+
+		STARPU_PLU(compute_lux)(size, x, y2, nblocks, rank);
+
+		/* Compute y2 = y2 - y */
+		CPU_AXPY(size, -1.0, y, 1, y2, 1);
+
+		TYPE err = CPU_ASUM(size, y2, 1);
+		int max = CPU_IAMAX(size, y2, 1);
+
+		fprintf(stderr, "(A - LU)X Avg error : %e\n", err/(size*size));
+		fprintf(stderr, "(A - LU)X Max error : %e\n", y2[max]);
+#endif
+	}
+
+	/*
+	 * 	Termination
+	 */
+	for (j = 0; j < nblocks; j++)
+	{
+		for (i = 0; i < nblocks; i++)
+		{
+			starpu_data_unregister(dataA_handles[j+nblocks*i]);
+		}
+	}
+	/* The handle grid itself was allocated by init_matrix() */
+	free(dataA_handles);
+
+	starpu_cublas_shutdown();
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	return 0;
+}

+ 19 - 0
nmad/examples/mpi_lu/plu_outofcore_example_double.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2013  Université de Bordeaux
+ * Copyright (C) 2010  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-double.h"
+#include "plu_outofcore_example.c"

+ 19 - 0
nmad/examples/mpi_lu/plu_outofcore_example_float.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2013  Université de Bordeaux
+ * Copyright (C) 2010  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-float.h"
+#include "plu_outofcore_example.c"

+ 11 - 7
nmad/examples/mpi_lu/plu_solve.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2014  Université de Bordeaux
+ * Copyright (C) 2010, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -233,13 +233,13 @@ TYPE *STARPU_PLU(reconstruct_matrix)(unsigned size, unsigned nblocks)
 	unsigned block_size = size/nblocks;
 
 	int rank;
-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
 
 	unsigned bi, bj;
 	for (bj = 0; bj < nblocks; bj++)
 	for (bi = 0; bi < nblocks; bi++)
 	{
-		TYPE *block;
+		TYPE *block = NULL;
 
 		int block_rank = get_block_rank(bi, bj);
 
@@ -247,7 +247,8 @@ TYPE *STARPU_PLU(reconstruct_matrix)(unsigned size, unsigned nblocks)
 		{
 			block = STARPU_PLU(get_block)(bi, bj);
 		}
-		else {
+		else
+		{
 			MPI_Status status;
 
 			if (rank == 0)
@@ -257,7 +258,8 @@ TYPE *STARPU_PLU(reconstruct_matrix)(unsigned size, unsigned nblocks)
 				int ret = MPI_Recv(block, block_size*block_size, MPI_TYPE, block_rank, 0, MPI_COMM_WORLD, &status);
 				STARPU_ASSERT(ret == MPI_SUCCESS);
 			}
-			else if (rank == block_rank) {
+			else if (rank == block_rank)
+			{
 				block = STARPU_PLU(get_block)(bi, bj);
 				int ret = MPI_Send(block, block_size*block_size, MPI_TYPE, 0, 0, MPI_COMM_WORLD);
 				STARPU_ASSERT(ret == MPI_SUCCESS);
@@ -331,7 +333,7 @@ void STARPU_PLU(compute_lu_matrix)(unsigned size, unsigned nblocks, TYPE *Asaved
 	unsigned display = STARPU_PLU(display_flag)();
 
 	int rank;
-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
 
 	if (rank == 0)
 	{
@@ -390,4 +392,6 @@ void STARPU_PLU(compute_lu_matrix)(unsigned size, unsigned nblocks, TYPE *Asaved
 
 		fprintf(stderr, "||A-LU|| / (||A||*N) : %e\n", residual/(matnorm*size));
 	}
+
+	free(all_r);
 }

+ 1 - 1
nmad/examples/mpi_lu/plu_solve_double.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
nmad/examples/mpi_lu/plu_solve_float.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
nmad/examples/mpi_lu/pslu.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 19 - 0
nmad/examples/mpi_lu/pslu_implicit.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2013  Université de Bordeaux
+ * Copyright (C) 2010  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-float.h"
+#include "pxlu_implicit.c"

+ 1 - 1
nmad/examples/mpi_lu/pslu_kernels.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 138 - 89
nmad/examples/mpi_lu/pxlu.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011, 2014  Université de Bordeaux
- * Copyright (C) 2010, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2014, 2017  Université de Bordeaux
+ * Copyright (C) 2010, 2012, 2013, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -58,7 +58,8 @@ static unsigned nblocks = 0;
 static int rank = -1;
 static int world_size = -1;
 
-struct callback_arg {
+struct callback_arg
+{
 	unsigned i, j, k;
 };
 
@@ -104,7 +105,8 @@ static void send_data_to_mask(starpu_data_handle_t handle, int *rank_mask, int m
 	int r;
 	for (r = 0; r < world_size; r++)
 	{
-		if (rank_mask[r]) {
+		if (rank_mask[r])
+		{
 			rank_array[cnt] = r;
 
 			comm_array[cnt] = MPI_COMM_WORLD;
@@ -120,7 +122,8 @@ static void send_data_to_mask(starpu_data_handle_t handle, int *rank_mask, int m
 		 * once */
 		starpu_tag_notify_from_apps(tag);
 	}
-	else {
+	else
+	{
 		starpu_mpi_isend_array_detached_unlock_tag(cnt, handle_array,
 				rank_array, mpi_tag_array, comm_array, tag);
 	}
@@ -129,7 +132,8 @@ static void send_data_to_mask(starpu_data_handle_t handle, int *rank_mask, int m
 /* Initiate a receive request once all dependencies are fulfilled and unlock
  * tag 'unlocked_tag' once it's done. */
 
-struct recv_when_done_callback_arg {
+struct recv_when_done_callback_arg
+{
 	int source;
 	int mpi_tag;
 	starpu_data_handle_t handle;
@@ -156,7 +160,7 @@ static void receive_when_deps_are_done(unsigned ndeps, starpu_tag_t *deps_tags,
 
 	struct recv_when_done_callback_arg *arg =
 		malloc(sizeof(struct recv_when_done_callback_arg));
-	
+
 	arg->source = source;
 	arg->mpi_tag = mpi_tag;
 	arg->handle = handle;
@@ -186,24 +190,29 @@ static void create_task_11_recv(unsigned k)
 	 * 21(k-1)i with i,j >= k */
 	unsigned ndeps = 0;
 	starpu_tag_t tag_array[2*nblocks];
-	
+
 #ifdef SINGLE_TMP11
-	unsigned i, j;
 	if (k > 0)
-	for (i = (k-1)+1; i < nblocks; i++)
 	{
-		if (rank == get_block_rank(i, k-1))
-			tag_array[ndeps++] = TAG21(k-1, i);
+		unsigned i;
+		for (i = (k-1)+1; i < nblocks; i++)
+		{
+			if (rank == get_block_rank(i, k-1))
+				tag_array[ndeps++] = TAG21(k-1, i);
+		}
 	}
 
 	if (k > 0)
-	for (j = (k-1)+1; j < nblocks; j++)
 	{
-		if (rank == get_block_rank(k-1, j))
-			tag_array[ndeps++] = TAG12(k-1, j);
+		unsigned j;
+		for (j = (k-1)+1; j < nblocks; j++)
+		{
+			if (rank == get_block_rank(k-1, j))
+				tag_array[ndeps++] = TAG12(k-1, j);
+		}
 	}
 #endif
-	
+
 	int source = get_block_rank(k, k);
 #ifdef SINGLE_TMP11
 	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_11_block_handle)();
@@ -254,7 +263,7 @@ static void callback_task_11_real(void *_arg)
 	starpu_tag_t tag = TAG11_SAVE(k);
 	int mpi_tag = MPI_TAG11(k);
 	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
-	
+
 	free(arg);
 }
 
@@ -280,10 +289,12 @@ static void create_task_11_real(unsigned k)
 		task->priority = STARPU_MAX_PRIO;
 
 	/* enforce dependencies ... */
-	if (k > 0) {
+	if (k > 0)
+	{
 		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
 	}
-	else {
+	else
+	{
 		starpu_tag_declare_deps(TAG11(k), 1, STARPU_TAG_INIT);
 	}
 
@@ -296,25 +307,27 @@ static void create_task_11(unsigned k)
 	if (get_block_rank(k, k) == rank)
 	{
 #ifdef VERBOSE_INIT
-		fprintf(stderr, "CREATE real task 11(%d) (TAG11_SAVE(%d) = %lx) on node %d\n", k, k, TAG11_SAVE(k), rank);
+		/* the tag is printed in hexadecimal, as the previous %lx format did */
+		fprintf(stderr, "CREATE real task 11(%u) (TAG11_SAVE(%u) = %llx) on node %d\n", k, k, (unsigned long long) TAG11_SAVE(k), rank);
 #endif
 		create_task_11_real(k);
 	}
-	else {
+	else
+	{
 		/* We don't handle the task, but perhaps we have to generate MPI transfers. */
 		int rank_mask[world_size];
 		find_nodes_using_11(k, rank_mask);
-		
+
 		if (rank_mask[rank])
 		{
 #ifdef VERBOSE_INIT
-			fprintf(stderr, "create RECV task 11(%d) on node %d\n", k, rank);
+			fprintf(stderr, "create RECV task 11(%u) on node %d\n", k, rank);
 #endif
 			create_task_11_recv(k);
 		}
-		else {
+		else
+		{
 #ifdef VERBOSE_INIT
-			fprintf(stderr, "Node %d needs not 11(%d)\n", rank, k);
+			fprintf(stderr, "Node %d needs not 11(%u)\n", rank, k);
 #endif
 		}
 	}
@@ -328,8 +341,6 @@ static void create_task_11(unsigned k)
 
 static void create_task_12_recv(unsigned k, unsigned j)
 {
-	unsigned i;
-
 	/* The current node is not computing that task, so we receive the block
 	 * with MPI */
 
@@ -338,23 +349,32 @@ static void create_task_12_recv(unsigned k, unsigned j)
 	 * i >= k */
 	unsigned ndeps = 0;
 	starpu_tag_t tag_array[nblocks];
-	
+
+	unsigned start;
+	unsigned bound;
+
 #ifdef SINGLE_TMP1221
-	if (k > 0)
-	for (i = (k-1)+1; i < nblocks; i++)
+	bound = 0;
+	start = (k-1)+1;
 #else
-	if (k > 1)
-	for (i = (k-2)+1; i < nblocks; i++)
+	bound = 1;
+	start = (k-2)+1;
 #endif
+
+	if (k > bound)
 	{
-		if (rank == get_block_rank(i, j))
+		unsigned i;
+		for (i = start; i < nblocks; i++)
+		{
+			if (rank == get_block_rank(i, j))
 #ifdef SINGLE_TMP1221
-			tag_array[ndeps++] = TAG22(k-1, i, j);
+				tag_array[ndeps++] = TAG22(k-1, i, j);
 #else
-			tag_array[ndeps++] = TAG22(k-2, i, j);
+				tag_array[ndeps++] = TAG22(k-2, i, j);
 #endif
+		}
 	}
-	
+
 	int source = get_block_rank(k, j);
 #ifdef SINGLE_TMP1221
 	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_12_block_handle)(j);
@@ -398,15 +418,17 @@ static void callback_task_12_real(void *_arg)
 	starpu_tag_t tag = TAG12_SAVE(k, j);
 	int mpi_tag = MPI_TAG12(k, j);
 	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
-	
+
 	free(arg);
 }
 
 static void create_task_12_real(unsigned k, unsigned j)
 {
 	struct starpu_task *task = create_task(TAG12(k, j));
-	
+
+#ifdef STARPU_DEVEL
 #warning temporary fix :/
+#endif
 //	task->cl = &STARPU_PLU(cl12);
 	task->cl = &STARPU_PLU(cl21);
 
@@ -414,7 +436,7 @@ static void create_task_12_real(unsigned k, unsigned j)
 
 	unsigned diag_block_is_local = (get_block_rank(k, k) == rank);
 
-	starpu_tag_t tag_11_dep; 
+	starpu_tag_t tag_11_dep;
 
 	/* which sub-data is manipulated ? */
 	starpu_data_handle_t diag_block;
@@ -423,7 +445,7 @@ static void create_task_12_real(unsigned k, unsigned j)
 		diag_block = STARPU_PLU(get_block_handle)(k, k);
 		tag_11_dep = TAG11(k);
 	}
-	else 
+	else
 	{
 #ifdef SINGLE_TMP11
 		diag_block = STARPU_PLU(get_tmp_11_block_handle)();
@@ -433,8 +455,8 @@ static void create_task_12_real(unsigned k, unsigned j)
 		tag_11_dep = TAG11_SAVE(k);
 	}
 
-	task->handles[0] = diag_block; 
-	task->handles[1] = STARPU_PLU(get_block_handle)(k, j); 
+	task->handles[0] = diag_block;
+	task->handles[1] = STARPU_PLU(get_block_handle)(k, j);
 
 	STARPU_ASSERT(get_block_rank(k, j) == rank);
 
@@ -448,15 +470,18 @@ static void create_task_12_real(unsigned k, unsigned j)
 	task->callback_func = callback_task_12_real;
 	task->callback_arg = arg;
 
-	if (!no_prio && (j == k+1)) {
+	if (!no_prio && (j == k+1))
+	{
 		task->priority = STARPU_MAX_PRIO;
 	}
 
 	/* enforce dependencies ... */
-	if (k > 0) {
+	if (k > 0)
+	{
 		starpu_tag_declare_deps(TAG12(k, j), 2, tag_11_dep, TAG22(k-1, k, j));
 	}
-	else {
+	else
+	{
 		starpu_tag_declare_deps(TAG12(k, j), 1, tag_11_dep);
 	}
 
@@ -469,25 +494,27 @@ static void create_task_12(unsigned k, unsigned j)
 	if (get_block_rank(k, j) == rank)
 	{
 #ifdef VERBOSE_INIT
-		fprintf(stderr, "CREATE real task 12(k = %d, j = %d) on node %d\n", k, j, rank);
+		fprintf(stderr, "CREATE real task 12(k = %u, j = %u) on node %d\n", k, j, rank);
 #endif
 		create_task_12_real(k, j);
 	}
-	else {
+	else
+	{
 		/* We don't handle the task, but perhaps we have to generate MPI transfers. */
 		int rank_mask[world_size];
 		find_nodes_using_12(k, j, rank_mask);
-		
+
 		if (rank_mask[rank])
 		{
 #ifdef VERBOSE_INIT
-			fprintf(stderr, "create RECV task 12(k = %d, j = %d) on node %d\n", k, j, rank);
+			fprintf(stderr, "create RECV task 12(k = %u, j = %u) on node %d\n", k, j, rank);
 #endif
 			create_task_12_recv(k, j);
 		}
-		else {
+		else
+		{
 #ifdef VERBOSE_INIT
-			fprintf(stderr, "Node %d needs not 12(k=%d, i=%d)\n", rank, k, j);
+			fprintf(stderr, "Node %d needs not 12(k=%u, i=%u)\n", rank, k, j);
 #endif
 		}
 	}
@@ -499,8 +526,6 @@ static void create_task_12(unsigned k, unsigned j)
 
 static void create_task_21_recv(unsigned k, unsigned i)
 {
-	unsigned j;
-
 	/* The current node is not computing that task, so we receive the block
 	 * with MPI */
 
@@ -509,20 +534,28 @@ static void create_task_21_recv(unsigned k, unsigned i)
 	 * j >= k */
 	unsigned ndeps = 0;
 	starpu_tag_t tag_array[nblocks];
-	
+
+	unsigned bound;
+	unsigned start;
+
 #ifdef SINGLE_TMP1221
-	if (k > 0)
-	for (j = (k-1)+1; j < nblocks; j++)
+	bound = 0;
+	start = (k-1)+1;
 #else
-	if (k > 1)
-	for (j = (k-2)+1; j < nblocks; j++)
+	bound = 1;
+	start = (k-2)+1;
 #endif
+	if (k > bound)
 	{
-		if (rank == get_block_rank(i, j))
+		unsigned j;
+		for (j = start; j < nblocks; j++)
+		{
+			if (rank == get_block_rank(i, j))
 #ifdef SINGLE_TMP1221
-			tag_array[ndeps++] = TAG22(k-1, i, j);
+				tag_array[ndeps++] = TAG22(k-1, i, j);
 #else
-			tag_array[ndeps++] = TAG22(k-2, i, j);
+				tag_array[ndeps++] = TAG22(k-2, i, j);
 #endif
+		} /* closes the for loop; kept outside the #ifdef so both variants stay balanced */
 	}
 
@@ -570,7 +603,7 @@ static void callback_task_21_real(void *_arg)
 	starpu_tag_t tag = TAG21_SAVE(k, i);
 	int mpi_tag = MPI_TAG21(k, i);
 	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
-	
+
 	free(arg);
 }
 
@@ -578,7 +611,9 @@ static void create_task_21_real(unsigned k, unsigned i)
 {
 	struct starpu_task *task = create_task(TAG21(k, i));
 
-#warning temporary fix 
+#ifdef STARPU_DEVEL
+#warning temporary fix
+#endif
 //	task->cl = &STARPU_PLU(cl21);
 	task->cl = &STARPU_PLU(cl12);
 
@@ -586,8 +621,8 @@ static void create_task_21_real(unsigned k, unsigned i)
 
 	unsigned diag_block_is_local = (get_block_rank(k, k) == rank);
 
-	starpu_tag_t tag_11_dep; 
-	
+	starpu_tag_t tag_11_dep;
+
 	/* which sub-data is manipulated ? */
 	starpu_data_handle_t diag_block;
 	if (diag_block_is_local)
@@ -595,7 +630,7 @@ static void create_task_21_real(unsigned k, unsigned i)
 		diag_block = STARPU_PLU(get_block_handle)(k, k);
 		tag_11_dep = TAG11(k);
 	}
-	else 
+	else
 	{
 #ifdef SINGLE_TMP11
 		diag_block = STARPU_PLU(get_tmp_11_block_handle)();
@@ -605,7 +640,7 @@ static void create_task_21_real(unsigned k, unsigned i)
 		tag_11_dep = TAG11_SAVE(k);
 	}
 
-	task->handles[0] = diag_block; 
+	task->handles[0] = diag_block;
 	task->handles[1] = STARPU_PLU(get_block_handle)(i, k);
 
 	STARPU_ASSERT(task->handles[0] != STARPU_POISON_PTR);
@@ -618,15 +653,18 @@ static void create_task_21_real(unsigned k, unsigned i)
 	task->callback_func = callback_task_21_real;
 	task->callback_arg = arg;
 
-	if (!no_prio && (i == k+1)) {
+	if (!no_prio && (i == k+1))
+	{
 		task->priority = STARPU_MAX_PRIO;
 	}
 
 	/* enforce dependencies ... */
-	if (k > 0) {
+	if (k > 0)
+	{
 		starpu_tag_declare_deps(TAG21(k, i), 2, tag_11_dep, TAG22(k-1, i, k));
 	}
-	else {
+	else
+	{
 		starpu_tag_declare_deps(TAG21(k, i), 1, tag_11_dep);
 	}
 
@@ -639,25 +677,27 @@ static void create_task_21(unsigned k, unsigned i)
 	if (get_block_rank(i, k) == rank)
 	{
 #ifdef VERBOSE_INIT
-		fprintf(stderr, "CREATE real task 21(k = %d, i = %d) on node %d\n", k, i, rank);
+		fprintf(stderr, "CREATE real task 21(k = %u, i = %u) on node %d\n", k, i, rank);
 #endif
 		create_task_21_real(k, i);
 	}
-	else {
+	else
+	{
 		/* We don't handle the task, but perhaps we have to generate MPI transfers. */
 		int rank_mask[world_size];
 		find_nodes_using_21(k, i, rank_mask);
-		
+
 		if (rank_mask[rank])
 		{
 #ifdef VERBOSE_INIT
-			fprintf(stderr, "create RECV task 21(k = %d, i = %d) on node %d\n", k, i, rank);
+			fprintf(stderr, "create RECV task 21(k = %u, i = %u) on node %d\n", k, i, rank);
 #endif
 			create_task_21_recv(k, i);
 		}
-		else {
+		else
+		{
 #ifdef VERBOSE_INIT
-			fprintf(stderr, "Node %d needs not 21(k=%d, i=%d)\n", rank, k,i);
+			fprintf(stderr, "Node %d needs not 21(k=%u, i=%u)\n", rank, k,i);
 #endif
 		}
 	}
@@ -679,7 +719,7 @@ static void create_task_22_real(unsigned k, unsigned i, unsigned j)
 
 	/* which sub-data is manipulated ? */
 
-	/* produced by TAG21_SAVE(k, i) */ 
+	/* produced by TAG21_SAVE(k, i) */
 	unsigned block21_is_local = (get_block_rank(i, k) == rank);
 	starpu_tag_t tag_21_dep;
 
@@ -689,7 +729,7 @@ static void create_task_22_real(unsigned k, unsigned i, unsigned j)
 		block21 = STARPU_PLU(get_block_handle)(i, k);
 		tag_21_dep = TAG21(k, i);
 	}
-	else 
+	else
 	{
 #ifdef SINGLE_TMP1221
 		block21 = STARPU_PLU(get_tmp_21_block_handle)(i);
@@ -710,7 +750,7 @@ static void create_task_22_real(unsigned k, unsigned i, unsigned j)
 		block12 = STARPU_PLU(get_block_handle)(k, j);
 		tag_12_dep = TAG12(k, j);
 	}
-	else 
+	else
 	{
 #ifdef SINGLE_TMP1221
 		block12 = STARPU_PLU(get_tmp_12_block_handle)(j);
@@ -722,7 +762,9 @@ static void create_task_22_real(unsigned k, unsigned i, unsigned j)
 
 
 
+#ifdef STARPU_DEVEL
 #warning temporary fix :/
+#endif
 	//task->handles[0] = block21;
 	task->handles[0] = block12;
 
@@ -736,15 +778,18 @@ static void create_task_22_real(unsigned k, unsigned i, unsigned j)
 	STARPU_ASSERT(task->handles[1] != STARPU_POISON_PTR);
 	STARPU_ASSERT(task->handles[2] != STARPU_POISON_PTR);
 
-	if (!no_prio && (i == k + 1) && (j == k +1) ) {
+	if (!no_prio && (i == k + 1) && (j == k +1) )
+	{
 		task->priority = STARPU_MAX_PRIO;
 	}
 
 	/* enforce dependencies ... */
-	if (k > 0) {
+	if (k > 0)
+	{
 		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), tag_12_dep, tag_21_dep);
 	}
-	else {
+	else
+	{
 		starpu_tag_declare_deps(TAG22(k, i, j), 2, tag_12_dep, tag_21_dep);
 	}
 
@@ -759,7 +804,8 @@ static void create_task_22(unsigned k, unsigned i, unsigned j)
 	//	fprintf(stderr, "CREATE real task 22(k = %d, i = %d, j = %d) on node %d\n", k, i, j, rank);
 		create_task_22_real(k, i, j);
 	}
-//	else {
+//	else
+//	{
 //		fprintf(stderr, "Node %d needs not 22(k=%d, i=%d, j = %d)\n", rank, k,i,j);
 //	}
 }
@@ -787,7 +833,7 @@ static void wait_termination(void)
 			starpu_data_handle_t diag_block = STARPU_PLU(get_block_handle)(k, k);
 			wait_tag_and_fetch_handle(TAG11_SAVE(k), diag_block);
 		}
-		
+
 
 		for (i = k + 1; i < nblocks; i++)
 		{
@@ -812,11 +858,11 @@ static void wait_termination(void)
 				wait_tag_and_fetch_handle(TAG12_SAVE(k, j), block12);
 			}
 		}
-	}	
+	}
 }
 
 /*
- *	code to bootstrap the factorization 
+ *	code to bootstrap the factorization
  */
 
 double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
@@ -833,6 +879,8 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
 
 	for (k = 0; k < nblocks; k++)
 	{
+		starpu_iteration_push(k);
+
 		create_task_11(k);
 
 		for (i = k+1; i<nblocks; i++)
@@ -848,6 +896,7 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
 				create_task_22(k, i, j);
 			}
 		}
+		starpu_iteration_pop();
 	}
 
 	int barrier_ret = starpu_mpi_barrier(MPI_COMM_WORLD);
@@ -859,12 +908,12 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
 	starpu_tag_notify_from_apps(STARPU_TAG_INIT);
 
 	wait_termination();
-	
+
 	end = starpu_timing_now();
 
 	double timing = end - start;
-	
+
 //	fprintf(stderr, "RANK %d -> took %f ms\n", rank, timing/1000);
-	
+
 	return timing;
 }

+ 3 - 2
nmad/examples/mpi_lu/pxlu.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2014  Université de Bordeaux
- * Copyright (C) 2010, 2012, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012, 2014, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -31,7 +31,8 @@
 //#define SINGLE_TMP11	1
 //#define SINGLE_TMP1221	1
 
-struct debug_info {
+struct debug_info
+{
 	unsigned i;
 	unsigned j;
 	unsigned k;

+ 184 - 0
nmad/examples/mpi_lu/pxlu_implicit.c

@@ -0,0 +1,184 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2011, 2013-2015, 2017  Université de Bordeaux
+ * Copyright (C) 2010, 2012, 2013, 2017  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "pxlu.h"
+#include "pxlu_kernels.h"
+#include <sys/time.h>
+
+//#define VERBOSE_INIT	1
+
+//#define DEBUG	1
+
+/* When non-zero, the priority expressions below select STARPU_MIN_PRIO for
+ * every task instead of promoting some of them to STARPU_MAX_PRIO. */
+static unsigned no_prio = 0;
+
+/* Copies of the arguments given to plu_main(); they hold these placeholder
+ * values until plu_main() overwrites them. */
+static unsigned nblocks = 0;
+static int rank = -1;
+static int world_size = -1;
+
+/* Triple of block indices (i, j, k); not referenced by the rest of this file. */
+struct callback_arg
+{
+	unsigned i;
+	unsigned j;
+	unsigned k;
+};
+
+/*
+ *	Task 11 (diagonal factorization)
+ *
+ *	Inserts one cl11 task on MPI_COMM_WORLD that accesses block (k, k) in
+ *	read-write mode. The index k is packed three times as by-value
+ *	arguments. The task gets STARPU_MIN_PRIO when no_prio is set and
+ *	STARPU_MAX_PRIO otherwise.
+ */
+
+static void create_task_11(unsigned k)
+{
+	starpu_data_handle_t diag_block = STARPU_PLU(get_block_handle)(k, k);
+
+	starpu_mpi_task_insert(MPI_COMM_WORLD, &STARPU_PLU(cl11),
+			       STARPU_VALUE, &k, sizeof(k),
+			       STARPU_VALUE, &k, sizeof(k),
+			       STARPU_VALUE, &k, sizeof(k),
+			       STARPU_RW, diag_block,
+			       STARPU_PRIORITY, no_prio ?
+			       STARPU_MIN_PRIO : STARPU_MAX_PRIO,
+			       0);
+}
+
+/*
+ *	Task 12 (Update lower left (TRSM))
+ *
+ *	Inserts one task on MPI_COMM_WORLD that reads block (k, k) and
+ *	accesses block (k, j) in read-write mode. The by-value arguments are
+ *	j, j and k, in that order. The task gets STARPU_MAX_PRIO only when
+ *	no_prio is unset and j == k+1; otherwise it gets STARPU_MIN_PRIO.
+ */
+
+static void create_task_12(unsigned k, unsigned j)
+{
+	starpu_data_handle_t diag_block = STARPU_PLU(get_block_handle)(k, k);
+	starpu_data_handle_t block_kj = STARPU_PLU(get_block_handle)(k, j);
+	unsigned urgent = !no_prio && (j == k+1);
+
+#ifdef STARPU_DEVEL
+#warning temporary fix
+#endif
+	/* cl21 is used here instead of cl12 (flagged above as a temporary fix) */
+	starpu_mpi_task_insert(MPI_COMM_WORLD,
+			       &STARPU_PLU(cl21),
+			       STARPU_VALUE, &j, sizeof(j),
+			       STARPU_VALUE, &j, sizeof(j),
+			       STARPU_VALUE, &k, sizeof(k),
+			       STARPU_R, diag_block,
+			       STARPU_RW, block_kj,
+			       STARPU_PRIORITY, urgent ?
+			       STARPU_MAX_PRIO : STARPU_MIN_PRIO,
+			       0);
+}
+
+/*
+ *	Task 21 (Update upper right (TRSM))
+ *
+ *	Inserts one task on MPI_COMM_WORLD that reads block (k, k) and
+ *	accesses block (i, k) in read-write mode. The by-value arguments are
+ *	i, i and k, in that order. The task gets STARPU_MAX_PRIO only when
+ *	no_prio is unset and i == k+1; otherwise it gets STARPU_MIN_PRIO.
+ */
+
+static void create_task_21(unsigned k, unsigned i)
+{
+	starpu_data_handle_t diag_block = STARPU_PLU(get_block_handle)(k, k);
+	starpu_data_handle_t block_ik = STARPU_PLU(get_block_handle)(i, k);
+	unsigned urgent = !no_prio && (i == k+1);
+
+#ifdef STARPU_DEVEL
+#warning temporary fix
+#endif
+	/* cl12 is used here instead of cl21 (flagged above as a temporary fix) */
+	starpu_mpi_task_insert(MPI_COMM_WORLD,
+			       &STARPU_PLU(cl12),
+			       STARPU_VALUE, &i, sizeof(i),
+			       STARPU_VALUE, &i, sizeof(i),
+			       STARPU_VALUE, &k, sizeof(k),
+			       STARPU_R, diag_block,
+			       STARPU_RW, block_ik,
+			       STARPU_PRIORITY, urgent ?
+			       STARPU_MAX_PRIO : STARPU_MIN_PRIO,
+			       0);
+}
+
+/*
+ *	Task 22 (GEMM)
+ *
+ *	Inserts one cl22 task on MPI_COMM_WORLD that reads blocks (k, j) and
+ *	(i, k) and accesses block (i, j) in read-write mode. The by-value
+ *	arguments are i, j and k. The task gets STARPU_MAX_PRIO only when
+ *	no_prio is unset and both i and j equal k+1; otherwise it gets
+ *	STARPU_MIN_PRIO.
+ */
+
+static void create_task_22(unsigned k, unsigned i, unsigned j)
+{
+	starpu_data_handle_t block_kj = STARPU_PLU(get_block_handle)(k, j);
+	starpu_data_handle_t block_ik = STARPU_PLU(get_block_handle)(i, k);
+	starpu_data_handle_t block_ij = STARPU_PLU(get_block_handle)(i, j);
+	unsigned urgent = !no_prio && (i == k + 1) && (j == k +1);
+
+	starpu_mpi_task_insert(MPI_COMM_WORLD, &STARPU_PLU(cl22),
+			       STARPU_VALUE, &i, sizeof(i),
+			       STARPU_VALUE, &j, sizeof(j),
+			       STARPU_VALUE, &k, sizeof(k),
+			       STARPU_R, block_kj,
+			       STARPU_R, block_ik,
+			       STARPU_RW, block_ij,
+			       STARPU_PRIORITY, urgent ?
+			       STARPU_MAX_PRIO : STARPU_MIN_PRIO,
+			       0);
+}
+
+/*
+ *	code to bootstrap the factorization
+ *
+ *	Stores _nblocks, _rank and _world_size in the file-scope variables,
+ *	submits all the tasks of the blocked factorization through the
+ *	create_task_*() helpers, waits for their completion and returns the
+ *	time elapsed between the two starpu_timing_now() calls (presumably in
+ *	microseconds, since the disabled trace at the end divides by 1000 to
+ *	print ms -- TODO confirm).
+ */
+
+double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
+{
+	double start;
+	double end;
+
+	nblocks = _nblocks;
+	rank = _rank;
+	world_size = _world_size;
+
+	/* create all the DAG nodes */
+	unsigned i,j,k;
+
+	/* barrier placed right before the timer is started */
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+
+	start = starpu_timing_now();
+
+	for (k = 0; k < nblocks; k++)
+	{
+		/* everything submitted until the matching pop belongs to iteration k */
+		starpu_iteration_push(k);
+
+		create_task_11(k);
+
+		/* tasks 12 and 21 of step k, for every index above k */
+		for (i = k+1; i<nblocks; i++)
+		{
+			create_task_12(k, i);
+			create_task_21(k, i);
+		}
+
+		/* no task submitted after this point in step k takes block (k,k):
+		 * flush it from the MPI cache, and call starpu_data_wont_use() on
+		 * the node whose rank get_block_rank() reports for it */
+		starpu_mpi_cache_flush(MPI_COMM_WORLD, STARPU_PLU(get_block_handle)(k,k));
+		if (get_block_rank(k, k) == _rank)
+			starpu_data_wont_use(STARPU_PLU(get_block_handle)(k,k));
+
+		/* tasks 22 of step k, for every (i, j) pair above k */
+		for (i = k+1; i<nblocks; i++)
+		{
+			for (j = k+1; j<nblocks; j++)
+			{
+				create_task_22(k, i, j);
+			}
+		}
+
+		/* same flush / wont_use treatment for blocks (k,i) and (i,k),
+		 * once all the tasks 22 of step k have been submitted */
+		for (i = k+1; i<nblocks; i++)
+		{
+			starpu_mpi_cache_flush(MPI_COMM_WORLD, STARPU_PLU(get_block_handle)(k,i));
+			if (get_block_rank(k, i) == _rank)
+				starpu_data_wont_use(STARPU_PLU(get_block_handle)(k,i));
+			starpu_mpi_cache_flush(MPI_COMM_WORLD, STARPU_PLU(get_block_handle)(i,k));
+			if (get_block_rank(i, k) == _rank)
+				starpu_data_wont_use(STARPU_PLU(get_block_handle)(i,k));
+		}
+		starpu_iteration_pop();
+	}
+
+	/* wait for the submitted tasks, then run a second barrier before the
+	 * timer is read */
+	starpu_task_wait_for_all();
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+
+	end = starpu_timing_now();
+
+	double timing = end - start;
+	
+//	fprintf(stderr, "RANK %d -> took %f ms\n", rank, timing/1000);
+	
+	return timing;
+}

+ 37 - 25
nmad/examples/mpi_lu/pxlu_kernels.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2012  Université de Bordeaux
- * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -44,8 +44,8 @@ static inline void STARPU_PLU(common_u22)(void *descr[],
 	struct debug_info *info = _args;
 
 	int rank;
-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-	fprintf(stderr, "KERNEL 22 %d - k = %d i = %d j = %d\n", rank, info->k, info->i, info->j);
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+	fprintf(stderr, "KERNEL 22 %d - k = %u i = %u j = %u\n", rank, info->k, info->i, info->j);
 #endif
 
 #ifdef STARPU_USE_CUDA
@@ -53,7 +53,8 @@ static inline void STARPU_PLU(common_u22)(void *descr[],
 	cudaError_t cures;
 #endif
 
-	switch (s) {
+	switch (s)
+	{
 		case 0:
 			CPU_GEMM("N", "N", dy, dx, dz,
 				(TYPE)-1.0, right, ld21, left, ld12,
@@ -80,7 +81,7 @@ static inline void STARPU_PLU(common_u22)(void *descr[],
 			break;
 	}
 #ifdef VERBOSE_KERNELS
-	fprintf(stderr, "KERNEL 22 %d - k = %d i = %d j = %d done\n", rank, info->k, info->i, info->j);
+	fprintf(stderr, "KERNEL 22 %d - k = %u i = %u j = %u done\n", rank, info->k, info->i, info->j);
 #endif
 }
 
@@ -96,7 +97,8 @@ static void STARPU_PLU(cublas_u22)(void *descr[], void *_args)
 }
 #endif// STARPU_USE_CUDA
 
-static struct starpu_perfmodel STARPU_PLU(model_22) = {
+static struct starpu_perfmodel STARPU_PLU(model_22) =
+{
 	.type = STARPU_HISTORY_BASED,
 #ifdef STARPU_ATLAS
 	.symbol = STARPU_PLU_STR(lu_model_22_atlas)
@@ -107,7 +109,8 @@ static struct starpu_perfmodel STARPU_PLU(model_22) = {
 #endif
 };
 
-struct starpu_codelet STARPU_PLU(cl22) = {
+struct starpu_codelet STARPU_PLU(cl22) =
+{
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {STARPU_PLU(cpu_u22)},
 #ifdef STARPU_USE_CUDA
@@ -142,10 +145,10 @@ static inline void STARPU_PLU(common_u12)(void *descr[],
 	struct debug_info *info = _args;
 
 	int rank;
-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
 #warning fixed debugging according to other tweak
-	//fprintf(stderr, "KERNEL 12 %d - k = %d i %d\n", rank, info->k, info->i);
-	fprintf(stderr, "KERNEL 21 %d - k = %d i %d\n", rank, info->k, info->j);
+	//fprintf(stderr, "KERNEL 12 %d - k = %u i %u\n", rank, info->k, info->i);
+	fprintf(stderr, "KERNEL 21 %d - k = %u i %u\n", rank, info->k, info->j);
 
 	//fprintf(stderr, "INPUT 12 U11\n");
 	fprintf(stderr, "INPUT 21 U11\n");
@@ -161,7 +164,8 @@ static inline void STARPU_PLU(common_u12)(void *descr[],
 #endif
 
 	/* solve L11 U12 = A12 (find U12) */
-	switch (s) {
+	switch (s)
+	{
 		case 0:
 			CPU_TRSM("L", "L", "N", "N", nx12, ny12,
 					(TYPE)1.0, sub11, ld11, sub12, ld12);
@@ -204,7 +208,8 @@ static void STARPU_PLU(cublas_u12)(void *descr[], void *_args)
 }
 #endif // STARPU_USE_CUDA
 
-static struct starpu_perfmodel STARPU_PLU(model_12) = {
+static struct starpu_perfmodel STARPU_PLU(model_12) =
+{
 	.type = STARPU_HISTORY_BASED,
 #ifdef STARPU_ATLAS
 	.symbol = STARPU_PLU_STR(lu_model_12_atlas)
@@ -215,7 +220,8 @@ static struct starpu_perfmodel STARPU_PLU(model_12) = {
 #endif
 };
 
-struct starpu_codelet STARPU_PLU(cl12) = {
+struct starpu_codelet STARPU_PLU(cl12) =
+{
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {STARPU_PLU(cpu_u12)},
 #ifdef STARPU_USE_CUDA
@@ -250,10 +256,10 @@ static inline void STARPU_PLU(common_u21)(void *descr[],
 	struct debug_info *info = _args;
 
 	int rank;
-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
 #warning fixed debugging according to other tweak
-	//fprintf(stderr, "KERNEL 21 %d (k = %d, i = %d)\n", rank, info->k, info->i);
-	fprintf(stderr, "KERNEL 12 %d (k = %d, j = %d)\n", rank, info->k, info->j);
+	//fprintf(stderr, "KERNEL 21 %d (k = %u, i = %u)\n", rank, info->k, info->i);
+	fprintf(stderr, "KERNEL 12 %d (k = %u, j = %u)\n", rank, info->k, info->j);
 
 	//fprintf(stderr, "INPUT 21 U11\n");
 	fprintf(stderr, "INPUT 12 U11\n");
@@ -268,7 +274,8 @@ static inline void STARPU_PLU(common_u21)(void *descr[],
 #endif
 
 
-	switch (s) {
+	switch (s)
+	{
 		case 0:
 			CPU_TRSM("R", "U", "N", "U", nx21, ny21,
 					(TYPE)1.0, sub11, ld11, sub21, ld21);
@@ -313,7 +320,8 @@ static void STARPU_PLU(cublas_u21)(void *descr[], void *_args)
 }
 #endif
 
-static struct starpu_perfmodel STARPU_PLU(model_21) = {
+static struct starpu_perfmodel STARPU_PLU(model_21) =
+{
 	.type = STARPU_HISTORY_BASED,
 #ifdef STARPU_ATLAS
 	.symbol = STARPU_PLU_STR(lu_model_21_atlas)
@@ -324,7 +332,8 @@ static struct starpu_perfmodel STARPU_PLU(model_21) = {
 #endif
 };
 
-struct starpu_codelet STARPU_PLU(cl21) = {
+struct starpu_codelet STARPU_PLU(cl21) =
+{
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {STARPU_PLU(cpu_u21)},
 #ifdef STARPU_USE_CUDA
@@ -356,11 +365,12 @@ static inline void STARPU_PLU(common_u11)(void *descr[],
 	struct debug_info *info = _args;
 
 	int rank;
-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-	fprintf(stderr, "KERNEL 11 %d - k = %d\n", rank, info->k);
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+	fprintf(stderr, "KERNEL 11 %d - k = %u\n", rank, info->k);
 #endif
 
-	switch (s) {
+	switch (s)
+	{
 		case 0:
 			for (z = 0; z < nx; z++)
 			{
@@ -403,7 +413,7 @@ static inline void STARPU_PLU(common_u11)(void *descr[],
 			break;
 	}
 #ifdef VERBOSE_KERNELS
-	fprintf(stderr, "KERNEL 11 %d - k = %d\n", rank, info->k);
+	fprintf(stderr, "KERNEL 11 %d - k = %u\n", rank, info->k);
 #endif
 }
 
@@ -419,7 +429,8 @@ static void STARPU_PLU(cublas_u11)(void *descr[], void *_args)
 }
 #endif// STARPU_USE_CUDA
 
-static struct starpu_perfmodel STARPU_PLU(model_11) = {
+static struct starpu_perfmodel STARPU_PLU(model_11) =
+{
 	.type = STARPU_HISTORY_BASED,
 #ifdef STARPU_ATLAS
 	.symbol = STARPU_PLU_STR(lu_model_11_atlas)
@@ -430,7 +441,8 @@ static struct starpu_perfmodel STARPU_PLU(model_11) = {
 #endif
 };
 
-struct starpu_codelet STARPU_PLU(cl11) = {
+struct starpu_codelet STARPU_PLU(cl11) =
+{
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {STARPU_PLU(cpu_u11)},
 #ifdef STARPU_USE_CUDA

+ 1 - 1
nmad/examples/mpi_lu/pxlu_kernels.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2012, 2014  Université de Bordeaux
- * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
nmad/examples/mpi_lu/slu_kernels.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 108 - 0
nmad/examples/native_fortran/nf_basic_ring.f90

@@ -0,0 +1,108 @@
+! StarPU --- Runtime system for heterogeneous multicore architectures.
+!
+! Copyright (C) 2016  Inria
+!
+! StarPU is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at
+! your option) any later version.
+!
+! StarPU is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+!
+! See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+! Token-ring example: for nloops rounds, every rank receives the token from
+! the previous rank, increments it and sends it on to the next rank.
+! Exits with status 77 when StarPU reports -19 at init, when there is no CPU
+! worker, or when fewer than 2 MPI processes are available.
+program nf_basic_ring
+        use iso_c_binding       ! C interfacing module
+        use fstarpu_mod         ! StarPU interfacing module
+        use fstarpu_mpi_mod     ! StarPU-MPI interfacing module
+        implicit none
+
+        integer(c_int) :: ncpu
+        integer(c_int) :: ret
+        integer(c_int) :: rank,sz
+        integer(c_int),target :: token = 42
+        integer(c_int) :: nloops = 32
+        integer(c_int) :: loop
+        integer(c_int) :: tag
+        integer(c_int) :: world
+        integer(c_int) :: src,dst
+        type(c_ptr) :: token_dh, st
+
+        ret = fstarpu_init(C_NULL_PTR)
+        if (ret == -19) then
+                ! presumably -ENODEV (no usable device) -- confirm against StarPU docs
+                stop 77
+        else if (ret /= 0) then
+                stop 1
+        end if
+
+        ret = fstarpu_mpi_init(1)
+        print *,"fstarpu_mpi_init status:", ret
+        if (ret /= 0) then
+                stop 1
+        end if
+
+        ! stop there if no CPU worker available
+        ncpu = fstarpu_cpu_worker_get_count()
+        if (ncpu == 0) then
+                call fstarpu_shutdown()
+                ret = fstarpu_mpi_shutdown()
+                stop 77
+        end if
+
+        world = fstarpu_mpi_world_comm()
+        rank = fstarpu_mpi_world_rank()
+        sz = fstarpu_mpi_world_size()
+        write(*,*) "rank=", rank,"size=",sz,"world=",world
+        ! a ring needs at least two processes
+        if (sz < 2) then
+                call fstarpu_shutdown()
+                ret = fstarpu_mpi_shutdown()
+                stop 77
+        end if
+
+        call fstarpu_variable_data_register(token_dh, 0, c_loc(token), c_sizeof(token))
+
+        st = fstarpu_mpi_status_alloc()
+        do loop=1,nloops
+                ! receive tag is loop*sz+rank; the matching send uses tag+1, so the
+                ! send from rank sz-1 lines up with rank 0's receive of the next loop
+                tag = loop*sz+rank
+                token = 0
+                if (loop == 1.and.rank == 0) then
+                        ! rank 0 starts the ring: nothing to receive on the first loop
+                        write(*,*) "rank=", rank,"token=",token
+                else
+                        src = modulo((rank+sz-1),sz)
+                        write(*,*) "rank=", rank,"recv--> src =", src, "tag =", tag
+                        ret = fstarpu_mpi_recv(token_dh, src, tag, world, st)
+                        if (ret /= 0) then
+                                write(*,*) "fstarpu_mpi_recv failed"
+                                stop 1
+                        end if
+                        write(*,*) "rank=", rank,"recv<--","token=",token
+                        token = token+1
+                end if
+                if (loop == nloops.and.rank == (sz-1)) then
+                        ! last rank of the last loop ends the ring and reports the token
+                        call fstarpu_data_acquire(token_dh, FSTARPU_R)
+                        write(*,*) "finished: rank=", rank,"token=",token
+                        call fstarpu_data_release(token_dh)
+                else
+                        dst = modulo((rank+1),sz)
+                        write(*,*) "rank=", rank,"send--> dst =", dst, "tag =", tag+1
+                        ret = fstarpu_mpi_send(token_dh, dst, tag+1, world)
+                        if (ret /= 0) then
+                                ! report the call that actually failed (was "recv")
+                                write(*,*) "fstarpu_mpi_send failed"
+                                stop 1
+                        end if
+                        write(*,*) "rank=", rank,"send<--"
+                end if
+        end do
+        call fstarpu_mpi_status_free(st)
+        call fstarpu_data_unregister(token_dh)
+        call fstarpu_shutdown()
+
+        ret = fstarpu_mpi_shutdown()
+        print *,"fstarpu_mpi_shutdown status:", ret
+        if (ret /= 0) then
+                stop 1
+        end if
+end program nf_basic_ring
+

+ 236 - 0
nmad/examples/native_fortran/nf_mm.f90

@@ -0,0 +1,236 @@
+! StarPU --- Runtime system for heterogeneous multicore architectures.
+!
+! Copyright (C) 2016  Inria
+!
+! StarPU is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at
+! your option) any later version.
+!
+! StarPU is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+!
+! See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+! Distributed blocked matrix product C = C + A*B using StarPU-MPI.
+! Rank 0 owns and initialises the N x N matrices; A is split into NB row
+! bands, B into NB column bands and C into NB x NB tiles. The C tiles are
+! migrated across ranks, one task per tile is inserted, the tiles are
+! migrated back to rank 0 and the result is checked there.
+program nf_mm
+        use iso_c_binding       ! C interfacing module
+        use fstarpu_mod         ! StarPU interfacing module
+        use fstarpu_mpi_mod     ! StarPU-MPI interfacing module
+        use nf_mm_cl
+        implicit none
+
+        logical, parameter :: verbose = .false.
+        integer(c_int) :: comm_rank, comm_size, comm_world
+        integer(c_int) :: N = 16, BS = 4, NB
+        real(kind=c_double),allocatable,target :: A(:,:), B(:,:), C(:,:)
+        type(c_ptr),allocatable :: dh_A(:), dh_B(:), dh_C(:,:)
+        type(c_ptr) :: cl_mm
+        integer(c_int) :: ncpu
+        integer(c_int) :: ret
+        integer(c_int) :: row, col
+        integer(c_int) :: b_row, b_col
+        integer(c_int) :: mr, tag, rank
+
+        ret = fstarpu_init(C_NULL_PTR)
+        if (ret == -19) then
+                ! presumably -ENODEV (no usable device) -- confirm against StarPU docs
+                stop 77
+        else if (ret /= 0) then
+                stop 1
+        end if
+
+        ret = fstarpu_mpi_init(1)
+        print *,"fstarpu_mpi_init status:", ret
+        if (ret /= 0) then
+                stop 1
+        end if
+
+        ! stop there if no CPU worker available
+        ncpu = fstarpu_cpu_worker_get_count()
+        if (ncpu == 0) then
+                call fstarpu_shutdown()
+                ! StarPU-MPI was initialised above, so shut it down too,
+                ! as the other early-exit path below does
+                ret = fstarpu_mpi_shutdown()
+                stop 77
+        end if
+
+        comm_world = fstarpu_mpi_world_comm()
+        comm_size = fstarpu_mpi_world_size()
+        comm_rank = fstarpu_mpi_world_rank()
+
+        if (comm_size < 2) then
+                call fstarpu_shutdown()
+                ret = fstarpu_mpi_shutdown()
+                stop 77
+        end if
+
+        ! TODO: process app's argc/argv
+        ! number of blocks per dimension
+        NB = N/BS
+
+        ! allocate and initialize codelet
+        cl_mm = fstarpu_codelet_allocate()
+        call fstarpu_codelet_set_name(cl_mm, c_char_"nf_mm_cl"//c_null_char)
+        call fstarpu_codelet_add_cpu_func(cl_mm, C_FUNLOC(cl_cpu_mult))
+        call fstarpu_codelet_add_buffer(cl_mm, FSTARPU_R)
+        call fstarpu_codelet_add_buffer(cl_mm, FSTARPU_R)
+        call fstarpu_codelet_add_buffer(cl_mm, FSTARPU_RW)
+
+        ! allocate matrices
+        if (comm_rank == 0) then
+                allocate(A(N,N))
+                allocate(B(N,N))
+                allocate(C(N,N))
+        end if
+
+        ! init matrices: A = 2*identity, B(row,col) = row*N+col, C = 0
+        if (comm_rank == 0) then
+                do col=1,N
+                do row=1,N
+                if (row == col) then
+                        A(row,col) = 2
+                else
+                        A(row,col) = 0
+                end if
+                B(row,col) = row*N+col
+                C(row,col) = 0
+                end do
+                end do
+
+                if (verbose) then
+                        print *,"A"
+                        call mat_disp(A)
+                        print *,"B"
+                        call mat_disp(B)
+                        print *,"C"
+                        call mat_disp(C)
+                end if
+        end if
+
+        ! allocate data handles
+        allocate(dh_A(NB))
+        allocate(dh_B(NB))
+        allocate(dh_C(NB,NB))
+
+        ! register matrices
+        if (comm_rank == 0) then
+                mr = 0 ! TODO: use STARPU_MAIN_RAM constant
+        else
+                mr = -1
+        end if
+        ! every handle gets its own MPI tag, all initially owned by rank 0
+        tag = 0
+
+        ! A: one handle per band of BS rows
+        do b_row=1,NB
+                if (comm_rank == 0) then
+                        call fstarpu_matrix_data_register(dh_A(b_row), mr, &
+                                c_loc( A(1+(b_row-1)*BS,1) ), N, BS, N, c_sizeof(A(1,1)))
+                else
+                        call fstarpu_matrix_data_register(dh_A(b_row), mr, &
+                                c_null_ptr, N, BS, N, c_sizeof(A(1,1)))
+                end if
+                call fstarpu_mpi_data_register(dh_A(b_row), tag, 0)
+                tag = tag+1
+        end do
+
+        ! B: one handle per band of BS columns
+        do b_col=1,NB
+                if (comm_rank == 0) then
+                        call fstarpu_matrix_data_register(dh_B(b_col), mr, &
+                                c_loc( B(1,1+(b_col-1)*BS) ), N, N, BS, c_sizeof(B(1,1)))
+                else
+                        call fstarpu_matrix_data_register(dh_B(b_col), mr, &
+                                c_null_ptr, N, N, BS, c_sizeof(B(1,1)))
+                end if
+                call fstarpu_mpi_data_register(dh_B(b_col), tag, 0)
+                tag = tag+1
+        end do
+
+        ! C: one handle per BS x BS tile
+        do b_col=1,NB
+        do b_row=1,NB
+                if (comm_rank == 0) then
+                        call fstarpu_matrix_data_register(dh_C(b_row,b_col), mr, &
+                                c_loc( C(1+(b_row-1)*BS,1+(b_col-1)*BS) ), N, BS, BS, c_sizeof(C(1,1)))
+                else
+                        call fstarpu_matrix_data_register(dh_C(b_row,b_col), mr, &
+                                c_null_ptr, N, BS, BS, c_sizeof(C(1,1)))
+                end if
+                call fstarpu_mpi_data_register(dh_C(b_row,b_col), tag, 0)
+                tag = tag+1
+        end do
+        end do
+
+        ! distribute matrix C
+        do b_col=1,NB
+        do b_row=1,NB
+        rank = modulo(b_row+b_col, comm_size)
+        call fstarpu_mpi_data_migrate(comm_world, dh_c(b_row,b_col), rank)
+        end do
+        end do
+
+        ! one multiplication task per tile of C
+        do b_col=1,NB
+        do b_row=1,NB
+                ret = fstarpu_mpi_task_insert(comm_world, (/ cl_mm, &
+                        FSTARPU_R,  dh_A(b_row), &
+                        FSTARPU_R,  dh_B(b_col), &
+                        FSTARPU_RW, dh_C(b_row,b_col), &
+                        C_NULL_PTR /))
+        end do
+        end do
+
+        call fstarpu_task_wait_for_all()
+
+        ! undistribute matrix C
+        do b_col=1,NB
+        do b_row=1,NB
+        call fstarpu_mpi_data_migrate(comm_world, dh_c(b_row,b_col), 0)
+        end do
+        end do
+
+        ! unregister matrices
+        do b_row=1,NB
+                call fstarpu_data_unregister(dh_A(b_row))
+        end do
+
+        do b_col=1,NB
+                call fstarpu_data_unregister(dh_B(b_col))
+        end do
+
+        do b_col=1,NB
+        do b_row=1,NB
+                call fstarpu_data_unregister(dh_C(b_row,b_col))
+        end do
+        end do
+
+        ! check result: with A = 2*identity, C must equal 2*B
+        if (comm_rank == 0) then
+                if (verbose) then
+                        print *,"final C"
+                        call mat_disp(C)
+                end if
+
+                do col=1,N
+                do row=1,N
+                if (abs(C(row,col) - 2*(row*N+col)) > 1.0) then
+                        print *, "check failed"
+                        stop 1
+                end if
+                end do
+                end do
+        end if
+
+        ! free handles
+        deallocate(dh_A)
+        deallocate(dh_B)
+        deallocate(dh_C)
+
+        ! free matrices
+        if (comm_rank == 0) then
+                deallocate(A)
+                deallocate(B)
+                deallocate(C)
+        end if
+        call fstarpu_codelet_free(cl_mm)
+        call fstarpu_shutdown()
+
+        ret = fstarpu_mpi_shutdown()
+        print *,"fstarpu_mpi_shutdown status:", ret
+        if (ret /= 0) then
+                stop 1
+        end if
+end program nf_mm

+ 90 - 0
nmad/examples/native_fortran/nf_mm_cl.f90

@@ -0,0 +1,90 @@
+! StarPU --- Runtime system for heterogeneous multicore architectures.
+!
+! Copyright (C) 2016  Inria
+!
+! StarPU is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at
+! your option) any later version.
+!
+! StarPU is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+!
+! See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+! Helper module for the nf_mm example: a matrix pretty-printer and the CPU
+! implementation of the matrix-multiply codelet.
+module nf_mm_cl
+contains
+! Print matrix m on standard output, one matrix row per line, framed by '|'.
+! Kept in this module so that both the main program and the codelet
+! routines can use it for debugging.
+subroutine mat_disp (m)
+        use iso_c_binding       ! C interfacing module
+        implicit none
+        real(kind=c_double) :: m(:,:)
+        integer :: row, col
+
+        do row = lbound(m,1), ubound(m,1)
+                write(*, fmt="(A2) ",advance="no") "| "
+                do col = lbound(m,2), ubound(m,2)
+                        write(*, fmt="(F6.1,A1) ", advance="no") m(row,col)," "
+                end do
+                write(*,*) "|"
+        end do
+        write(*,*)
+
+end subroutine
+
+! CPU codelet body: accumulates A*B into C, where A, B and C are the matrix
+! buffers 0, 1 and 2 of the task. Stops the program with status 1 when the
+! dimensions reported by StarPU for the three buffers do not agree.
+recursive subroutine cl_cpu_mult (buffers, cl_args) bind(C)
+        use iso_c_binding       ! C interfacing module
+        use fstarpu_mod         ! StarPU interfacing module
+        implicit none
+
+        type(c_ptr), value, intent(in) :: buffers, cl_args ! cl_args is unused
+        real(kind=c_double),pointer :: A(:,:), B(:,:), C(:,:)
+        integer :: ld_A,nx_A,ny_A
+        integer :: ld_B,nx_B,ny_B
+        integer :: ld_C,nx_C,ny_C
+        integer :: row, col, inner
+
+        ! buffer 0: A
+        ld_A = fstarpu_matrix_get_ld(buffers, 0)
+        nx_A = fstarpu_matrix_get_nx(buffers, 0)
+        ny_A = fstarpu_matrix_get_ny(buffers, 0)
+
+        ! buffer 1: B
+        ld_B = fstarpu_matrix_get_ld(buffers, 1)
+        nx_B = fstarpu_matrix_get_nx(buffers, 1)
+        ny_B = fstarpu_matrix_get_ny(buffers, 1)
+
+        ! buffer 2: C
+        ld_C = fstarpu_matrix_get_ld(buffers, 2)
+        nx_C = fstarpu_matrix_get_nx(buffers, 2)
+        ny_C = fstarpu_matrix_get_ny(buffers, 2)
+
+        ! dimension sanity checks, reported in a fixed order
+        if (ny_C /= ny_B) then
+                write(*,*) "C -- B column mismatch"
+                stop 1
+        end if
+
+        if (nx_C /= nx_A) then
+                write(*,*) "C -- A row mismatch"
+                stop 1
+        end if
+
+        if (ny_A /= nx_B) then
+                write(*,*) "A -- B col/row mismatch"
+                stop 1
+        end if
+
+        ! view the raw buffers as Fortran arrays whose first extent is the
+        ! leading dimension reported by StarPU
+        call c_f_pointer(fstarpu_matrix_get_ptr(buffers, 0), A, shape=[ld_A,ny_A])
+        call c_f_pointer(fstarpu_matrix_get_ptr(buffers, 1), B, shape=[ld_B,ny_B])
+        call c_f_pointer(fstarpu_matrix_get_ptr(buffers, 2), C, shape=[ld_C,ny_C])
+
+        ! plain triple loop; the summation index is innermost
+        do col = 1, ny_C
+                do row = 1, nx_C
+                        do inner = 1, nx_B
+                                C(row,col) = C(row,col) + A(row,inner) * B(inner,col)
+                        end do
+                end do
+        end do
+
+end subroutine cl_cpu_mult
+end module nf_mm_cl

+ 1 - 1
nmad/examples/perf.sh

@@ -3,7 +3,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # 
 # Copyright (C) 2010  Université de Bordeaux
-# Copyright (C) 2010  Centre National de la Recherche Scientifique
+# Copyright (C) 2010  CNRS
 # 
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by

+ 56 - 27
nmad/examples/stencil/stencil5.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2015              Université Bordeaux
- * Copyright (C) 2011, 2012, 2013, 2014, 2015  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2013, 2015-2017              Université Bordeaux
+ * Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,7 +20,7 @@
 
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
 #define FPRINTF_MPI(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) { \
-    						int _disp_rank; MPI_Comm_rank(MPI_COMM_WORLD, &_disp_rank);       \
+    						int _disp_rank; starpu_mpi_comm_rank(MPI_COMM_WORLD, &_disp_rank);       \
                                                 fprintf(ofile, "[%d][starpu_mpi][%s] " fmt , _disp_rank, __starpu_func__ ,## __VA_ARGS__); \
                                                 fflush(ofile); }} while(0);
 
@@ -37,15 +37,35 @@ void stencil5_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 //	fprintf(stdout, "VALUES: %2.2f %2.2f %2.2f %2.2f %2.2f\n", *xy, *xm1y, *xp1y, *xym1, *xyp1);
 }
 
+/* Dumb performance model for simgrid: every task is given the same constant
+ * cost, whatever the task and the implementation number. */
+static double stencil5_cost_function(struct starpu_task *task, unsigned nimpl)
+{
+	const double constant_cost = 0.000001;
+
+	(void) nimpl;
+	(void) task;
+	return constant_cost;
+}
+
+/* Performance model attached to the stencil5 codelet; it delegates to
+ * stencil5_cost_function. */
+static struct starpu_perfmodel stencil5_model =
+{
+	.symbol = "stencil5",
+	.type = STARPU_COMMON,
+	.cost_function = stencil5_cost_function
+};
+
 struct starpu_codelet stencil5_cl =
 {
 	.cpu_funcs = {stencil5_cpu},
 	.nbuffers = 5,
-	.modes = {STARPU_RW, STARPU_R, STARPU_R, STARPU_R, STARPU_R}
+	.modes = {STARPU_RW, STARPU_R, STARPU_R, STARPU_R, STARPU_R},
+	.model = &stencil5_model
 };
 
 #ifdef STARPU_QUICK_CHECK
-#  define NITER_DEF	100
+#  define NITER_DEF	10
+#  define X         	2
+#  define Y         	2
+#elif !defined(STARPU_LONG_CHECK)
+#  define NITER_DEF	10
 #  define X         	5
 #  define Y         	5
 #else
@@ -96,9 +116,18 @@ int main(int argc, char **argv)
 
 	int ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	starpu_mpi_init(&argc, &argv, 1);
-	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
-	MPI_Comm_size(MPI_COMM_WORLD, &size);
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &my_rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
+
+	if (starpu_cpu_worker_get_count() == 0)
+	{
+		FPRINTF(stderr, "We need at least 1 CPU worker.\n");
+		starpu_mpi_shutdown();
+		starpu_shutdown();
+		return 77;
+	}
 
 	parse_args(argc, argv);
 
@@ -136,14 +165,14 @@ int main(int argc, char **argv)
 			int mpi_rank = my_distrib(x, y, size);
 			if (mpi_rank == my_rank)
 			{
-				//fprintf(stderr, "[%d] Owning data[%d][%d]\n", my_rank, x, y);
+				//FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", my_rank, x, y);
 				starpu_variable_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[x][y]), sizeof(float));
 			}
 			else if (my_rank == my_distrib(x+1, y, size) || my_rank == my_distrib(x-1, y, size)
 				 || my_rank == my_distrib(x, y+1, size) || my_rank == my_distrib(x, y-1, size))
 			{
 				/* I don't own that index, but will need it for my computations */
-				//fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", my_rank, x, y);
+				//FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", my_rank, x, y);
 				starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(float));
 			}
 			else
@@ -153,6 +182,7 @@ int main(int argc, char **argv)
 			}
 			if (data_handles[x][y])
 			{
+				starpu_data_set_coordinates(data_handles[x][y], 2, x, y);
 				starpu_mpi_data_register(data_handles[x][y], (y*X)+x, mpi_rank);
 			}
 		}
@@ -161,18 +191,21 @@ int main(int argc, char **argv)
 	/* First computation with initial distribution */
 	for(loop=0 ; loop<niter; loop++)
 	{
+		starpu_iteration_push(loop);
+
 		for (x = 1; x < X-1; x++)
 		{
 			for (y = 1; y < Y-1; y++)
 			{
-				starpu_mpi_insert_task(MPI_COMM_WORLD, &stencil5_cl, STARPU_RW, data_handles[x][y],
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &stencil5_cl, STARPU_RW, data_handles[x][y],
 						       STARPU_R, data_handles[x-1][y], STARPU_R, data_handles[x+1][y],
 						       STARPU_R, data_handles[x][y-1], STARPU_R, data_handles[x][y+1],
 						       0);
 			}
 		}
+		starpu_iteration_pop();
 	}
-	fprintf(stderr, "Waiting ...\n");
+	FPRINTF(stderr, "Waiting ...\n");
 	starpu_task_wait_for_all();
 
 	/* Now migrate data to a new distribution */
@@ -192,33 +225,31 @@ int main(int argc, char **argv)
 				starpu_mpi_data_register(data_handles[x][y], (y*X)+x, mpi_rank);
 			}
 			if (data_handles[x][y] && mpi_rank != starpu_mpi_data_get_rank(data_handles[x][y]))
-			{
 				/* Migrate the data */
-				starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, data_handles[x][y], mpi_rank, NULL, NULL);
-				/* And register new rank of the matrix */
-				starpu_mpi_data_set_rank(data_handles[x][y], mpi_rank);
-			}
+				starpu_mpi_data_migrate(MPI_COMM_WORLD, data_handles[x][y], mpi_rank);
 		}
 	}
 
 	/* Second computation with new distribution */
 	for(loop=0 ; loop<niter; loop++)
 	{
+		starpu_iteration_push(niter + loop);
+
 		for (x = 1; x < X-1; x++)
 		{
 			for (y = 1; y < Y-1; y++)
 			{
-				starpu_mpi_insert_task(MPI_COMM_WORLD, &stencil5_cl, STARPU_RW, data_handles[x][y],
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &stencil5_cl, STARPU_RW, data_handles[x][y],
 						       STARPU_R, data_handles[x-1][y], STARPU_R, data_handles[x+1][y],
 						       STARPU_R, data_handles[x][y-1], STARPU_R, data_handles[x][y+1],
 						       0);
 			}
 		}
+		starpu_iteration_pop();
 	}
-	fprintf(stderr, "Waiting ...\n");
+	FPRINTF(stderr, "Waiting ...\n");
 	starpu_task_wait_for_all();
 
-
 	/* Unregister data */
 	for(x = 0; x < X; x++)
 	{
@@ -228,9 +259,7 @@ int main(int argc, char **argv)
 			{
 				int mpi_rank = my_distrib(x, y, size);
 				/* Get back data to original place where the user-provided buffer is. */
-				starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, data_handles[x][y], mpi_rank, NULL, NULL);
-				/* Register original rank of the matrix (although useless) */
-				starpu_mpi_data_set_rank(data_handles[x][y], mpi_rank);
+				starpu_mpi_data_migrate(MPI_COMM_WORLD, data_handles[x][y], mpi_rank);
 				/* And unregister it */
 				starpu_data_unregister(data_handles[x][y]);
 			}
@@ -242,15 +271,15 @@ int main(int argc, char **argv)
 
 	if (display)
 	{
-		fprintf(stdout, "[%d] mean=%2.2f\n", my_rank, mean);
+		FPRINTF(stdout, "[%d] mean=%2.2f\n", my_rank, mean);
 		for(x = 0; x < X; x++)
 		{
-			fprintf(stdout, "[%d] ", my_rank);
+			FPRINTF(stdout, "[%d] ", my_rank);
 			for (y = 0; y < Y; y++)
 			{
-				fprintf(stdout, "%2.2f ", matrix[x][y]);
+				FPRINTF(stdout, "%2.2f ", matrix[x][y]);
 			}
-			fprintf(stdout, "\n");
+			FPRINTF(stdout, "\n");
 		}
 	}
 

+ 229 - 0
nmad/examples/user_datatype/my_interface.c

@@ -0,0 +1,229 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2015, 2016, 2017  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
+#include "my_interface.h"
+
+/* CPU codelet: print on stderr the character and the integer stored in the
+ * interface passed as descr[0].
+ * When _args is non-NULL, a message is unpacked from it and printed between
+ * the brackets; otherwise the placeholder "(null)" is printed. */
+void starpu_my_interface_display_codelet_cpu(void *descr[], void *_args)
+{
+	char c = STARPU_MY_INTERFACE_GET_CHAR(descr[0]);
+	int d = STARPU_MY_INTERFACE_GET_INT(descr[0]);
+	char msg[100];
+
+	if (_args)
+		starpu_codelet_unpack_args(_args, &msg);
+
+	/* A NULL argument for %s is undefined behaviour, so pass an explicit placeholder string */
+	fprintf(stderr, "[%s] My value = '%c' %d\n", _args?msg:"(null)", c, d);
+}
+
+/* CPU codelet: compare the interfaces passed as descr[0] and descr[1].
+ * The int pointer unpacked from _args receives 1 when both the integer and
+ * the character fields are equal, 0 otherwise. */
+void starpu_my_interface_compare_codelet_cpu(void *descr[], void *_args)
+{
+	int *result;
+
+	starpu_codelet_unpack_args(_args, &result);
+
+	int same_int = STARPU_MY_INTERFACE_GET_INT(descr[0]) == STARPU_MY_INTERFACE_GET_INT(descr[1]);
+	int same_char = STARPU_MY_INTERFACE_GET_CHAR(descr[0]) == STARPU_MY_INTERFACE_GET_CHAR(descr[1]);
+
+	*result = (same_int && same_char);
+}
+
+static struct starpu_my_interface *myinterface = NULL;
+
+/* Build and commit an MPI struct datatype matching struct starpu_my_interface
+ * (one MPI_INT followed by one MPI_CHAR) and store it in *mpi_datatype.
+ * Aborts through STARPU_ASSERT_MSG when MPI reports an error. */
+void _starpu_my_interface_datatype_allocate(MPI_Datatype *mpi_datatype)
+{
+	int ret;
+
+	int blocklengths[2] = {1, 1};
+	MPI_Aint displacements[2];
+	MPI_Datatype types[2] = {MPI_INT, MPI_CHAR};
+	/* Only member addresses are taken, never values, so an uninitialised
+	 * local object is enough; this avoids an unchecked heap allocation
+	 * on every call. */
+	struct starpu_my_interface layout;
+
+	/* MPI_Get_address replaces MPI_Address, which was removed in MPI-3 */
+	MPI_Get_address(&layout, displacements);
+	MPI_Get_address(&layout.c, displacements+1);
+	/* Turn the absolute addresses into offsets from the start of the struct */
+	displacements[1] -= displacements[0];
+	displacements[0] = 0;
+
+	ret = MPI_Type_create_struct(2, blocklengths, displacements, types, mpi_datatype);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_create_struct failed");
+
+	ret = MPI_Type_commit(mpi_datatype);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_commit failed");
+}
+
+/* Datatype-allocation callback taking a data handle: the handle is ignored
+ * and the work is delegated to _starpu_my_interface_datatype_allocate(). */
+void starpu_my_interface_datatype_allocate(starpu_data_handle_t handle, MPI_Datatype *mpi_datatype)
+{
+	_starpu_my_interface_datatype_allocate(mpi_datatype);
+	(void)handle;
+}
+
+/* Release the MPI datatype pointed to by mpi_datatype, together with the
+ * file-scope helper object myinterface if one is currently allocated. */
+void starpu_my_interface_datatype_free(MPI_Datatype *mpi_datatype)
+{
+	MPI_Type_free(mpi_datatype);
+	free(myinterface);
+	/* Reset the pointer so that a later call does not free the same block twice */
+	myinterface = NULL;
+}
+
+/* Return the integer field of the interface of handle on STARPU_MAIN_RAM. */
+int starpu_my_interface_get_int(starpu_data_handle_t handle)
+{
+	void *iface = starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+
+	return STARPU_MY_INTERFACE_GET_INT(iface);
+}
+
+/* Return the character field of the interface of handle on STARPU_MAIN_RAM. */
+char starpu_my_interface_get_char(starpu_data_handle_t handle)
+{
+	void *iface = starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+
+	return STARPU_MY_INTERFACE_GET_CHAR(iface);
+}
+
+/* register_data_handle method: initialise the per-node interfaces of handle.
+ * The interface on home_node receives the values of data_interface; the
+ * interfaces on all other nodes are zeroed. */
+static void data_register_data_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface)
+{
+	const struct starpu_my_interface *registered = data_interface;
+	unsigned node;
+
+	for (node = 0; node < STARPU_MAXNODES; node++)
+	{
+		struct starpu_my_interface *replica = starpu_data_get_interface_on_node(handle, node);
+		int is_home = (node == home_node);
+
+		replica->d = is_home ? registered->d : 0;
+		replica->c = is_home ? registered->c : 0;
+	}
+}
+
+/* allocate_data_on_node method: nothing is allocated, both arguments are
+ * ignored and 0 is returned. */
+static starpu_ssize_t data_allocate_data_on_node(void *data_interface, unsigned node)
+{
+	(void)node;
+	(void)data_interface;
+
+	return 0;
+}
+
+/* free_data_on_node method: intentionally empty, both arguments are ignored. */
+static void data_free_data_on_node(void *data_interface, unsigned node)
+{
+	(void)node;
+	(void)data_interface;
+}
+
+/* get_size method: the reported size is that of one int plus one char,
+ * independently of handle. */
+static size_t data_get_size(starpu_data_handle_t handle)
+{
+	const size_t payload_size = sizeof(int) + sizeof(char);
+
+	(void)handle;
+	return payload_size;
+}
+
+/* footprint method: hash of the integer field of handle, computed with
+ * starpu_hash_crc32c_be() and an initial value of 0. */
+static uint32_t data_footprint(starpu_data_handle_t handle)
+{
+	int value = starpu_my_interface_get_int(handle);
+
+	return starpu_hash_crc32c_be(value, 0);
+}
+
+/* pack_data method: must never be called; it only triggers an assertion
+ * failure. All arguments are ignored. */
+static int data_pack_data(starpu_data_handle_t handle, unsigned node, void **ptr, starpu_ssize_t *count)
+{
+	(void)count;
+	(void)ptr;
+	(void)node;
+	(void)handle;
+
+	STARPU_ASSERT_MSG(0, "The data interface has been registered with starpu_mpi_datatype_register(). Calling the pack_data function should not happen\n");
+	return 0;
+}
+
+/* unpack_data method: must never be called; it only triggers an assertion
+ * failure. All arguments are ignored. */
+static int data_unpack_data(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count)
+{
+	(void)count;
+	(void)ptr;
+	(void)node;
+	(void)handle;
+
+	STARPU_ASSERT_MSG(0, "The data interface has been registered with starpu_mpi_datatype_register(). Calling the unpack_data function should not happen\n");
+	return 0;
+}
+
+/* describe method: write "Data<d>-<c>" for the given interface into buf
+ * (at most size bytes) and return the value returned by snprintf(). */
+static starpu_ssize_t data_describe(void *data_interface, char *buf, size_t size)
+{
+	const struct starpu_my_interface *iface = data_interface;
+	int written = snprintf(buf, size, "Data%d-%c", iface->d, iface->c);
+
+	return written;
+}
+
+/* handle_to_pointer method: return the address of the integer field of the
+ * interface of handle on node. Asserts that the data is allocated there. */
+static void *data_handle_to_pointer(starpu_data_handle_t handle, unsigned node)
+{
+	struct starpu_my_interface *iface;
+
+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+
+	iface = starpu_data_get_interface_on_node(handle, node);
+	return &iface->d;
+}
+
+/* any_to_any copy method: copy the integer and the character fields from
+ * src_interface (on src_node) to dst_interface (on dst_node), one
+ * starpu_interface_copy() call per field.
+ * Returns 0 when both calls return 0, -EAGAIN when either returns non-zero. */
+static int copy_any_to_any(void *src_interface, unsigned src_node,
+			   void *dst_interface, unsigned dst_node,
+			   void *async_data)
+{
+	struct starpu_my_interface *src_data = src_interface;
+	struct starpu_my_interface *dst_data = dst_interface;
+	int ret = 0;
+
+	/* starpu_interface_copy() takes the addresses of the buffers: pass the
+	 * address of each field, not the field's value converted to an address */
+	if (starpu_interface_copy((uintptr_t) &src_data->d, 0, src_node,
+				  (uintptr_t) &dst_data->d, 0, dst_node,
+				  sizeof(src_data->d), async_data))
+		ret = -EAGAIN;
+	if (starpu_interface_copy((uintptr_t) &src_data->c, 0, src_node,
+				  (uintptr_t) &dst_data->c, 0, dst_node,
+				  sizeof(src_data->c),
+				  async_data))
+		ret = -EAGAIN;
+	return ret;
+}
+
+/* Copy methods of the interface: a single generic any-to-any routine. */
+static const struct starpu_data_copy_methods data_copy_methods =
+{
+	.any_to_any = copy_any_to_any,
+};
+
+/* Operations table of the interface. It is not const because interfaceid is
+ * filled in by starpu_my_interface_data_register(). */
+static struct starpu_data_interface_ops interface_data_ops =
+{
+	/* identification */
+	.interfaceid = STARPU_UNKNOWN_INTERFACE_ID,
+	.interface_size = sizeof(struct starpu_my_interface),
+
+	/* registration, allocation and transfers */
+	.register_data_handle = data_register_data_handle,
+	.allocate_data_on_node = data_allocate_data_on_node,
+	.free_data_on_node = data_free_data_on_node,
+	.copy_methods = &data_copy_methods,
+
+	/* queries */
+	.get_size = data_get_size,
+	.footprint = data_footprint,
+	.handle_to_pointer = data_handle_to_pointer,
+	.describe = data_describe,
+
+	/* packing */
+	.pack_data = data_pack_data,
+	.unpack_data = data_unpack_data
+};
+
+/* Register xc as a piece of data with home node home_node and store the
+ * resulting handle in *handleptr. */
+void starpu_my_interface_data_register(starpu_data_handle_t *handleptr, unsigned home_node, struct starpu_my_interface *xc)
+{
+	/* The interface id is requested from StarPU on first use only */
+	if (interface_data_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)
+		interface_data_ops.interfaceid = starpu_data_interface_get_next_id();
+
+	starpu_data_register(handleptr, home_node, xc, &interface_data_ops);
+}

+ 62 - 0
nmad/examples/user_datatype/my_interface.h

@@ -0,0 +1,62 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2015  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <mpi.h>
+
+#ifndef __DATA_INTERFACE_H
+#define __DATA_INTERFACE_H
+
+/* User-defined data interface used by the user_datatype example:
+ * one integer and one character. */
+struct starpu_my_interface
+{
+	int d;	/* integer value, read through STARPU_MY_INTERFACE_GET_INT() */
+	char c;	/* character value, read through STARPU_MY_INTERFACE_GET_CHAR() */
+};
+
+void starpu_my_interface_data_register(starpu_data_handle_t *handle, unsigned home_node, struct starpu_my_interface *xc);
+
+char starpu_my_interface_get_char(starpu_data_handle_t handle);
+int starpu_my_interface_get_int(starpu_data_handle_t handle);
+
+#define STARPU_MY_INTERFACE_GET_CHAR(interface)	(((struct starpu_my_interface *)(interface))->c)
+#define STARPU_MY_INTERFACE_GET_INT(interface)	(((struct starpu_my_interface *)(interface))->d)
+
+void _starpu_my_interface_datatype_allocate(MPI_Datatype *mpi_datatype);
+void starpu_my_interface_datatype_allocate(starpu_data_handle_t handle, MPI_Datatype *mpi_datatype);
+void starpu_my_interface_datatype_free(MPI_Datatype *mpi_datatype);
+
+void starpu_my_interface_display_codelet_cpu(void *descr[], void *_args);
+void starpu_my_interface_compare_codelet_cpu(void *descr[], void *_args);
+
+/* Codelet running starpu_my_interface_display_codelet_cpu() on one
+ * read-only buffer. */
+static struct starpu_codelet starpu_my_interface_display_codelet =
+{
+	.name = "starpu_my_interface_display_codelet",
+	.cpu_funcs = {starpu_my_interface_display_codelet_cpu},
+	.cpu_funcs_name = {"starpu_my_interface_display_codelet_cpu"},
+	.nbuffers = 1,
+	.modes = {STARPU_R}
+};
+
+/* Codelet running starpu_my_interface_compare_codelet_cpu() on two
+ * read-only buffers. */
+static struct starpu_codelet starpu_my_interface_compare_codelet =
+{
+	.name = "starpu_my_interface_compare_codelet",
+	.cpu_funcs = {starpu_my_interface_compare_codelet_cpu},
+	.cpu_funcs_name = {"starpu_my_interface_compare_codelet_cpu"},
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_R}
+};
+
+#endif /* __DATA_INTERFACE_H */

+ 113 - 0
nmad/examples/user_datatype/user_datatype.c

@@ -0,0 +1,113 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2015, 2016, 2017  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "my_interface.h"
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0) /* fprintf wrapper that prints nothing when the STARPU_SSILENT environment variable is set */
+
+int main(int argc, char **argv) /* ranks 0 and 1 exchange a struct starpu_my_interface, first with plain MPI_Send/MPI_Recv, then with StarPU-MPI detached requests */
+{
+	int rank, nodes;
+	int ret=0;
+	int compare=0; /* its address is handed to the compare task on rank 0; presumably written by that codelet (body not shown here) */
+
+	struct starpu_my_interface my1 = {.d = 98 , .c = 'z'};
+	struct starpu_my_interface my0 = {.d = 42 , .c = 'n'};
+
+	starpu_data_handle_t handle0;
+	starpu_data_handle_t handle1;
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &nodes);
+
+	if (nodes < 2 || (starpu_cpu_worker_get_count() == 0)) /* bail out unless there are at least 2 processes and 1 CPU worker */
+	{
+		if (rank == 0)
+		{
+			if (nodes < 2)
+				fprintf(stderr, "We need at least 2 processes.\n");
+			else
+				fprintf(stderr, "We need at least 1 CPU.\n");
+		}
+		starpu_mpi_shutdown();
+		starpu_shutdown();
+		return 77; /* presumably the test harness's "skipped" exit code -- TODO confirm */
+	}
+
+	if (rank == 1) /* rank 1 starts with different contents in my0 than rank 0 */
+	{
+		my0.d = 0;
+		my0.c = 'z';
+	}
+	starpu_my_interface_data_register(&handle0, STARPU_MAIN_RAM, &my0);
+	starpu_my_interface_data_register(&handle1, -1, &my1); /* NOTE(review): home_node -1 presumably means "no initial valid copy" -- confirm */
+	starpu_mpi_datatype_register(handle1, starpu_my_interface_datatype_allocate, starpu_my_interface_datatype_free);
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+
+	if (rank == 0) /* plain MPI phase: rank 0 sends my0 to rank 1 with tag 42 */
+	{
+		MPI_Datatype mpi_datatype;
+		_starpu_my_interface_datatype_allocate(&mpi_datatype);
+		MPI_Send(&my0, 1, mpi_datatype, 1, 42, MPI_COMM_WORLD);
+		starpu_my_interface_datatype_free(&mpi_datatype);
+	}
+	else if (rank == 1) /* plain MPI phase: rank 1 receives into my0 and prints it */
+	{
+		MPI_Datatype mpi_datatype;
+		MPI_Status status;
+		_starpu_my_interface_datatype_allocate(&mpi_datatype);
+		MPI_Recv(&my0, 1, mpi_datatype, 0, 42, MPI_COMM_WORLD, &status);
+		FPRINTF(stderr, "Received value: '%c' %d\n", my0.c, my0.d);
+		starpu_my_interface_datatype_free(&mpi_datatype);
+	}
+
+	if (rank == 0) /* StarPU-MPI phase: send handle0 (tag 10), receive into handle1 (tag 20), then display and compare */
+	{
+		int *compare_ptr = &compare;
+
+		starpu_task_insert(&starpu_my_interface_display_codelet, STARPU_VALUE, "node0 initial value", strlen("node0 initial value")+1, STARPU_R, handle0, 0);
+		starpu_mpi_isend_detached(handle0, 1, 10, MPI_COMM_WORLD, NULL, NULL);
+		starpu_mpi_irecv_detached(handle1, 1, 20, MPI_COMM_WORLD, NULL, NULL);
+
+		starpu_task_insert(&starpu_my_interface_display_codelet, STARPU_VALUE, "node0 received value", strlen("node0 received value")+1, STARPU_R, handle1, 0);
+		starpu_task_insert(&starpu_my_interface_compare_codelet, STARPU_R, handle0, STARPU_R, handle1, STARPU_VALUE, &compare_ptr, sizeof(compare_ptr), 0); /* passes the pointer value itself, so the task can reach main's compare */
+	}
+	else if (rank == 1) /* StarPU-MPI phase: receive into handle0 (tag 10), display it, send it back (tag 20) */
+	{
+		starpu_task_insert(&starpu_my_interface_display_codelet, STARPU_VALUE, "node1 initial value", strlen("node1 initial value")+1, STARPU_R, handle0, 0);
+		starpu_mpi_irecv_detached(handle0, 0, 10, MPI_COMM_WORLD, NULL, NULL);
+		starpu_task_insert(&starpu_my_interface_display_codelet, STARPU_VALUE, "node1 received value", strlen("node1 received value")+1, STARPU_R, handle0, 0);
+		starpu_mpi_isend_detached(handle0, 0, 20, MPI_COMM_WORLD, NULL, NULL);
+	}
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+	starpu_mpi_wait_for_all(MPI_COMM_WORLD);
+
+	starpu_mpi_datatype_unregister(handle0); /* NOTE(review): registration above went through handle1; presumably equivalent if keyed by interface type -- confirm */
+	starpu_data_unregister(handle0);
+	starpu_data_unregister(handle1);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	return (rank == 0) ? !compare : 0; /* rank 0 exits 0 only when compare is non-zero; every other rank exits 0 */
+}