
merge trunk

Nathalie Furmento 8 years ago
parent
commit
5cc8decc7b
100 changed files with 2665 additions and 812 deletions
  1. AUTHORS (+1, -0)
  2. ChangeLog (+13, -0)
  3. Makefile.am (+22, -1)
  4. autogen.sh (+1, -0)
  5. configure.ac (+212, -70)
  6. doc/doxygen/Makefile.am (+43, -5)
  7. doc/doxygen/chapters/210_check_list_performance.doxy (+7, -3)
  8. doc/doxygen/chapters/310_data_management.doxy (+23, -8)
  9. doc/doxygen/chapters/320_scheduling.doxy (+12, -2)
  10. doc/doxygen/chapters/390_faq.doxy (+12, -0)
  11. doc/doxygen/chapters/401_out_of_core.doxy (+3, -1)
  12. doc/doxygen/chapters/410_mpi_support.doxy (+136, -20)
  13. doc/doxygen/chapters/501_environment_variables.doxy (+48, -2)
  14. doc/doxygen/chapters/510_configure_options.doxy (+41, -0)
  15. doc/doxygen/chapters/api/codelet_and_tasks.doxy (+11, -0)
  16. doc/doxygen/chapters/api/data_interfaces.doxy (+85, -12)
  17. doc/doxygen/chapters/api/data_management.doxy (+5, -1)
  18. doc/doxygen/chapters/api/data_out_of_core.doxy (+7, -0)
  19. doc/doxygen/chapters/api/data_partition.doxy (+28, -5)
  20. doc/doxygen/chapters/api/initialization.doxy (+4, -0)
  21. doc/doxygen/chapters/api/modularized_scheduler.doxy (+5, -2)
  22. doc/doxygen/chapters/api/mpi.doxy (+41, -0)
  23. doc/doxygen/chapters/api/performance_model.doxy (+4, -0)
  24. doc/doxygen/chapters/api/scheduling_policy.doxy (+9, -5)
  25. doc/doxygen/chapters/api/workers.doxy (+18, -1)
  26. doc/doxygen/chapters/code/disk_compute.c (+6, -7)
  27. examples/Makefile.am (+10, -2)
  28. examples/audio/starpu_audio_processing.c (+10, -8)
  29. examples/basic_examples/variable.c (+2, -4)
  30. examples/cpp/add_vectors.cpp (+7, -0)
  31. examples/cpp/add_vectors_cpp11.cpp (+7, -1)
  32. examples/cpp/add_vectors_interface.cpp (+656, -0)
  33. examples/heat/dw_factolu_kernels.c (+8, -8)
  34. examples/heat/dw_sparse_cg.c (+21, -10)
  35. examples/lu/lu.sh (+1, -1)
  36. examples/lu/lu_example.c (+3, -0)
  37. examples/mlr/mlr.c (+20, -17)
  38. examples/ppm_downscaler/yuv_downscaler.c (+3, -3)
  39. examples/sched_ctx/parallel_tasks_with_cluster_api.c (+5, -1)
  40. examples/sched_ctx_utils/sched_ctx_utils.c (+13, -13)
  41. examples/scheduler/schedulers.sh (+1, -1)
  42. examples/scheduler/schedulers_context.sh (+1, -1)
  43. examples/spmv/dw_block_spmv.c (+2, -2)
  44. examples/spmv/matrix_market/mmio.c (+2, -2)
  45. examples/stencil/implicit-stencil.c (+2, -2)
  46. examples/stencil/stencil.c (+2, -2)
  47. include/fstarpu_mod.f90 (+11, -0)
  48. include/pthread_win32/pthread.h (+2, -2)
  49. include/schedulers/starpu_heteroprio.h (+1, -1)
  50. include/starpu.h (+2, -2)
  51. include/starpu_config.h.in (+3, -1)
  52. include/starpu_data.h (+5, -1)
  53. include/starpu_data_filters.h (+2, -1)
  54. include/starpu_data_interfaces.h (+4, -0)
  55. include/starpu_disk.h (+5, -4)
  56. include/starpu_mpi_ms.h (+2, -2)
  57. include/starpu_perfmodel.h (+2, -0)
  58. include/starpu_sched_component.h (+4, -4)
  59. include/starpu_scheduler.h (+0, -3)
  60. include/starpu_task.h (+13, -6)
  61. include/starpu_thread_util.h (+11, -1)
  62. include/starpu_util.h (+35, -4)
  63. include/starpu_worker.h (+8, -0)
  64. mpi/examples/Makefile.am (+1, -29)
  65. mpi/examples/complex/mpi_complex.c (+2, -2)
  66. mpi/examples/matrix_decomposition/mpi_decomposition_params.c (+3, -2)
  67. mpi/examples/mpi_lu/pxlu.c (+1, -1)
  68. mpi/examples/mpi_lu/pxlu.h (+3, -2)
  69. mpi/examples/user_datatype/my_interface.c (+15, -15)
  70. mpi/examples/user_datatype/user_datatype.c (+2, -2)
  71. mpi/include/fstarpu_mpi_mod.f90 (+105, -0)
  72. mpi/include/starpu_mpi.h (+11, -0)
  73. mpi/src/load_balancer/policy/load_data_interface.c (+2, -2)
  74. mpi/src/load_balancer/policy/load_heat_propagation.c (+5, -5)
  75. mpi/src/starpu_mpi.c (+175, -49)
  76. mpi/src/starpu_mpi_cache.c (+41, -18)
  77. mpi/src/starpu_mpi_cache_stats.c (+6, -3)
  78. mpi/src/starpu_mpi_comm.c (+4, -1)
  79. mpi/src/starpu_mpi_datatype.c (+17, -29)
  80. mpi/src/starpu_mpi_early_data.c (+7, -7)
  81. mpi/src/starpu_mpi_early_request.c (+7, -7)
  82. mpi/src/starpu_mpi_fortran.c (+35, -2)
  83. mpi/src/starpu_mpi_fxt.h (+24, -0)
  84. mpi/src/starpu_mpi_helper.c (+19, -6)
  85. mpi/src/starpu_mpi_private.h (+7, -2)
  86. mpi/src/starpu_mpi_select_node.c (+3, -2)
  87. mpi/src/starpu_mpi_stats.c (+10, -5)
  88. mpi/src/starpu_mpi_sync_data.c (+9, -9)
  89. mpi/src/starpu_mpi_tag.c (+4, -1)
  90. mpi/src/starpu_mpi_task_insert.c (+78, -38)
  91. mpi/src/starpu_mpi_task_insert.h (+2, -2)
  92. mpi/src/starpu_mpi_task_insert_fortran.c (+35, -18)
  93. mpi/tests/Makefile.am (+4, -107)
  94. mpi/tests/block_interface.c (+1, -1)
  95. mpi/tests/block_interface_pinned.c (+2, -2)
  96. mpi/tests/cache.c (+2, -1)
  97. mpi/tests/cache_disable.c (+2, -1)
  98. mpi/tests/callback.c (+11, -6)
  99. mpi/tests/datatypes.c (+327, -180)
  100. mpi/tests/insert_task_compute.c (+0, -0)

+ 1 - 0
AUTHORS

@@ -17,6 +17,7 @@ David Gómez <david_gomez1380@yahoo.com.mx>
 Sylvain Henry <sylvain.henry@inria.fr>
 Andra Hugo <andra.hugo@inria.fr>
 Mehdi Juhoor <mjuhoor@gmail.com>
+Erwan Leria <erwan.leria@etu.u-bordeaux.fr>
 Xavier Lacoste <xavier.lacoste@inria.fr>
 Benoît Lizé <benoit.lize@gmail.com>
 Antoine Lucas <antoine.lucas.33@gmail.com>

+ 13 - 0
ChangeLog

@@ -27,6 +27,7 @@ New features:
     --enable-mpi-master-slave option to activate it.
   * Add STARPU_CUDA_THREAD_PER_DEV environment variable to support driving all
     GPUs from only one thread when almost all kernels are asynchronous.
+  * Add starpu_replay tool to replay tasks.rec files with Simgrid.
 
 Small features:
   * Scheduling contexts may now be associated a user data pointer at creation
@@ -46,14 +47,21 @@ Small features:
   * Add a field starpu_task::where similar to starpu_codelet::where
     which allows to restrict where to execute a task. Also add
     STARPU_TASK_WHERE to be used when calling starpu_task_insert().
+  * Add SubmitOrder trace field.
+  * Add workerids and workerids_len task fields.
+  * Add priority management to StarPU-MPI.
+  * Add STARPU_MAIN_THREAD_CPUID and STARPU_MPI_THREAD_CPUID environment
+    variables.
 
 Changes:
   * Vastly improve simgrid simulation time.
+  * Switch default scheduler to lws.
 
 Small changes:
   * Use asynchronous transfers for task data fetches which were not prefetched.
   * Allow to call starpu_sched_ctx_set_policy_data on the main
     scheduler context
+  * Function starpu_is_initialized() is moved to the public API.
 
 StarPU 1.2.2 (svn revision xxx)
 ==============================================
@@ -70,6 +78,7 @@ New features:
     STARPU_TASK_BREAK_ON_EXEC environment variables, with the job_id
     of a task. StarPU will raise SIGTRAP when the task is being
     scheduled, pushed, or popped by the scheduler.
+  * Add per-node MPI data.
 
 Small features:
   * New function starpu_worker_get_job_id(struct starpu_task *task)
@@ -78,6 +87,10 @@ Small features:
   * MPI: Add mpi communications in dag.dot
   * Add STARPU_PERF_MODEL_HOMOGENEOUS_CPU environment variable to
     allow having one perfmodel per CPU core
+  * Add starpu_vector_filter_list_long filter.
+  * Add starpu_perfmodel_arch_comb_fetch function.
+  * Add STARPU_WATCHDOG_DELAY environment variable.
+  * Add starpu_mpi_get_data_on_all_nodes_detached function.
 
 Small changes:
   * Output generated through STARPU_MPI_COMM has been modified to

+ 22 - 1
Makefile.am

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2009-2017  Université de Bordeaux
-# Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016  CNRS
+# Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016, 2017  CNRS
 # Copyright (C) 2014  INRIA
 # Copyright (C) 2016  Inria
 #
@@ -43,6 +43,10 @@ if USE_NMAD
 SUBDIRS += nmad
 endif
 
+if USE_DSM
+SUBDIRS += dsm
+endif
+
 if BUILD_EXAMPLES
 SUBDIRS += examples
 endif
@@ -142,6 +146,16 @@ if STARPU_DEVEL
 		echo "Please do not use getenv, use starpu_getenv instead, which catches unsafe uses"; \
 		false ; \
 	fi
+# we count the number of files which include unistd.h,
+# then the number of files which properly include unistd.h (i.e. by first checking that it is available),
+# and then we check that both numbers are the same
+	@UNISTD_ALL_LINES=$(shell grep -B1 -rs "^#include <unistd.h>" $(srcdir)/src/ $(srcdir)/include/ $(srcdir)/mpi/src $(srcdir)/mpi/include  |grep -v dolib|grep -v -e "--" | tr '\012' '@' | sed 's/unistd.h>@/unistd.h>\n/g' | wc -l) ;\
+	UNISTD_CORRECT_LINES=$(shell grep -B1 -rs "^#include <unistd.h>" $(srcdir)/src/ $(srcdir)/include/ $(srcdir)/mpi/src $(srcdir)/mpi/include  |grep -v dolib|grep -v -e "--" | tr '\012' '@' | sed 's/unistd.h>@/unistd.h>\n/g' | grep '#ifdef .*HAVE_UNISTD_H.*:#include <unistd.h>' | wc -l) ;\
+	if test $$UNISTD_ALL_LINES -ne $$UNISTD_CORRECT_LINES ; \
+	then \
+		echo "Please do not unconditionally include unistd.h, it is not available on Windows, include config.h and test for HAVE_UNISTD_H" ; \
+		false ; \
+	fi
 endif
 
 if BUILD_STARPU_TOP
@@ -180,6 +194,13 @@ DISTCLEANFILES = STARPU-REVISION
 
 include starpu-top/extradist
 
+recheck:
+	RET=0 ; \
+	for i in $(SUBDIRS) ; do \
+		make -C $$i recheck || RET=1 ; \
+	done ; \
+	exit $$RET
+
 showcheck:
 	RET=0 ; \
 	for i in $(SUBDIRS) ; do \

+ 1 - 0
autogen.sh

@@ -28,5 +28,6 @@ then
 		export LIBTOOLIZE=glibtoolize
 	fi
 fi
+mkdir -p dsm
 autoreconf -ivf -I m4
 

+ 212 - 70
configure.ac

@@ -440,6 +440,38 @@ AM_CONDITIONAL(USE_NMAD, test x$build_nmad_lib = xyes)
 
 ###############################################################################
 #                                                                             #
+#                                    DSM                                      #
+#                                                                             #
+###############################################################################
+
+AC_ARG_ENABLE(dsm, [AS_HELP_STRING([--enable-dsm],
+                              [Enable StarPU DSM library generation])],
+            [enable_dsm=$enableval],
+            [enable_dsm=no])
+
+if test x$enable_dsm = xyes; then
+        # rule out StarPU DSM + StarPU MPI for now
+        if test x$build_mpi_lib = xyes; then
+                AC_MSG_ERROR(StarPU DSM library generation is incompatible with StarPU MPI lib generation)
+        fi
+fi
+
+dsm_included=no
+sinclude(dsm/config-inc.ac)
+if test x$dsm_included != xyes; then
+        if test x$enable_dsm = xyes; then
+                AC_MSG_ERROR(StarPU DSM library generation needs DSM source in STARPU/dsm subdirectory)
+        fi
+	mkdir -p dsm
+	echo "dist:" > ./dsm/Makefile
+	echo "distdir:" >> ./dsm/Makefile
+        enable_dsm=no
+fi
+
+AM_CONDITIONAL(USE_DSM, [test x$enable_dsm != xno])
+
+###############################################################################
+#                                                                             #
 #                                LIBTOOLS                                     #
 #                                                                             #
 ###############################################################################
@@ -604,6 +636,23 @@ AM_CONDITIONAL([STARPU_USE_MIC], [test "x$enable_mic" = "xyes"])
 
 ###############################################################################
 
+###############################################################################
+#                                                                             #
+#                           NUMA memory nodes                                 #
+#                                                                             #
+###############################################################################
+
+AC_MSG_CHECKING(maximum number of NUMA nodes)
+AC_ARG_ENABLE(maxnumanodes, [AS_HELP_STRING([--enable-maxnumanodes=<number>],
+			[maximum number of NUMA nodes])],
+			nmaxnumanodes=$enableval, nmaxnumanodes=2)
+AC_MSG_RESULT($nmaxnumanodes)
+AC_DEFINE_UNQUOTED(STARPU_MAXNUMANODES, [$nmaxnumanodes],
+		[maximum number of NUMA nodes])
+
+
+###############################################################################
+
 AC_PATH_PROGS([STARPU_MS_LIB], [lib])
 AC_ARG_VAR([STARPU_MS_LIB], [Path to Microsoft's Visual Studio `lib' tool])
 AM_CONDITIONAL([STARPU_HAVE_MS_LIB], [test "x$STARPU_MS_LIB" != "x"])
@@ -617,6 +666,10 @@ case "$target" in
   starpu_linux=yes
   AC_DEFINE(STARPU_LINUX_SYS, [1], [Define to 1 on Linux])
   ;;
+*-*-openbsd*)
+  starpu_openbsd=yes
+  AC_DEFINE(STARPU_OPENBSD_SYS, [1], [Define to 1 on OpenBSD systems])
+  ;;
 *-*darwin*)
   starpu_darwin=yes
   AC_DEFINE(STARPU_HAVE_DARWIN, [1], [Define this on darwin.])
@@ -625,6 +678,7 @@ esac
 AM_CONDITIONAL([STARPU_HAVE_WINDOWS], [test "x$starpu_windows" = "xyes"])
 AM_CONDITIONAL([STARPU_LINUX_SYS], [test "x$starpu_linux" = "xyes"])
 AM_CONDITIONAL([STARPU_HAVE_DARWIN], [test "x$starpu_darwin" = "xyes"])
+AM_CONDITIONAL([STARPU_OPENBSD_SYS], [test "x$starpu_openbsd" = "xyes"])
 
 # on Darwin, GCC targets i386 by default, so we don't have atomic ops
 AC_CHECK_SIZEOF([void *])
@@ -795,15 +849,80 @@ fi
 AC_CHECK_FUNC([sched_yield], [AC_DEFINE([STARPU_HAVE_SCHED_YIELD], [1], [Define to 1 if the function sched_yield is available.])])
 
 AC_CHECK_HEADERS([aio.h])
-AC_CHECK_HEADERS([libaio.h])
 AC_CHECK_LIB([rt], [aio_read])
-AC_CHECK_LIB([aio], [io_setup])
+#AC_CHECK_HEADERS([libaio.h])
+#AC_CHECK_LIB([aio], [io_setup])
 
 AC_CHECK_FUNCS([mkostemp])
 AC_CHECK_FUNCS([mkdtemp])
 
 AC_CHECK_FUNCS([pread pwrite])
 
+AC_ARG_ENABLE(hdf5, [AS_HELP_STRING([--disable-hdf5], [disable HDF5 support])],
+                    enable_hdf5=$enableval, enable_hdf5=maybe)
+
+if test "x$enable_hdf5" != xno
+then
+	AC_ARG_WITH(hdf5-include-dir,
+		[AS_HELP_STRING([--with-hdf5-include-dir=<path>],
+		[specify where HDF5 headers are installed])],
+		[
+			hdf5_include_dir="$withval"
+		], [hdf5_include_dir=""])
+
+	hdf5_inc_dir="/usr/include/hdf5 /usr/include/hdf5/serial ${hdf5_include_dir}"
+
+	enable_include_hdf5=no
+	for f in $hdf5_inc_dir; do
+		if test -n "$f" ; then
+			SAVED_CFLAGS="${CFLAGS}"
+			CFLAGS=-I${f}
+			AC_CHECK_HEADERS([hdf5.h])
+			if test "$ac_cv_header_hdf5_h" = "yes" ; then
+				CFLAGS="-I${f} ${SAVED_CFLAGS}"
+				enable_include_hdf5=yes
+				break
+			else
+				CFLAGS=${SAVED_CFLAGS}
+			fi
+			unset ac_cv_header_hdf5_h
+		fi
+	done
+
+
+	AC_ARG_WITH(hdf5-lib-dir,
+		[AS_HELP_STRING([--with-hdf5-lib-dir=<path>],
+		[specify where HDF5 libraries are installed])],
+		[
+			hdf5_libraries_dir="$withval"
+		], [hdf5_libraries_dir=""])
+
+	hdf5_lib_dir="/usr/lib/x86_64-linux-gnu/hdf5 /usr/lib/x86_64-linux-gnu/hdf5/serial ${hdf5_libraries_dir}"
+
+	enable_libraries_hdf5=no
+	for f in $hdf5_lib_dir; do
+		if test -n "$f" ; then
+			SAVED_LDFLAGS="${LDFLAGS}"
+			LDFLAGS=-L${f}
+			STARPU_HAVE_LIBRARY(HDF5, [hdf5])
+			if test "$ac_cv_lib_hdf5_main" = "yes" ; then
+				LDFLAGS="-L${f} ${SAVED_LDFLAGS} ${STARPU_HDF5_LDFLAGS}"
+				enable_libraries_hdf5=yes
+				break
+			else
+				LDFLAGS=${SAVED_LDFLAGS}
+			fi
+			unset ac_cv_lib_hdf5_main
+		fi
+	done
+fi
+
+if test  "x$enable_libraries_hdf5" = "xyes" -a "x$enable_include_hdf5" = "xyes" -a "x$enable_hdf5" != "xno"; then
+        AC_DEFINE([STARPU_HAVE_HDF5], [1], [Define to 1 if you have the <hdf5.h> header file.])
+fi
+AM_CONDITIONAL(STARPU_HAVE_HDF5, test  "x$enable_libraries_hdf5" = "xyes" -a "x$enable_include_hdf5" = "xyes" -a "x$enable_hdf5" != "xno")
+
+
 # This defines HAVE_SYNC_VAL_COMPARE_AND_SWAP
 STARPU_CHECK_SYNC_VAL_COMPARE_AND_SWAP
 
@@ -1851,13 +1970,13 @@ else
 	CXXFLAGS="-O3 $CXXFLAGS"
 fi
 if test x$GCC = xyes; then
-	CFLAGS+=" -gdwarf-2"
-	CXXFLAGS+=" -gdwarf-2"
-	LDFLAGS+=" -gdwarf-2"
+	CFLAGS+=" -gdwarf-2 -g3"
+	CXXFLAGS+=" -gdwarf-2 -g3"
+	LDFLAGS+=" -gdwarf-2 -g3"
 fi
-CFLAGS+=" -g3 "
-CXXFLAGS+=" -g3 "
-LDFLAGS+=" -g3 "
+CFLAGS+=" -g "
+CXXFLAGS+=" -g "
+LDFLAGS+=" -g "
 
 if test x$enable_spinlock_check = xyes; then
 	AC_DEFINE(STARPU_SPINLOCK_CHECK, [1], [check spinlock use])
@@ -1984,15 +2103,19 @@ if test x$use_fxt = xyes; then
 	##########################################
 	# Poti is a library to generate paje trace files
 	##########################################
-	PKG_CHECK_MODULES([POTI], [poti], [
-		AC_DEFINE(STARPU_HAVE_POTI, [1], [Define to 1 if you have libpoti])
+	PKG_CHECK_MODULES([POTI], [poti], [have_valid_poti=yes], [have_valid_poti=no])
+	AC_ARG_ENABLE(poti, [AS_HELP_STRING([--enable-poti],
+				[Enable the use of the POTI library to generate Paje traces])],
+				enable_poti=$enableval, enable_poti=no)
+	if test x$enable_poti = xyes -a x$have_valid_poti = xyes ; then
+		AC_DEFINE(STARPU_HAVE_POTI, [1], [Define to 1 if you have libpoti and it is meant to be used])
 		save_LIBS="$LIBS"
 		LIBS="$LIBS $POTI_LIBS"
 		AC_CHECK_FUNCS([poti_init_custom])
 		LIBS="$save_LIBS"
-	], [:])
-	FXT_CFLAGS="$FXT_CFLAGS $POTI_CFLAGS"
-	FXT_LIBS="$FXT_LIBS $POTI_LIBS"
+		FXT_CFLAGS="$FXT_CFLAGS $POTI_CFLAGS"
+		FXT_LIBS="$FXT_LIBS $POTI_LIBS"
+	fi
 fi
 
 AC_MSG_CHECKING(whether additional locking systems FxT traces should be enabled)
@@ -2116,16 +2239,16 @@ if test x$maxnodes = x0 ; then
 	else
 		# We have one memory node shared by all CPU workers, one node per GPU
 		# and per MIC device
-		# we add nodes to use 3 memory disks
-		nodes=4
+		# we add nodes to use 2 memory disks
+		nodes=`expr $nmaxnumanodes + 2`
 		if test x$enable_cuda = xyes ; then
-			# we could have used nmaxcudadev + 1, but this would certainly give an
-			# odd number.
+			# we could have used nmaxcudadev + 1, but this would certainly give an
+			# odd number.
 			nodes=`expr $nodes + $nmaxcudadev`
 		fi
 		if test x$enable_opencl = xyes ; then
-			# we could have used nmaxcudadev + 1, but this would certainly give an
-			# odd number.
+			# we could have used nmaxcudadev + 1, but this would certainly give an
+			# odd number.
 			nodes=`expr $nodes + $nmaxopencldev`
 		fi
 		if test x$enable_mic = xyes ; then
@@ -2317,40 +2440,45 @@ if test "x$enable_starpu_top" != "xno" ; then
 	AC_PATH_PROGS([QMAKE], [qmake-qt4 qmake], [not-found])
 	if test x$QMAKE != xnot-found; then
 		QMAKE_VERSION=`$QMAKE --version 2>&1 | head -n 1 | cut -d '.' -f 1 | cut -d ' ' -f 3`
-		if test $QMAKE_VERSION -ge 2 ; then
-			PKG_CHECK_EXISTS([QtGui QtNetwork QtOpenGL QtSql], [
+		QT_VERSION=`$QMAKE --version 2>&1 | tail -n 1 | cut -d '.' -f 1 | cut -d ' ' -f 4`
+		if test $QT_VERSION -ge 5 ; then
+			can_build_starpu_top=yes
+		elif test $QMAKE_VERSION -ge 2 ; then
+			PKG_CHECK_EXISTS([QtGui QtOpenGL QtSql], [
 				QT_MAJVERSION=`$PKG_CONFIG --modversion QtGui | cut -d '.' -f 1`
 				QT_MINVERSION=`$PKG_CONFIG --modversion QtGui | cut -d '.' -f 2`
 				if test $QT_MAJVERSION -gt 4 -o \( $QT_MAJVERSION -eq 4 -a $QT_MINVERSION -ge 7 \) ; then
 					can_build_starpu_top=yes
 				fi
-				QWT_PRI=embed
-				AC_ARG_WITH(qwt-include-dir,
-					[AS_HELP_STRING([--with-qwt-include-dir=<path>],
-					[specify installed libqwt include path])],
-					[
-						STARPU_QWT_INCLUDE="$withval"
-						AC_SUBST(STARPU_QWT_INCLUDE)
-						QWT_PRI=system
-					])
-				AC_ARG_WITH(qwt-lib-dir,
-					[AS_HELP_STRING([--with-qwt-lib-dir=<path>],
-					[specify installed libqwt library path])],
-					[
-						STARPU_QWT_LDFLAGS="-L$withval"
-						QWT_PRI=system
-					])
-				AC_ARG_WITH(qwt-lib,
-					[AS_HELP_STRING([--with-qwt-lib=<name>],
-					[specify installed libqwt library name])],
-					[
-						STARPU_QWT_LDFLAGS="${STARPU_QWT_LDFLAGS} -l$withval"
-						QWT_PRI=system
-					])
-				AC_SUBST(STARPU_QWT_LDFLAGS)
-				AC_SUBST(QWT_PRI)
 			])
 		fi
+		if test x$can_build_starpu_top = xyes; then
+			QWT_PRI=embed
+			AC_ARG_WITH(qwt-include-dir,
+				[AS_HELP_STRING([--with-qwt-include-dir=<path>],
+				[specify installed libqwt include path])],
+				[
+					STARPU_QWT_INCLUDE="$withval"
+					AC_SUBST(STARPU_QWT_INCLUDE)
+					QWT_PRI=system
+				])
+			AC_ARG_WITH(qwt-lib-dir,
+				[AS_HELP_STRING([--with-qwt-lib-dir=<path>],
+				[specify installed libqwt library path])],
+				[
+					STARPU_QWT_LDFLAGS="-L$withval"
+					QWT_PRI=system
+				])
+			AC_ARG_WITH(qwt-lib,
+				[AS_HELP_STRING([--with-qwt-lib=<name>],
+				[specify installed libqwt library name])],
+				[
+					STARPU_QWT_LDFLAGS="${STARPU_QWT_LDFLAGS} -l$withval"
+					QWT_PRI=system
+				])
+			AC_SUBST(STARPU_QWT_LDFLAGS)
+			AC_SUBST(QWT_PRI)
+		fi
 	fi
 fi
 
@@ -2439,18 +2567,25 @@ AC_SUBST([pkglibdir])
 
 AC_ARG_ENABLE(fortran, [AS_HELP_STRING([--disable-fortran],
 			[disable build of fortran examples])],
-			enable_build_fortran=$enableval, enable_build_fortran=yes)
+			enable_build_fortran_requested=$enableval, enable_build_fortran_requested=yes)
 use_mpi_fort=no
-if test "x$FC" != "x"; then
-	if $FC --version|grep -q 'GNU Fortran'; then
-		gfortran_fc_version=`$FC --version|head -1|sed 's/.*)//;s/^.*\([[0-9]][[0-9]]*\)\.\([[0-9]][[0-9]]*\)\.\([[0-9]][[0-9]]*\).*/\1.\2.\3/'`
-		gfortran_maj_version=`echo $gfortran_fc_version|cut -d. -f1`
-		gfortran_min_version=`echo $gfortran_fc_version|cut -d. -f2`
-
-		if test $gfortran_maj_version -lt 4 -o \( $gfortran_maj_version -eq 4 -a $gfortran_min_version -lt 9 \) ; then
-			AC_MSG_WARN([GFortran $gfortran_fc_version too old, version >= 4.9.x needed, Fortran examples will not be built])
-			enable_build_fortran="no"
-		fi
+enable_build_fortran=no
+if test "x$enable_build_fortran_requested" = "xyes" ; then
+   if test "x$FC" != "x"; then
+   	if $FC --version|grep -q 'GNU Fortran'; then
+		 AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+     	         #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)
+                 int dummy;
+                 #else
+                 #error GFortran too old, version >= 4.9.x needed, Fortran examples will not be built
+                 #endif
+                 ]],
+                 )],
+                 [enable_build_fortran="yes"],
+                 [enable_build_fortran="no"])
+                 if test "$enable_build_fortran" = "no" ; then
+                   AC_MSG_WARN([GFortran too old, version >= 4.9.x needed, Fortran examples will not be built])
+                 fi
 	else
 		if $FC -V 2>&1|grep -q 'Intel(R) Fortran'; then
 			ifort_fc_version=`$FC -V 2>&1 |head -1|sed 's/.*Version //;s/ Build.*//'`
@@ -2520,6 +2655,7 @@ if test "x$FC" != "x"; then
 			fi
 		fi
 	fi
+   fi
 fi
 if test "x$enable_build_fortran" = "xyes" ; then
    if test "x$FC" = "x" ; then
@@ -2916,7 +3052,9 @@ if test x$enable_mlr = xyes -a "$starpu_windows" != "yes" ; then
 			else
 				AC_MSG_RESULT(no)
 				AC_MSG_CHECKING(min-dgels source)
-				cp -r $srcdir/min-dgels $PWD/
+				if test ! -d $PWD/min-dgels; then
+					cp -r $srcdir/min-dgels $PWD/
+				fi
 				AC_MSG_RESULT(yes)
 				DGELS_LIBS="-Wl,--start-group $STARPU_BUILD_DIR/min-dgels/build/*.a -Wl,--end-group"
 				AC_DEFINE(STARPU_MLR_MODEL, [1], [use user defined library])
@@ -3198,6 +3336,8 @@ AC_CONFIG_COMMANDS([executable-scripts], [
   test -e tests/microbenchs/parallel_independent_homogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks.sh tests/microbenchs/
   mkdir -p tests/datawizard
   test -e tests/datawizard/locality.sh || ln -sf $ac_abs_top_srcdir/tests/datawizard/locality.sh tests/datawizard/
+  mkdir -p tests/overlap
+  test -e tests/overlap/overlap.sh || ln -sf $ac_abs_top_srcdir/tests/overlap/overlap.sh tests/overlap/
   mkdir -p tests/model-checking
   test -e tests/model-checking/prio_list.sh || ln -sf $ac_abs_top_srcdir/tests/model-checking/prio_list.sh tests/model-checking/
   test -e tests/model-checking/barrier.sh || ln -sf $ac_abs_top_srcdir/tests/model-checking/barrier.sh tests/model-checking/
@@ -3205,6 +3345,10 @@ AC_CONFIG_COMMANDS([executable-scripts], [
   test -e examples/heat/heat.sh || ln -sf $ac_abs_top_srcdir/examples/heat/heat.sh examples/heat/
   mkdir -p examples/lu
   test -e examples/lu/lu.sh || ln -sf $ac_abs_top_srcdir/examples/lu/lu.sh examples/lu/
+  test -e tools/starpu_paje_draw_histogram.R || ln -sf $ac_abs_top_srcdir/tools/starpu_paje_draw_histogram.R tools/starpu_paje_draw_histogram.R
+  test -e tools/starpu_paje_state_stats.R || ln -sf $ac_abs_top_srcdir/tools/starpu_paje_state_stats.R tools/starpu_paje_state_stats.R
+  test -e tools/starpu_trace_state_stats.py || ln -sf $ac_abs_top_srcdir/tools/starpu_trace_state_stats.py tools/starpu_trace_state_stats.py
+  chmod +x tools/starpu_trace_state_stats.py
 ])
 
 # Create links to ICD files in build/socl/vendors directory. SOCL will use this
@@ -3224,18 +3368,6 @@ AC_SUBST(SOCL_VENDORS)
 AC_CONFIG_FILES(tests/regression/regression.sh tests/regression/profiles tests/regression/profiles.build.only)
 AC_CONFIG_HEADER(src/common/config.h include/starpu_config.h gcc-plugin/include/starpu-gcc/config.h starpu-top/config.h)
 
-AH_BOTTOM([
-#if defined(STARPU_DEVEL) && defined(BUILDING_STARPU)
-#  ifndef STARPU_CHECKED_UNISTD_H
-#    define STARPU_CHECKED_UNISTD_H
-#    ifdef _UNISTD_H
-#      define _UNISTD_H PLEASE_DONT_INCLUDE_IT
-#      error Please do not unconditionally include unistd.h, it is not available on Windows, include config.h and test for HAVE_UNISTD_H
-#    endif
-#  endif
-#endif
-])
-
 SANITIZE=$(echo $CFLAGS | grep sanitize)
 AM_CONDITIONAL(STARPU_SANITIZE, test -n "$SANITIZE")
 
@@ -3355,6 +3487,7 @@ AC_MSG_NOTICE([
 	       StarPU MPI(nmad) enabled:                      $build_nmad_lib
 	       MPI test suite:                                $running_mpi_check
 	       Master-Slave MPI enabled:                      $use_mpi_master_slave
+	       StarPU DSM enabled:                            $enable_dsm
 	       FFT Support:                                   $fft_support
 	       GCC plug-in:                                   $build_gcc_plugin
 	       GCC plug-in test suite (requires GNU Guile):   $run_gcc_plugin_test_suite
@@ -3384,3 +3517,12 @@ WARNING: hwloc was not enabled.  If the target machine is hyperthreaded the
 performance may be impacted a lot.  It is strongly recommended to install
 hwloc])
 fi
+
+if test x"$starpu_windows" = xyes -a "x$STARPU_MS_LIB" = "x"
+then
+  AC_MSG_NOTICE([
+WARNING: lib was not found, you will not be able to build StarPU applications
+with Microsoft Visual Studio. Add to your PATH the directories for MSVC, e.g
+   c:\Program Files (x86)\Microsoft Visual Studio 11.0\Common7\IDE;
+   c:\Program Files (x86)\Microsoft Visual Studio 11.0\VC\bin])
+fi

+ 43 - 5
doc/doxygen/Makefile.am

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009, 2011, 2013-2014  Université de Bordeaux
+# Copyright (C) 2009, 2011, 2013-2014, 2017  Université de Bordeaux
 # Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
 # Copyright (C) 2014  INRIA
 #
@@ -29,9 +29,11 @@ DOX_TAG = starpu.tag
 
 txtdir   = $(docdir)/manual
 
+EXTRA_DIST = 
+
 if BUILD_DOC
 all: $(DOX_HTML_DIR) $(DOX_PDF)
-EXTRA_DIST = $(DOX_HTML_DIR) $(DOX_PDF)
+EXTRA_DIST += $(DOX_HTML_DIR) $(DOX_PDF)
 txt_DATA = $(DOX_PDF)
 DOX_HTML_SRCDIR=$(DOX_HTML_DIR)
 install-exec-hook:
@@ -41,7 +43,7 @@ uninstall-hook:
 	rm -rf $(DESTDIR)$(docdir)/manual/html
 else
 if AVAILABLE_DOC
-EXTRA_DIST = $(top_srcdir)/doc/doxygen/html $(top_srcdir)/doc/doxygen/starpu.pdf
+EXTRA_DIST += $(top_srcdir)/doc/doxygen/html $(top_srcdir)/doc/doxygen/starpu.pdf
 txt_DATA = $(top_srcdir)/doc/doxygen/starpu.pdf
 DOX_HTML_SRCDIR=$(top_srcdir)/doc/doxygen/html
 install-exec-hook:
@@ -52,8 +54,6 @@ uninstall-hook:
 endif
 endif
 
-
-if BUILD_DOC
 chapters =	\
 	chapters/000_introduction.doxy		\
 	chapters/101_building.doxy		\
@@ -145,6 +145,40 @@ chapters =	\
 	chapters/api/modularized_scheduler.doxy \
 	chapters/api/clustering_machine.doxy
 
+images = 	\
+	chapters/images/data_trace.eps \
+	chapters/images/data_trace.pdf \
+	chapters/images/data_trace.png \
+	chapters/images/distrib_data.eps \
+	chapters/images/distrib_data.pdf \
+	chapters/images/distrib_data.png \
+	chapters/images/distrib_data_histo.eps \
+	chapters/images/distrib_data_histo.pdf \
+	chapters/images/distrib_data_histo.png \
+	chapters/images/paje_draw_histogram.eps \
+	chapters/images/paje_draw_histogram.pdf \
+	chapters/images/paje_draw_histogram.png \
+	chapters/images/parallel_worker2.eps \
+	chapters/images/parallel_worker2.pdf \
+	chapters/images/parallel_worker2.png \
+	chapters/images/runtime-par.eps \
+	chapters/images/runtime-par.pdf \
+	chapters/images/runtime-par.png \
+	chapters/images/starpu_non_linear_memset_regression_based.eps \
+	chapters/images/starpu_non_linear_memset_regression_based.pdf \
+	chapters/images/starpu_non_linear_memset_regression_based.png \
+	chapters/images/starpu_non_linear_memset_regression_based_2.eps \
+	chapters/images/starpu_non_linear_memset_regression_based_2.pdf \
+	chapters/images/starpu_non_linear_memset_regression_based_2.png \
+	chapters/images/starpu_starpu_slu_lu_model_11.eps \
+	chapters/images/starpu_starpu_slu_lu_model_11.pdf \
+	chapters/images/starpu_starpu_slu_lu_model_11.png \
+	chapters/images/tasks_size_overhead.eps \
+	chapters/images/tasks_size_overhead.pdf \
+	chapters/images/tasks_size_overhead.png \
+	chapters/images/temanejo.png
+
+if BUILD_DOC
 starpu_config.h: $(top_srcdir)/include/starpu_config.h.in
 	@$(SED) 's/#undef \(.*\)/#define \1 1/' $< > $@
 
@@ -283,6 +317,10 @@ CLEANFILES = $(DOX_TAG) starpu_config.h \
     $(DOX_PDF)
 
 endif
+
+EXTRA_DIST += doxygen.cfg refman.tex \
+	      $(chapters) $(images)
+
 # Rule to update documentation on web server. Should only be used locally.
 PUBLISHHOST	?= gforge
 update-web: $(DOX_PDF)

+ 7 - 3
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -197,9 +197,13 @@ structures of StarPU by describing the shape of your machine and/or your
 application at the configure step.
 
 To reduce the memory footprint of the data internal structures of StarPU, one
-can set the \ref enable-maxcpus "--enable-maxcpus", \ref enable-maxcudadev
-"--enable-maxcudadev", \ref enable-maxopencldev "--enable-maxopencldev" and
-\ref enable-maxnodes "--enable-maxnodes" configure parameters to give StarPU
+can set the
+\ref enable-maxcpus "--enable-maxcpus",
+\ref enable-maxnumanodes "--enable-maxnumanodes",
+\ref enable-maxcudadev "--enable-maxcudadev",
+\ref enable-maxopencldev "--enable-maxopencldev" and
+\ref enable-maxnodes "--enable-maxnodes"
+configure parameters to give StarPU
 the architecture of the machine it will run on, thus tuning the size of the
 structures to the machine.
 

+ 23 - 8
doc/doxygen/chapters/310_data_management.doxy

@@ -84,7 +84,7 @@ example on how to do so by using starpu_bcsr_data_register().
  *
  * nzval  = [0, 1, 2, 3] ++ [4, 5, 6, 7] ++ [8, 9, 10, 11]
  * colind = [0, 0, 1]
- * rowptr = [0, 1 ]
+ * rowptr = [0, 1, 3]
  * r = c = 2
  */
 
@@ -92,7 +92,7 @@ example on how to do so by using starpu_bcsr_data_register().
 int R = 2;
 int C = 2;
 
-int NROW = 2;
+int NROWS = 2;
 int NNZ_BLOCKS = 3;    /* out of 4 */
 int NZVAL_SIZE = (R*C*NNZ_BLOCKS);
 
@@ -108,17 +108,18 @@ uint32_t colind[NNZ_BLOCKS] =
 	0, /* block-column index for second block in nzval */
 	1  /* block-column index for third block in nzval */
 };
-uint32_t rowptr[NROW] =
+uint32_t rowptr[NROWS+1] =
 {
 	0, /* block-index in nzval of the first block of the first row. */
-	1  / * block-index in nzval of the first block of the second row. */
+	1, /* block-index in nzval of the first block of the second row. */
+	NNZ_BLOCKS /* number of blocks, to allow easier access to the matrix elements in the kernels */
 };
 
 starpu_data_handle_t bcsr_handle;
 starpu_bcsr_data_register(&bcsr_handle,
 			  STARPU_MAIN_RAM,
 			  NNZ_BLOCKS,
-			  NROW,
+			  NROWS,
 			  (uintptr_t) nzval,
 			  colind,
 			  rowptr,
@@ -244,6 +245,8 @@ back to its home node, and evict it from GPUs when room is needed.
 An existing piece of data can be partitioned in sub parts to be used by different tasks, for instance:
 
 \code{.c}
+#define NX 1048576
+#define PARTS 16
 int vector[NX];
 starpu_data_handle_t handle;
 
@@ -287,7 +290,7 @@ Partitioning can be applied several times, see
 Wherever the whole piece of data is already available, the partitioning will
 be done in-place, i.e. without allocating new buffers but just using pointers
 inside the existing copy. This is particularly important to be aware of when
-using OpenCL, where the kernel parameters are not pointers, but handles. The
+using OpenCL, where the kernel parameters are not pointers, but cl_mem handles. The
 kernel thus needs to be also passed the offset within the OpenCL buffer:
 
 \code{.c}
@@ -315,7 +318,10 @@ __kernel void opencl_kernel(__global int *vector, unsigned offset)
 
 StarPU provides various interfaces and filters for matrices, vectors, etc.,
 but applications can also write their own data interfaces and filters, see
-<c>examples/interface</c> and <c>examples/filters/custom_mf</c> for an example.
+<c>examples/interface</c> and <c>examples/filters/custom_mf</c> for an example,
+and see \ref DefiningANewDataInterface and \ref DefiningANewDataFilter
+for documentation.
+
 
 \section AsynchronousPartitioning Asynchronous Partitioning
 
@@ -431,6 +437,14 @@ starpu_data_invalidate_submit(handle);
 
 And now we can start using vertical slices, etc.
 
+\section DefiningANewDataFilter Defining A New Data Filter
+
+StarPU provides a series of predefined filters in API_Data_Partition, but
+additional filters can be defined by the application. The principle is that the
+filter function just fills the memory location of the i-th subpart of a data.
+Examples are provided in <c>src/datawizard/interfaces/*_filters.c</c>,
+and see \ref starpu_data_filter::filter_func for the details.
+
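
As a purely illustrative aside (not part of this commit's diff), a minimal filter for the vector interface could look as follows. It assumes the starpu_data_filter::filter_func prototype and the struct starpu_vector_interface fields (id, ptr, dev_handle, offset, nx, elemsize) used by the existing vector filters, and only handles a vector size divisible by the number of parts:

\code{.c}
#include <starpu.h>

/* Sketch of a filter splitting a vector into nparts equal chunks
 * (a simplified version of what the predefined vector filters do). */
void my_vector_filter(void *father_interface, void *child_interface,
                      struct starpu_data_filter *f, unsigned id, unsigned nparts)
{
	struct starpu_vector_interface *father = father_interface;
	struct starpu_vector_interface *child = child_interface;
	(void) f;

	uint32_t child_nx = father->nx / nparts;
	size_t offset = id * child_nx * father->elemsize;

	/* Fill in the fields describing the id-th subpart. */
	child->id = father->id;
	child->nx = child_nx;
	child->elemsize = father->elemsize;

	if (father->dev_handle)
	{
		if (father->ptr)
			child->ptr = father->ptr + offset;
		child->dev_handle = father->dev_handle;
		child->offset = father->offset + offset;
	}
}
\endcode

Such a filter would then be plugged into a struct starpu_data_filter and used with starpu_data_partition(), just like the predefined filters.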
 \section DataReduction Data Reduction
 
 In various cases, some piece of data is used to accumulate intermediate
@@ -759,7 +773,8 @@ Different operations need to be defined for a data interface through
 the type starpu_data_interface_ops. We only define here the basic
 operations needed to run simple applications. The source code for the
 different functions can be found in the file
-<c>examples/interface/complex_interface.c</c>.
+<c>examples/interface/complex_interface.c</c>, the details of the hooks to be
+provided are documented \ref starpu_data_interface_ops .
 
 \code{.c}
 static struct starpu_data_interface_ops interface_complex_ops =

+ 12 - 2
doc/doxygen/chapters/320_scheduling.doxy

@@ -23,8 +23,8 @@ This means scheduling policies usually contain at least one queue of tasks to
 store them between the time when they become available, and the time when a
 worker gets to grab them.
 
-By default, StarPU uses the simple greedy scheduler <c>eager</c>. This is
-because it provides correct load balance even if the application codelets do
+By default, StarPU uses the work-stealing scheduler <c>lws</c>. This is
+because it provides correct load balance and locality even if the application codelets do
 not have performance models. Other non-modelling scheduling policies can be
 selected among the list below, thanks to the environment variable \ref
 STARPU_SCHED. For instance <c>export STARPU_SCHED=dmda</c> . Use <c>help</c> to
@@ -180,6 +180,16 @@ task->execute_on_a_specific_worker = 1;
 task->workerid = starpu_worker_get_by_type(STARPU_CUDA_WORKER, 0);
 \endcode
 
+One can also specify a set of workers which are allowed to take the task, as a
+bit array, for instance to allow workers 2 and 42:
+
+\code{.c}
+task->workerids = calloc(2,sizeof(uint32_t));
+task->workerids[2/32] |= (1 << (2%32));
+task->workerids[42/32] |= (1 << (42%32));
+task->workerids_len = 2;
+\endcode
+
 One can also specify the order in which tasks must be executed by setting the
 starpu_task::workerorder field. If this field is set to a non-zero value, it
 provides the per-worker consecutive order in which tasks will be executed,

+ 12 - 0
doc/doxygen/chapters/390_faq.doxy

@@ -195,6 +195,18 @@ security.models.extensions.user_set_cpu_affinity=1
 \endverbatim
 
 
+\section StarPUEatsCPUs StarPU permanently eats 100% of all CPUs
+
+Yes, this is on purpose.
+
+By default, StarPU uses active polling on task queues, so as to minimize wake-up
+latency for better overall performance.
+
+If eating CPU time is a problem (e.g. application running on a desktop),
+pass option \ref enable-blocking-drivers "--enable-blocking-drivers" to
+<c>./configure</c>. This will add some overhead when putting CPU workers to
+sleep or waking them, but avoid eating 100% CPU permanently.
+
 \section PauseResume Interleaving StarPU and non-StarPU code
 
 If your application only partially uses StarPU, and you do not want to

+ 3 - 1
doc/doxygen/chapters/401_out_of_core.doxy

@@ -13,7 +13,7 @@ When using StarPU, one may need to store more data than what the main memory
 disk and to use it.
 
 The principle is that one first registers a disk location, seen by StarPU as
-a <c>void*</c>, which can be for instance a Unix path for the stdio or unistd case,
+a <c>void*</c>, which can be for instance a Unix path for the stdio, unistd or unistd_o_direct case,
 or a database file path for a leveldb case, etc. The disk backend opens this
 place with the plug method.
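
As an illustration (not part of this commit's diff), registering such a disk location from C could look like the sketch below; it assumes the starpu_disk_register() prototype from starpu_disk.h and the starpu_disk_unistd_ops backend:

\code{.c}
#include <starpu.h>

int main(void)
{
	if (starpu_init(NULL) != 0)
		return 1;

	/* Register /tmp as a 200 MiB disk memory node handled by the
	 * unistd backend; the void* parameter is the Unix path. */
	int disk_node = starpu_disk_register(&starpu_disk_unistd_ops,
	                                     (void *) "/tmp",
	                                     (starpu_ssize_t) 200*1024*1024);
	if (disk_node < 0)
		return 1;

	/* ... register data handles, submit tasks ... */

	starpu_shutdown();
	return 0;
}
\endcode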
 
@@ -48,6 +48,8 @@ export STARPU_DISK_SWAP_BACKEND=unistd
 export STARPU_DISK_SWAP_SIZE=200
 \endverbatim
 
+The backend can be set to stdio, unistd, unistd_o_direct, or leveldb.
+
 When the register function is called, StarPU will benchmark the disk. This can
 take some time.
 

+ 136 - 20
doc/doxygen/chapters/410_mpi_support.doxy

@@ -542,6 +542,12 @@ Here we have disabled the kernel function call to skip the actual computation
 time and only keep submission time, and we have asked StarPU to fake running on
 MPI node 2 out of 1024 nodes.
 
+To tune the placement of tasks among MPI nodes, one can use
+::STARPU_EXECUTE_ON_NODE or ::STARPU_EXECUTE_ON_DATA to specify an explicit
+node, or the node of a given data (e.g. one of the parameters), or use
+starpu_mpi_node_selection_register_policy() and ::STARPU_NODE_SELECTION_POLICY
+to provide a dynamic policy.
+
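
For illustration only (not part of the diff), assuming an existing codelet cl and registered handles handle and handle2, these two flags could be used as follows:

\code{.c}
/* Force execution of this task on MPI node 1. */
starpu_mpi_task_insert(MPI_COMM_WORLD, &cl,
                       STARPU_EXECUTE_ON_NODE, 1,
                       STARPU_RW, handle,
                       0);

/* Or execute it on whichever node owns handle2. */
starpu_mpi_task_insert(MPI_COMM_WORLD, &cl,
                       STARPU_EXECUTE_ON_DATA, handle2,
                       STARPU_RW, handle,
                       0);
\endcode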
 A function starpu_mpi_task_build() is also provided with the aim to
 only construct the task structure. All MPI nodes need to call the
 function, only the node which is to execute the task will return a
@@ -563,6 +569,105 @@ starpu_mpi_task_post_build(MPI_COMM_WORLD, &cl,
                            0);
 \endcode
 
+\section MPITemporaryData Temporary Data
+
+To be able to use starpu_mpi_task_insert(), one has to call
+starpu_mpi_data_register(), so that StarPU-MPI can know what it needs to do for
+each data. Parameters of starpu_mpi_data_register() are normally the same on all
+nodes for a given data, so that all nodes agree on which node owns the data, and
+which tag is used to transfer its value.
+
+It can however be useful to register e.g. some temporary data on just one node,
+without having to register a dummy handle on all nodes, while only one node will
+actually need to know about it. In that case, nodes which will not need the data
+can just pass NULL to starpu_mpi_task_insert():
+
+\code{.c}
+starpu_data_handle_t data0 = NULL;
+if (rank == 0) {
+	starpu_variable_data_register(&data0, STARPU_MAIN_RAM, (uintptr_t) &val0, sizeof(val0));
+	starpu_mpi_data_register(data0, 0, rank);
+}
+starpu_mpi_task_insert(MPI_COMM_WORLD, &cl, STARPU_W, data0, 0); /* Executes on node 0 */
+\endcode
+
+Here, nodes whose rank is not 0 will simply not take care of the data, and consider it to be on another node.
+
+This can be mixed in various ways; for instance, here node 1 determines that it does
+not have to care about data0, but knows that it should send the value of its
+data1 to node 0, which owns data and thus will need the value of data1 to execute the task:
+
+\code{.c}
+starpu_data_handle_t data0 = NULL, data1, data;
+if (rank == 0) {
+	starpu_variable_data_register(&data0, STARPU_MAIN_RAM, (uintptr_t) &val0, sizeof(val0));
+	starpu_mpi_data_register(data0, -1, rank);
+	starpu_variable_data_register(&data1, -1, 0, sizeof(val1));
+	starpu_variable_data_register(&data, STARPU_MAIN_RAM, (uintptr_t) &val, sizeof(val));
+} else if (rank == 1) {
+	starpu_variable_data_register(&data1, STARPU_MAIN_RAM, (uintptr_t) &val1, sizeof(val1));
+	starpu_variable_data_register(&data, -1, 0, sizeof(val));
+}
+starpu_mpi_data_register(data, 42, 0);
+starpu_mpi_data_register(data1, 43, 1);
+starpu_mpi_task_insert(MPI_COMM_WORLD, &cl, STARPU_W, data, STARPU_R, data0, STARPU_R, data1, 0); /* Executes on node 0 */
+\endcode
+
+\section MPIPerNodeData Per-node Data
+
+Beyond temporary data on just one node, one may want per-node data,
+to e.g. replicate some computation because that is less expensive than
+communicating the value over MPI:
+
+\code{.c}
+starpu_data_handle_t pernode, data0, data1;
+starpu_variable_data_register(&pernode, -1, 0, sizeof(val));
+starpu_mpi_data_register(pernode, -1, STARPU_MPI_PER_NODE);
+
+/* Normal data: one on node0, one on node1 */
+if (rank == 0) {
+	starpu_variable_data_register(&data0, STARPU_MAIN_RAM, (uintptr_t) &val0, sizeof(val0));
+	starpu_variable_data_register(&data1, -1, 0, sizeof(val1));
+} else if (rank == 1) {
+	starpu_variable_data_register(&data0, -1, 0, sizeof(val1));
+	starpu_variable_data_register(&data1, STARPU_MAIN_RAM, (uintptr_t) &val1, sizeof(val1));
+}
+starpu_mpi_data_register(data0, 42, 0);
+starpu_mpi_data_register(data1, 43, 1);
+
+starpu_mpi_task_insert(MPI_COMM_WORLD, &cl, STARPU_W, pernode, 0); /* Will be replicated on all nodes */
+
+starpu_mpi_task_insert(MPI_COMM_WORLD, &cl2, STARPU_RW, data0, STARPU_R, pernode); /* Will execute on node 0, using its own pernode*/
+starpu_mpi_task_insert(MPI_COMM_WORLD, &cl2, STARPU_RW, data1, STARPU_R, pernode); /* Will execute on node 1, using its own pernode*/
+\endcode
+
+One can turn a normal data into pernode data, by first broadcasting it to all nodes:
+
+\code{.c}
+starpu_data_handle_t data;
+starpu_variable_data_register(&data, -1, 0, sizeof(val));
+starpu_mpi_data_register(data, 42, 0);
+
+/* Compute some value */
+starpu_mpi_task_insert(MPI_COMM_WORLD, &cl, STARPU_W, data, 0); /* Node 0 computes it */
+
+/* Get it on all nodes */
+starpu_mpi_get_data_on_all_nodes_detached(MPI_COMM_WORLD, data);
+/* And turn it per-node */
+starpu_mpi_data_set_rank(data, STARPU_MPI_PER_NODE);
+\endcode
+
+The data can then be used just like pernode above.
+
+\section MPIPriorities Priorities
+
+All send functions have a <c>_prio</c> variant which takes an additional
+priority parameter, which allows StarPU-MPI to change the order of MPI
+requests before submitting them to MPI. The default priority is 0.
+
+When using the starpu_mpi_task_insert helper, STARPU_PRIORITY defines both the
+task priority and the priority of the MPI requests.
+
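As an illustrative sketch (not part of the diff), assuming an existing codelet cl and a registered handle, a priority could be passed as follows:

\code{.c}
/* The task, and the MPI requests generated for its data, get priority 2
 * (the default priority is 0). */
starpu_mpi_task_insert(MPI_COMM_WORLD, &cl,
                       STARPU_PRIORITY, 2,
                       STARPU_RW, handle,
                       0);
\endcode
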
 \section MPICache MPI cache support
 
 StarPU-MPI automatically optimizes duplicate data transmissions: if an MPI
@@ -769,27 +874,38 @@ data transfers and supports data matrices which do not fit in memory (out-of-cor
 
 \section MPIMasterSlave MPI Master Slave Support
 
-StarPU includes an other way to execute the application across many nodes. The Master
-Slave support permits to use remote cores without thinking about data distribution. This
-support can be activated with the \ref enable-mpi-master-slave "--enable-mpi-master-slave". However, you should not activate
-both MPI support and MPI Master-Slave support.
+StarPU provides another way to execute applications across many
+nodes. The Master Slave support makes it possible to use remote cores without
+thinking about data distribution. This support can be activated with
+the configure option \ref enable-mpi-master-slave
+"--enable-mpi-master-slave". However, you should not activate both MPI
+support and MPI Master-Slave support.
 
 If a codelet contains a kernel for CPU devices, it is automatically eligible to be executed
-on a MPI Slave device. However, you can decide to execute the codelet on a MPI Slave by filling
-the \ref starpu_codelet::mpi_ms_funcs variable. The functions have to be globally-visible (i.e. not static ) for
-StarPU to be able to look them up, and <c>-rdynamic</c> must be passed to gcc (or <c>-export-dynamic</c> to ld)
-so that symbols of the main program are visible.
-
-By default, one core is dedicated on the master to manage the entire set of slaves. If MPI
-has a good multiple threads support, you can use \ref with-mpi-master-slave-multiple-thread "--with-mpi-master-slave-multiple-thread"  to
-dedicate one core per slave.
-
-If you want to chose the number of cores on the slave device, use the \ref STARPU_NMPIMSTHREADS "STARPU_NMPIMSTHREADS=\<number\>"
-with <c>\<number\></c> is the number of cores wanted. The default value is all the slave's cores. To select
-the number of slaves nodes, change the <c>-n</c> parameter when executing the application with mpirun
-or mpiexec.
-
-The node chosen by default is the with the MPI rank 0. To modify this, use the environment variable
-\ref STARPU_MPI_MASTER_NODE "STARPU_MPI_MASTER_NODE=\<number\>" with <c>\<number\></c> is the MPI rank wanted.
+on an MPI Slave device. Moreover, you can force the execution on an MPI Slave by setting
+the field \ref starpu_codelet::mpi_ms_funcs. Functions have to be
+globally-visible (i.e. not static) for StarPU to be able to look them
+up, and <c>-rdynamic</c> must be passed to gcc (or
+<c>-export-dynamic</c> to ld) so that symbols of the main program are
+visible.
+
+By default, one core is dedicated on the master node to manage the
+entire set of slaves. If the implementation of MPI you are using has
+good support for multiple threads, you can use the configure option
+\ref with-mpi-master-slave-multiple-thread "--with-mpi-master-slave-multiple-thread"
+to dedicate one core per slave.
+
+Choosing the number of cores on each slave device is done by setting
+the environment variable \ref STARPU_NMPIMSTHREADS "STARPU_NMPIMSTHREADS=\<number\>"
+with <c>\<number\></c> being the requested number of cores. By default
+all the slave's cores are used.
+
+Setting the number of slaves nodes is done by changing the <c>-n</c>
+parameter when executing the application with mpirun or mpiexec.
+
+The master node is by default the node with the MPI rank equal to 0.
+To select another node, use the environment variable \ref
+STARPU_MPI_MASTER_NODE "STARPU_MPI_MASTER_NODE=\<number\>" with
+<c>\<number\></c> being the MPI rank of the requested node.
 
 */

+ 48 - 2
doc/doxygen/chapters/501_environment_variables.doxy

@@ -205,6 +205,21 @@ set.
 
 </dd>
 
+<dt>STARPU_MAIN_THREAD_CPUID</dt>
+<dd>
+\anchor STARPU_MAIN_THREAD_CPUID
+\addindex __env__STARPU_MAIN_THREAD_CPUID
+When defined, this makes StarPU bind the thread that calls starpu_initialize() to
+the given CPU ID.
+</dd>
+
+<dt>STARPU_MPI_THREAD_CPUID</dt>
+<dd>
+\anchor STARPU_MPI_THREAD_CPUID
+\addindex __env__STARPU_MPI_THREAD_CPUID
+When defined, this makes StarPU bind its MPI thread to the given CPU ID.
+</dd>
+
 <dt>STARPU_WORKERS_CUDAID</dt>
 <dd>
 \anchor STARPU_WORKERS_CUDAID
@@ -842,7 +857,15 @@ that have a limited amount of memory.
 \addindex __env__STARPU_LIMIT_CPU_MEM
 This variable specifies the maximum number of megabytes that should be
 available to the application in the main CPU memory. Setting it enables allocation
-cache in main memory
+cache in main memory. Setting it to zero lets StarPU overflow memory.
+</dd>
+
+<dt>STARPU_LIMIT_CPU_NUMA_devid_MEM</dt>
+<dd>
+\anchor STARPU_LIMIT_CPU_NUMA_devid_MEM
+\addindex __env__STARPU_LIMIT_CPU_NUMA_devid_MEM
+This variable specifies the maximum number of megabytes that should be
+available to the application on the NUMA node with the OS identifier <c>devid</c>.
 </dd>
 
 <dt>STARPU_MINIMUM_AVAILABLE_MEM</dt>
@@ -905,7 +928,8 @@ full.
 This specifies the backend to be used by StarPU to push data when the main
 memory is getting full. The default is unistd (i.e. using read/write functions),
 other values are stdio (i.e. using fread/fwrite), unistd_o_direct (i.e. using
-read/write with O_DIRECT), and leveldb (i.e. using a leveldb database).
+read/write with O_DIRECT), leveldb (i.e. using a leveldb database), and hdf5
+(i.e. using the HDF5 library).
 </dd>
 
 <dt>STARPU_DISK_SWAP_SIZE</dt>
@@ -1024,6 +1048,15 @@ dog is reached, thus allowing to catch the situation in gdb, etc
 (see \ref DetectionStuckConditions)
 </dd>
 
+<dt>STARPU_WATCHDOG_DELAY</dt>
+<dd>
+\anchor STARPU_WATCHDOG_DELAY
+\addindex __env__STARPU_WATCHDOG_DELAY
+This delays the activation of the watchdog by the given time (in µs). This can
+be convenient for letting the application initialize data etc. before starting
+to look for idle time.
+</dd>
+
 <dt>STARPU_TASK_BREAK_ON_PUSH</dt>
 <dd>
 \anchor STARPU_TASK_BREAK_ON_PUSH
@@ -1109,6 +1142,19 @@ implements an advanced but centralized management of concurrent data
 accesses (see \ref ConcurrentDataAccess).
 </dd>
 
+<dt>STARPU_USE_NUMA</dt>
+<dd>
+\anchor STARPU_USE_NUMA 
+\addindex __env__STARPU_USE_NUMA
+When defined, NUMA nodes are taken into account by StarPU. Otherwise, memory
+is considered as only one node. This is experimental for now.
+
+When enabled, STARPU_MAIN_MEMORY is a pointer to the NUMA node associated with the
+first CPU worker if it exists, or otherwise to the NUMA node associated with the first GPU discovered.
+If StarPU doesn't find any NUMA node after these steps, STARPU_MAIN_MEMORY is the first NUMA node
+discovered by StarPU.
+</dd>
+
 </dl>
 
 \section ConfiguringTheHypervisor Configuring The Hypervisor

+ 41 - 0
doc/doxygen/chapters/510_configure_options.doxy

@@ -122,6 +122,18 @@ used by nvcc.
 
 <dl>
 
+<dt>--enable-blocking-drivers</dt>
+<dd>
+\anchor enable-blocking-drivers
+\addindex __configure__--enable-blocking-drivers
+By default, StarPU keeps CPU workers awake permanently, for better
+reactivity. This option makes StarPU put CPU workers to real sleep when there
+are not enough tasks to compute.
+</dd>
+
 <dt>--enable-maxcpus=<c>count</c></dt>
 <dd>
 \anchor enable-maxcpus
@@ -130,6 +142,14 @@ Use at most <c>count</c> CPU cores.  This information is then
 available as the macro ::STARPU_MAXCPUS.
 </dd>
 
+<dt>--enable-maxnumanodes=<c>count</c></dt>
+<dd>
+\anchor enable-maxnumanodes
+\addindex __configure__--enable-maxnumanodes
+Use at most <c>count</c> NUMA nodes.  This information is then
+available as the macro ::STARPU_MAXNUMANODES.
+</dd>
+
 <dt>--disable-cpu</dt>
 <dd>
 \anchor disable-cpu
@@ -486,6 +506,27 @@ Specify the blas library to be used by some of the examples. Librairies availabl
 Enable linking with LevelDB if available
 </dd>
 
+<dt>--disable-hdf5</dt>
+<dd>
+\anchor disable-hdf5
+\addindex __configure__--disable-hdf5
+Disable building HDF5 support.
+</dd>
+
+<dt>--with-hdf5-include-dir=<c>path</c></dt>
+<dd>
+\anchor with-hdf5-include-dir
+\addindex __configure__--with-hdf5-include-dir
+Specify the directory where the header file hdf5.h is stored.
+</dd>
+
+<dt>--with-hdf5-lib-dir=<c>path</c></dt>
+<dd>
+\anchor with-hdf5-lib-dir
+\addindex __configure__--with-hdf5-lib-dir
+Specify the directory where the HDF5 library is stored.
+</dd>
+
 <dt>--disable-starpufft</dt>
 <dd>
 \anchor disable-starpufft

+ 11 - 0
doc/doxygen/chapters/api/codelet_and_tasks.doxy

@@ -646,6 +646,17 @@ the configuration of a task allocated with starpu_task_create().
     details. This field is ignored if the field
     starpu_task::execute_on_a_specific_worker is set to 0.
 
+\var unsigned starpu_task::workerids
+    Optional field. If the field
+    starpu_task::workerids_len is different from 0, this field indicates an
+    array of bits (stored as uint32_t values) which indicate the set of workers
+    which are allowed to execute the task. starpu_task::workerid takes
+    precedence over this.
+
+\var unsigned starpu_task::workerids_len
+    Optional field. This provides the number of uint32_t values in the
+    starpu_task::workerids array.
+
 \var starpu_task_bundle_t starpu_task::bundle
     Optional field. The bundle that includes this task. If no bundle
     is used, this should be <c>NULL</c>.

+ 85 - 12
doc/doxygen/chapters/api/data_interfaces.doxy

@@ -14,14 +14,23 @@ Per-interface data transfer methods.
 \var void (*starpu_data_interface_ops::register_data_handle)(starpu_data_handle_t handle, unsigned home_node, void *data_interface)
     Register an existing interface into a data handle.
 
+    This iterates over all memory nodes to initialize all fields of the data
+    interface on each of them. Since data is not allocated yet except on the
+    home node, pointers should be left as NULL except on the \p home_node, for
+    which the pointers should be copied from the given \p data_interface, which
+    was filled with the application's pointers.
+
 \var starpu_ssize_t (*starpu_data_interface_ops::allocate_data_on_node)(void *data_interface, unsigned node)
-    Allocate data for the interface on a given node.
+    Allocate data for the interface on a given node. This should use
+    starpu_malloc_on_node to perform the allocation(s), and fill the pointers
+    in the data interface. It should return the size of the allocated memory, or
+    -ENOMEM if memory could not be allocated.
 
 \var void (*starpu_data_interface_ops::free_data_on_node)(void *data_interface, unsigned node)
     Free data of the interface on a given node.
 
 \var const struct starpu_data_copy_methods *starpu_data_interface_ops::copy_methods
-    ram/cuda/opencl synchronous and asynchronous transfer methods.
+    This provides a series of methods for performing ram/cuda/opencl synchronous and asynchronous transfers.
 
 \var void *(*starpu_data_interface_ops::handle_to_pointer)(starpu_data_handle_t handle, unsigned node)
     Return the current pointer (if any) for the handle on the given node.
@@ -30,20 +39,25 @@ Per-interface data transfer methods.
     Return an estimation of the size of data, for performance models.
 
 \var uint32_t (*starpu_data_interface_ops::footprint)(starpu_data_handle_t handle)
-    Return a 32bit footprint which characterizes the data size.
+    Return a 32bit footprint which characterizes the data size and layout (nx, ny, ld, elemsize, etc.)
 
 \var int (*starpu_data_interface_ops::compare)(void *data_interface_a, void *data_interface_b)
-    Compare the data size of two interfaces.
+    Compare the data size and layout of two interfaces (nx, ny, ld, elemsize,
+    etc.). It should return 1 if the size and layout of the two interfaces match, and 0
+    otherwise.
 
 \var void (*starpu_data_interface_ops::display)(starpu_data_handle_t handle, FILE *f)
     Dump the sizes of a handle to a file.
 
 \var starpu_ssize_t (*starpu_data_interface_ops::describe)(void *data_interface, char *buf, size_t size)
-    Describe the data into a string.
+    Describe the data into a string in a brief way, such as one letter to describe the type of data, and the data dimensions.
 
 \var enum starpu_data_interface_id starpu_data_interface_ops::interfaceid
     An identifier that is unique to each interface.
 
+\var char *starpu_data_interface_ops::name
+    Name of the interface
+
 \var size_t starpu_data_interface_ops::interface_size
     The size of the interface data descriptor.
 
@@ -407,11 +421,70 @@ Compressed Sparse Row Representation) sparse matrix interface.
 Register the sparse matrix made of \p nnz non-zero blocks of elements of
 size \p elemsize stored in \p nzval and initializes \p handle to represent it.
 Blocks have size \p r * \p c. \p nrow is the number of rows (in terms of
-blocks), \p colind[i] is the block-column index for block i in \p nzval,
-\p rowptr[i] is the block-index (in \p nzval) of the first block of row i.
+blocks), \p colind is an array of nnz elements, colind[i] is the block-column index for block i in \p nzval,
+\p rowptr is an array of nrow+1 elements, rowptr[i] is the block-index (in \p nzval) of the first block of row i. By convention, rowptr[nrow] is the number of blocks, which allows easier access to the matrix elements from the kernels.
 \p firstentry is the index of the first entry of the given arrays
 (usually 0 or 1).
 
+Here is an example of how to use the function.
+\code{.c}
+/*
+ * We use the following matrix:
+ *
+ *   +----------------+
+ *   |  0   1   0   0 |
+ *   |  2   3   0   0 |
+ *   |  4   5   8   9 |
+ *   |  6   7  10  11 |
+ *   +----------------+
+ *
+ * nzval  = [0, 1, 2, 3] ++ [4, 5, 6, 7] ++ [8, 9, 10, 11]
+ * colind = [0, 0, 1]
+ * rowptr = [0, 1, 3]
+ * r = c = 2
+ */
+
+/* Size of the blocks */
+int R = 2;
+int C = 2;
+
+int NROWS = 2;
+int NNZ_BLOCKS = 3;    /* out of 4 */
+int NZVAL_SIZE = (R*C*NNZ_BLOCKS);
+
+int nzval[NZVAL_SIZE]  =
+{
+	0, 1, 2, 3,    /* First block  */
+	4, 5, 6, 7,    /* Second block */
+	8, 9, 10, 11   /* Third block  */
+};
+uint32_t colind[NNZ_BLOCKS] =
+{
+	0, /* block-column index for first block in nzval */
+	0, /* block-column index for second block in nzval */
+	1  /* block-column index for third block in nzval */
+};
+uint32_t rowptr[NROWS+1] =
+{
+	0, /* block-index in nzval of the first block of the first row. */
+	1, /* block-index in nzval of the first block of the second row. */
+	NNZ_BLOCKS /* number of blocks, to allow easier access to the matrix elements in the kernels */
+};
+
+starpu_data_handle_t bcsr_handle;
+starpu_bcsr_data_register(&bcsr_handle,
+			  STARPU_MAIN_RAM,
+			  NNZ_BLOCKS,
+			  NROWS,
+			  (uintptr_t) nzval,
+			  colind,
+			  rowptr,
+			  0, /* firstentry */
+			  R,
+			  C,
+			  sizeof(nzval[0]));
+\endcode
+
 \fn void starpu_csr_data_register(starpu_data_handle_t *handle, int home_node, uint32_t nnz, uint32_t nrow, uintptr_t nzval, uint32_t *colind, uint32_t *rowptr, uint32_t firstentry, size_t elemsize)
 \ingroup API_Data_Interfaces
 This variant of starpu_data_register() uses the CSR (Compressed
@@ -793,16 +866,16 @@ row representation)
 \var uintptr_t starpu_bcsr_interface::nzval
     non-zero values
 \var uint32_t *starpu_bcsr_interface::colind
-    position of non-zero entried on the row
+    array of nnz elements; colind[i] is the block-column index for block i in nzval
 \var uint32_t *starpu_bcsr_interface::rowptr
-    index (in nzval) of the first entry of the row
+    array of nrow+1 elements; rowptr[i] is the block-index (in nzval) of the first block of row i. By convention, rowptr[nrow] is the number of blocks, which allows easier access to the matrix elements in the kernels.
 \var starpu_bcsr_interface::firstentry
     k for k-based indexing (0 or 1 usually). Also useful when partitioning the matrix.
 \var uint32_t starpu_bcsr_interface::r
-    size of the blocks
+    height of the blocks
 \var uint32_t starpu_bcsr_interface::c
-    size of the blocks
-\var size_t starpu_bcsr_interface::elemsize;
+    width of the blocks
+\var size_t starpu_bcsr_interface::elemsize
     size of the elements of the matrix
 
 \fn uint32_t starpu_bcsr_get_nnz(starpu_data_handle_t handle)

+ 5 - 1
doc/doxygen/chapters/api/data_management.doxy

@@ -2,7 +2,7 @@
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
- * Copyright (C) 2011, 2012 INRIA
+ * Copyright (C) 2011, 2012, 2017  INRIA
  * See the file version.doxy for copying conditions.
  */
 
@@ -104,6 +104,10 @@ data to StarPU, the specified memory node indicates where the piece of
 data initially resides (we also call this memory node the home node of
 a piece of data).
 
+In the case of NUMA systems, the functions starpu_memory_nodes_numa_devid_to_id()
+and starpu_memory_nodes_numa_id_to_devid() can be used to convert between NUMA node
+numbers as seen by the Operating System and NUMA node numbers as seen by StarPU.
+
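As a hedged illustration of these two conversion functions (the direction of each call follows the descriptions given in the Workers section later in this page, and should be double-checked against the StarPU version at hand):

\code{.c}
/* Sketch only: map OS NUMA node 0 to its StarPU memory node and back,
 * following the descriptions documented for the two functions. */
int starpu_node = starpu_memory_nodes_numa_id_to_devid(0 /* OS NUMA node 0 */);
if (starpu_node >= 0)
{
	int os_node = starpu_memory_nodes_numa_devid_to_id((unsigned) starpu_node);
	/* os_node should be 0 again */
}
\endcode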
 \fn void starpu_data_register(starpu_data_handle_t *handleptr, int home_node, void *data_interface, struct starpu_data_interface_ops *ops)
 \ingroup API_Data_Management
 Register a piece of data into the handle located at the

+ 7 - 0
doc/doxygen/chapters/api/data_out_of_core.doxy

@@ -132,4 +132,11 @@ This set uses the leveldb created by Google <br />
 More information at https://code.google.com/p/leveldb/ <br />
 It doesn't support asynchronous transfers.
 
+\var starpu_disk_hdf5_ops
+\ingroup API_Out_Of_Core
+This set uses the HDF5 library.<br />
+<strong>It does not support opening the same HDF5 file from several processes. </strong> <br />
+Only one process may write to the HDF5 file. <br />
+<strong>If the HDF5 library is not compiled with its thread-safe option, you cannot open more than one HDF5 file at the same time. </strong>
+
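A hedged sketch of registering a disk node with this driver, modeled on the starpu_disk_register() calls used for the other drivers; interpreting the base argument as the path of the HDF5 file is an assumption to be checked:

\code{.c}
/* Sketch only: register a 16MB HDF5-backed disk node; the meaning of the
 * base argument for this driver is an assumption. */
int dd = starpu_disk_register(&starpu_disk_hdf5_ops,
                              (void *) "/tmp/starpu_disk.h5",
                              1024*1024*16);
if (dd < 0)
	fprintf(stderr, "could not register the HDF5 disk node\n");
\endcode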
 */

+ 28 - 5
doc/doxygen/chapters/api/data_partition.doxy

@@ -12,10 +12,24 @@
 The filter structure describes a data partitioning operation, to be
 given to the starpu_data_partition() function.
 \ingroup API_Data_Partition
-\var void (*starpu_data_filter::filter_func)(void *father_interface, void *child_interface, struct starpu_data_filter *, unsigned id, unsigned nparts)
+\var void (*starpu_data_filter::filter_func)(void *father_interface, void *child_interface, struct starpu_data_filter *filter, unsigned i, unsigned nparts)
     Fill the \p child_interface structure with interface information
-    for the \p id -th child of the parent \p father_interface (among
-    \p nparts).
+    for the \p i -th child of the parent \p father_interface (among
+    \p nparts). The \p filter structure is provided, allowing the function to inspect the
+    starpu_data_filter::filter_arg and starpu_data_filter::filter_arg_ptr
+    parameters.
+
+    The details of what needs to be filled in \p child_interface vary according
+    to the data interface, but generally speaking (a sketch for the vector case is given after this list):
+    <ul>
+    <li> <c>id</c> is usually just copied over from the father, when the sub data has the same structure as the father, e.g. a subvector is a vector, a submatrix is a matrix, etc. This is however not the case, for instance, when dividing a BCSR matrix into its dense blocks, which are then matrices. </li>
+    <li> <c>nx</c>, <c>ny</c> and the like are usually divided by the number of subdata, depending on how the subdivision is done (e.g. nx division vs ny division for vertical matrix division vs horizontal matrix division). </li>
+    <li> <c>ld</c> for matrix interfaces are usually just copied over: the leading dimension (ld) usually does not change. </li>
+    <li> <c>elemsize</c> is usually just copied over. </li>
+    <li> <c>ptr</c>, the pointer to the data, has to be computed according to \p i and the father's <c>ptr</c>, so as to point to the start of the sub data. This should however be done only if the father has <c>ptr</c> different from NULL: in the OpenCL case notably, the <c>dev_handle</c> and <c>offset</c> fields are used instead. </li>
+    <li> <c>dev_handle</c> should be just copied over from the parent. </li>
+    <li> <c>offset</c> has to be computed according to \p i and the father's <c>offset</c>, so as to provide the offset of the start of the sub data. This is notably used for the OpenCL case. </li>
+    </ul>
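As announced above, here is a minimal sketch of such a filter function for the standard vector interface, dividing the vector into nparts contiguous chunks. It only illustrates the points of the list; the field names are those of struct starpu_vector_interface, and in real code the built-in starpu_vector_filter_block() should be preferred.

\code{.c}
void my_vector_block_filter(void *father_interface, void *child_interface,
                            struct starpu_data_filter *f, unsigned i, unsigned nparts)
{
	struct starpu_vector_interface *father = (struct starpu_vector_interface *) father_interface;
	struct starpu_vector_interface *child = (struct starpu_vector_interface *) child_interface;

	/* Divide nx among the children, the last chunk possibly being smaller */
	uint32_t chunk = (father->nx + nparts - 1) / nparts;
	uint32_t first = i * chunk;
	uint32_t nx = (first + chunk > father->nx) ? father->nx - first : chunk;
	size_t shift = (size_t) first * father->elemsize;

	child->id = father->id;                 /* a subvector is still a vector */
	child->nx = nx;                         /* divided size */
	child->elemsize = father->elemsize;     /* copied over */
	child->dev_handle = father->dev_handle; /* copied over */
	child->offset = father->offset + shift; /* shifted, used e.g. in the OpenCL case */
	if (father->ptr)
		child->ptr = father->ptr + shift; /* only when the father has a ptr */
}
\endcode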
 \var unsigned starpu_data_filter::nchildren
     Number of parts to partition the data into.
 \var unsigned (*starpu_data_filter::get_nchildren)(struct starpu_data_filter *, starpu_data_handle_t initial_handle)
@@ -27,9 +41,9 @@ given to the starpu_data_partition() function.
     this function returns which interface is used by child number \p
     id.
 \var unsigned starpu_data_filter::filter_arg
-    Allow to define an additional parameter for the filter function.
+    Additional parameter for the filter function
 \var void *starpu_data_filter::filter_arg_ptr
-    Allow to define an additional pointer parameter for the filter
+    Additional pointer parameter for the filter
     function, such as the sizes of the different parts.
 
 @name Basic API
@@ -220,6 +234,15 @@ of \p f must be the shadow size casted into \c void*.
 enforced for the shadowed parts. An usage example is available in
 examples/filters/shadow.c
 
+\fn void starpu_vector_filter_list_long(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
+\ingroup API_Data_Partition
+Return in \p child_interface the \p id th element of the vector
+represented by \p father_interface once partitioned into \p nparts chunks
+according to the <c>filter_arg_ptr</c> field of \p f. The
+<c>filter_arg_ptr</c> field must point to an array of \p nparts long
+elements, each of which specifies the number of elements in each chunk
+of the partition.
+
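For instance, a vector of 1000 elements could be split into three uneven chunks as follows (a hedged sketch; vector_handle is assumed to be an already registered vector handle):

\code{.c}
long chunks[3] = { 200, 300, 500 };

struct starpu_data_filter f =
{
	.filter_func = starpu_vector_filter_list_long,
	.nchildren = 3,
	.filter_arg_ptr = chunks
};

starpu_data_partition(vector_handle, &f);
\endcode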
 \fn void starpu_vector_filter_list(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
 Return in \p child_interface the \p id th element of the vector

+ 4 - 0
doc/doxygen/chapters/api/initialization.doxy

@@ -257,6 +257,10 @@ field starpu_conf::calibrate of \p conf. Upon successful
 completion, this function returns 0. Otherwise, <c>-EINVAL</c> indicates that
 the argument was <c>NULL</c>.
 
+\fn int starpu_is_initialized(void)
+\ingroup API_Initialization_and_Termination
+Return 1 if StarPU is already initialized.
+
 \fn void starpu_shutdown(void)
 \ingroup API_Initialization_and_Termination
 This is StarPU termination method. It must be called at the end of the

+ 5 - 2
doc/doxygen/chapters/api/modularized_scheduler.doxy

@@ -81,14 +81,17 @@ like <c>component->push_task(component,task)</c>
 \var int (*starpu_sched_component::can_push)(struct starpu_sched_component *component)
      This function is called by a component which implements a queue,
      allowing it to signify to its parents that an empty slot is
-     available in its queue. The basic implementation of this function
+     available in its queue. This should return 1 if some tasks could be pushed.
+     The basic implementation of this function
      is a recursive call to its parents; the user has to provide their own
      function to catch those calls.
-\var void (*starpu_sched_component::can_pull)(struct starpu_sched_component *component)
+\var int (*starpu_sched_component::can_pull)(struct starpu_sched_component *component)
      This function allows a component to wake up a worker. It is
      currently called by a component which implements a queue, to
      signify to its children that a task has been pushed in its local
      queue, and is available to be popped by a worker, for example.
+     This should return 1 if some container or worker could (or will) pull
+     some tasks.
      The basic implementation of this function is a recursive call to
      its children, until at least one worker has been woken up.
 

+ 41 - 0
doc/doxygen/chapters/api/mpi.doxy

@@ -82,6 +82,10 @@ Perform a standard-mode, blocking send of \p data_handle to the node
 \p dest using the message tag \p mpi_tag within the communicator \p
 comm.
 
+\fn int starpu_mpi_send_prio(starpu_data_handle_t data_handle, int dest, int mpi_tag, int prio, MPI_Comm comm)
+\ingroup API_MPI_Support
+Similar to starpu_mpi_send, but takes a priority \p prio.
+
 \fn int starpu_mpi_recv(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, MPI_Status *status)
 \ingroup API_MPI_Support
 Perform a standard-mode, blocking receive in \p data_handle from the
@@ -95,6 +99,10 @@ Post a standard-mode, non blocking send of \p data_handle to the node
 comm. After the call, the pointer to the request \p req can be used to
 test or to wait for the completion of the communication.
 
+\fn int starpu_mpi_isend_prio(starpu_data_handle_t data_handle, starpu_mpi_req *req, int dest, int mpi_tag, int prio, MPI_Comm comm)
+\ingroup API_MPI_Support
+Similar to starpu_mpi_isend, but takes a priority \p prio.
+
 \fn int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *req, int source, int mpi_tag, MPI_Comm comm)
 \ingroup API_MPI_Support
 Post a nonblocking receive in \p data_handle from the node \p source
@@ -113,6 +121,10 @@ communication completes, its resources are automatically released back
 to the system, there is no need to test or to wait for the completion
 of the request.
 
+\fn int starpu_mpi_isend_detached_prio(starpu_data_handle_t data_handle, int dest, int mpi_tag, int prio, MPI_Comm comm, void (*callback)(void *), void *arg)
+\ingroup API_MPI_Support
+Similar to starpu_mpi_isend_detached, but takes a priority \p prio.
+
 \fn int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
 \ingroup API_MPI_Support
 Post a nonblocking receive in \p data_handle from the node \p source
@@ -146,6 +158,10 @@ Perform a synchronous-mode, non-blocking send of \p data_handle to the node
 \p dest using the message tag \p mpi_tag within the communicator \p
 comm.
 
+\fn int starpu_mpi_issend_prio(starpu_data_handle_t data_handle, starpu_mpi_req *req, int dest, int mpi_tag, int prio, MPI_Comm comm)
+\ingroup API_MPI_Support
+Similar to starpu_mpi_issend, but takes a priority \p prio.
+
 \fn int starpu_mpi_issend_detached(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
 \ingroup API_MPI_Support
 Perform a synchronous-mode, non-blocking send of \p data_handle to the node
@@ -182,6 +198,10 @@ Post a standard-mode, non blocking send of \p data_handle to the node
 \p dest using the message tag \p mpi_tag within the communicator \p
 comm. On completion, \p tag is unlocked.
 
+\fn int starpu_mpi_isend_detached_unlock_tag_prio(starpu_data_handle_t data_handle, int dest, int mpi_tag, int prio, MPI_Comm comm, starpu_tag_t tag)
+\ingroup API_MPI_Support
+Similar to starpu_mpi_isend_detached_unlock_tag, but takes a priority \p prio.
+
 \fn int starpu_mpi_irecv_detached_unlock_tag(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, starpu_tag_t tag)
 \ingroup API_MPI_Support
 Post a nonblocking receive in \p data_handle from the node \p source
@@ -196,6 +216,10 @@ array \p dest using the n-th message tag of the array \p mpi_tag
 within the n-th communicator of the array \p comm. On completion of
 the all the requests, \p tag is unlocked.
 
+\fn int starpu_mpi_isend_array_detached_unlock_tag_prio(unsigned array_size, starpu_data_handle_t *data_handle, int *dest, int *mpi_tag, int *prio, MPI_Comm *comm, starpu_tag_t tag)
+\ingroup API_MPI_Support
+Similar to starpu_mpi_isend_array_detached_unlock_tag(), but takes an array of priorities \p prio.
+
 \fn int starpu_mpi_irecv_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *source, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag)
 \ingroup API_MPI_Support
 Post \p array_size nonblocking receive. Each post receives in the n-th
@@ -315,6 +339,14 @@ Return the tag of the given data.
 Return the tag of the given data.
 Symbol kept for backward compatibility. Calling function starpu_mpi_data_get_tag()
 
+\def STARPU_MPI_PER_NODE
+\ingroup API_MPI_Support
+Can be used as rank when calling starpu_mpi_data_register() and the like, to
+specify that the data is per-node: each node will have its own value. Tasks
+writing to such data will be replicated on all nodes (and all parameters then
+have to be per-node). Tasks not writing to such data will just take the
+node-local value without any MPI communication.
+
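A hedged sketch of its use (the handle and the tag value below are assumptions; the handle is assumed to be already registered with StarPU):

\code{.c}
/* Register a handle whose value is kept per node: tasks that only read it
 * use the node-local value, without any MPI communication. */
starpu_mpi_data_register(handle, 42 /* tag */, STARPU_MPI_PER_NODE);
\endcode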
 \fn void starpu_mpi_data_migrate(MPI_Comm comm, starpu_data_handle_t handle, int new_rank)
 \ingroup API_MPI_Support
 Migrate the data onto the \p new_rank MPI node. This means both transferring
@@ -416,6 +448,11 @@ owner if needed. At least the target node and the owner have to call
 the function. On reception, the \p callback function is called with
 the argument \p arg.
 
+\fn void starpu_mpi_get_data_on_all_nodes_detached(MPI_Comm comm, starpu_data_handle_t data_handle)
+\ingroup API_MPI_Support
+Transfer data \p data_handle to all MPI nodes, sending it from its
+owner if needed. All nodes have to call the function.
+
 @name Node Selection Policy
 \anchor MPINodeSelectionPolicy
 \ingroup API_MPI_Support
@@ -481,6 +518,10 @@ Unregister a previously registered policy.
 Perform a reduction on the given data \p handle. All nodes send the data to its
 owner node which will perform a reduction.
 
+\fn void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle, int prio)
+\ingroup API_MPI_Support
+Similar to starpu_mpi_redux_data, but takes a priority \p prio.
+
 \fn int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg)
 \ingroup API_MPI_Support
 Scatter data among processes of the communicator based on the

+ 4 - 0
doc/doxygen/chapters/api/performance_model.doxy

@@ -272,6 +272,10 @@ Return the architecture name for \p arch
 \ingroup API_Performance_Model
 Return the architecture type of the worker \p workerid.
 
+\fn void starpu_perfmodel_initialize(void)
+\ingroup API_Performance_Model
+If starpu_init() is not used, starpu_perfmodel_initialize() should be called before using starpu_perfmodel_* functions.
+
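A hedged sketch of using the performance model API from a tool that does not call starpu_init():

\code{.c}
/* Stand-alone use of the perfmodel API: initialize it explicitly, then
 * list the known models (starpu_perfmodel_list() is documented below). */
starpu_perfmodel_initialize();
starpu_perfmodel_list(stdout);
\endcode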
 \fn int starpu_perfmodel_list(FILE *output)
 \ingroup API_Performance_Model
 Print a list of all performance models on \p output

+ 9 - 5
doc/doxygen/chapters/api/scheduling_policy.doxy

@@ -117,12 +117,16 @@ condition variable. For instance, in the case of a scheduling strategy
 with a single task queue, the same condition variable would be used to
 block and wake up all workers.
 
-\fn int starpu_wake_worker(int workerid)
+\fn int starpu_wake_worker_no_relax(int workerid)
 \ingroup API_Scheduling_Policy
-In simgrid or blocking driver mode, 
-this should be called by push functions to wake the potential workers that are
-supposed to pick up the tasks which just have been pushed, otherwise they may
-remain sleeping.
+Must be called to wake up a worker that is sleeping on its condition variable.
+Return 0 whenever the worker is not in a sleeping state or has the
+state_keep_awake flag on.
+
+\fn int starpu_wake_worker_locked(int workerid)
+\ingroup API_Scheduling_Policy
+Version of starpu_wake_worker_no_relax() which assumes that the sched
+mutex is locked.
 
 \fn int starpu_sched_set_min_priority(int min_prio)
 \ingroup API_Scheduling_Policy

+ 18 - 1
doc/doxygen/chapters/api/workers.doxy

@@ -250,6 +250,16 @@ Return the type of \p node as defined by
 this function should be used in the allocation function to determine
 on which device the memory needs to be allocated.
 
+\fn int starpu_memory_nodes_numa_id_to_devid(int osid)
+\ingroup API_Workers_Properties
+Return the StarPU identifier of the memory node associated with the NUMA
+node identified by \p osid in the Operating System.
+
+\fn int starpu_memory_nodes_numa_devid_to_id(unsigned id)
+\ingroup API_Workers_Properties
+Return the Operating System identifier of the memory node
+whose StarPU identifier is \p id.
+
 \fn char *starpu_worker_get_type_as_string(enum starpu_worker_archtype type)
 \ingroup API_Workers_Properties
 Return worker \p type as a string.
@@ -277,7 +287,7 @@ Must be called after a potentially blocking call is complete, to
 restore the relax state in place before the corresponding relax_on.
 Decreases \c state_relax_refcnt. Calls to \ref starpu_worker_relax_on
 and \c starpu_worker_relax_off must be well parenthesized. This
-function is automatically called by \ref starpu_worker_unlock after the 
+function is automatically called by \ref starpu_worker_unlock after the
 target worker has been unlocked.
 
 \fn int starpu_worker_get_relax_state(void)
@@ -320,4 +330,11 @@ if needed during the waiting process. Returns 1 if \p workerid has been woken
 up or its state_keep_awake flag has been set to 1, and 0 otherwise (if \p
 workerid was not in the STATE_SLEEPING or in the STATE_SCHEDULING).
 
+\fn hwloc_cpuset_t starpu_worker_get_hwloc_cpuset(int workerid)
+\ingroup API_Workers_Properties
+If StarPU was compiled with hwloc support, returns a duplicate of the
+hwloc cpuset associated with the worker \p workerid. The returned cpuset is obtained
+from a \c hwloc_bitmap_dup() function call. It must be freed by the caller
+using \c hwloc_bitmap_free().
+
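A hedged sketch of its use (the worker identifier is an assumption; the hwloc calls are plain hwloc API):

\code{.c}
int workerid = 0; /* assumed: a valid worker identifier */
hwloc_cpuset_t set = starpu_worker_get_hwloc_cpuset(workerid);
char buf[128];
hwloc_bitmap_snprintf(buf, sizeof(buf), set);
printf("worker %d is bound to cpuset %s\n", workerid, buf);
hwloc_bitmap_free(set); /* the duplicate must be freed by the caller */
\endcode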
 */

+ 6 - 7
doc/doxygen/chapters/code/disk_compute.c

@@ -37,7 +37,7 @@ int main(int argc, char **argv)
 	/* Initialize path and name */
 	char pid_str[16];
 	int pid = getpid();
-	snprintf(pid_str, 16, "%d", pid);
+	snprintf(pid_str, sizeof(pid_str), "%d", pid);
 
 	const char *name_file_start = "STARPU_DISK_COMPUTE_DATA_";
 	const char *name_file_end = "STARPU_DISK_COMPUTE_DATA_RESULT_";
@@ -56,7 +56,7 @@ int main(int argc, char **argv)
 	int new_dd = starpu_disk_register(&starpu_disk_unistd_ops, (void *) base, 1024*1024*1);
 	/* can't write on /tmp/ */
 	if (new_dd == -ENOENT) goto enoent;
-	
+
 	unsigned dd = (unsigned) new_dd;
 
 	printf("TEST DISK MEMORY \n");
@@ -67,7 +67,7 @@ int main(int argc, char **argv)
 
 	starpu_malloc_flags((void **)&A, NX*sizeof(int), STARPU_MALLOC_COUNT);
 	starpu_malloc_flags((void **)&C, NX*sizeof(int), STARPU_MALLOC_COUNT);
- 
+
 	unsigned int j;
 	/* you register them in a vector */
 	for(j = 0; j < NX; ++j)
@@ -112,8 +112,8 @@ int main(int argc, char **argv)
 	/* register vector in starpu */
 	starpu_vector_data_register(&vector_handleA, dd, (uintptr_t) data, NX, sizeof(int));
 
-	/* and do what you want with it, here we copy it into an other vector */ 
-	starpu_vector_data_register(&vector_handleC, dd, (uintptr_t) data_result, NX, sizeof(int));	
+	/* and do what you want with it, here we copy it into an other vector */
+	starpu_vector_data_register(&vector_handleC, dd, (uintptr_t) data_result, NX, sizeof(int));
 
 	starpu_data_cpy(vector_handleC, vector_handleA, 0, NULL, NULL);
 
@@ -125,7 +125,7 @@ int main(int argc, char **argv)
 	starpu_disk_close(dd, data, NX*sizeof(int));
 	starpu_disk_close(dd, data_result, NX*sizeof(int));
 
-	/* check results */	
+	/* check results */
 	f = fopen(path_file_end, "rb+");
 	if (f == NULL)
 		goto enoent;
@@ -177,4 +177,3 @@ enoent:
 	return 77;
 }
 //! [To be included. You should update doxygen if you see this text.]
-

+ 10 - 2
examples/Makefile.am

@@ -157,7 +157,7 @@ examplebin_PROGRAMS 	+=	$(STARPU_EXAMPLES)
 
 TESTS			=	$(STARPU_EXAMPLES)
 
-if STARPU_NEW_CHECK
+if !STARPU_USE_MPI_MASTER_SLAVE
 TESTS			+=	scheduler/schedulers.sh
 TESTS			+=	scheduler/schedulers_context.sh
 endif
@@ -218,6 +218,7 @@ STARPU_EXAMPLES +=				\
 	mlr/mlr					\
 	cpp/incrementer_cpp			\
 	cpp/add_vectors				\
+	cpp/add_vectors_interface		\
 	filters/fvector				\
 	filters/fblock				\
 	filters/fmatrix				\
@@ -245,7 +246,7 @@ STARPU_EXAMPLES +=				\
 	sched_ctx/dummy_sched_with_ctx		\
 	worker_collections/worker_tree_example  \
 	reductions/dot_product			\
-	reductions/minmax_reduction		
+	reductions/minmax_reduction
 
 endif
 
@@ -375,10 +376,14 @@ basic_examples_vector_scal_SOURCES =		\
 
 if STARPU_HAVE_ICC
 if STARPU_CROSS_COMPILING
+basic_examples_vector_scal_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) $(basic_examples_vector_scal_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
 else
 basic_examples_vector_scal_SOURCES +=		\
 	basic_examples/vector_scal_cpu_icc.icc
+basic_examples_vector_scal_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(ICC) $(basic_examples_vector_scal_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
 endif
+else
+basic_examples_vector_scal_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) $(basic_examples_vector_scal_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
 endif
 
 if STARPU_USE_CUDA
@@ -875,6 +880,9 @@ endif
 cpp_add_vectors_SOURCES	=	\
 	cpp/add_vectors.cpp
 
+cpp_add_vectors_interface_SOURCES	=	\
+	cpp/add_vectors_interface.cpp
+
 if STARPU_HAVE_CXX11
 cpp_add_vectors_cpp11_SOURCES	=	\
 	cpp/add_vectors_cpp11.cpp

+ 10 - 8
examples/audio/starpu_audio_processing.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012, 2014-2015  Université de Bordeaux
+ * Copyright (C) 2010-2012, 2014-2015, 2017  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -217,7 +217,7 @@ static void band_filter_kernel_gpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *
 
 	/* FFTW does not normalize its output ! */
 	float scal = 1.0f/nsamples;
-	cublasStatus_t status = cublasSscal (starpu_cublas_local_handle(), nsamples, &scal, localA, 1);
+	cublasStatus_t status = cublasSscal (starpu_cublas_get_local_handle(), nsamples, &scal, localA, 1);
 	if (status != CUBLAS_STATUS_SUCCESS)
 		STARPU_CUBLAS_REPORT_ERROR(status);
 }
@@ -238,7 +238,7 @@ static void band_filter_kernel_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *
 		plans[workerid].Acopy = malloc(nsamples*sizeof(float));
 
 		/* create plans, only "fftwf_execute" is thread safe in FFTW ... */
-		starpu_pthread_mutex_lock(&fftw_mutex);
+		STARPU_PTHREAD_MUTEX_LOCK(&fftw_mutex);
 		plans[workerid].plan_cpu = fftwf_plan_dft_r2c_1d(nsamples,
 					plans[workerid].Acopy,
 					plans[workerid].localout_cpu,
@@ -247,7 +247,7 @@ static void band_filter_kernel_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *
 					plans[workerid].localout_cpu,
 					plans[workerid].Acopy,
 					FFTW_ESTIMATE);
-		starpu_pthread_mutex_unlock(&fftw_mutex);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&fftw_mutex);
 
 		plans[workerid].is_initialized = 1;
 	}
@@ -341,6 +341,8 @@ static void init_problem(void)
 	/* read length of input WAV's data */
 	/* each element is 2 bytes long (16bits)*/
 	length_data = get_wav_data_bytes_length(infile)/2;
+	while (nsamples > length_data)
+		nsamples /= 2;
 
 	/* allocate a buffer to store the content of input file */
 	if (use_pin)
@@ -372,17 +374,17 @@ static void parse_args(int argc, char **argv)
 
 		if (strcmp(argv[i], "-i") == 0)
 		{
-			inputfilename = argv[++i];;
+			inputfilename = argv[++i];
 		}
 
 		if (strcmp(argv[i], "-o") == 0)
 		{
-			outputfilename = argv[++i];;
+			outputfilename = argv[++i];
 		}
 
 		if (strcmp(argv[i], "-no-output") == 0)
 		{
-			outputfilename = NULL;;
+			outputfilename = NULL;
 		}
 
 		/* block size */

+ 2 - 4
examples/basic_examples/variable.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011, 2013, 2015-2016  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -51,9 +51,6 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) goto enodev;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-#ifdef STARPU_QUICK_CHECK
-	niter /= 100;
-#endif
         if (argc == 2) niter = atoi(argv[1]);
         foo = 0.0f;
 
@@ -104,6 +101,7 @@ int main(int argc, char **argv)
 	starpu_data_unregister(float_array_handle);
 
 	FPRINTF(stderr, "variable -> %f\n", foo);
+	FPRINTF(stderr, "result is %scorrect\n", foo==niter?"":"IN");
 
 	starpu_shutdown();
 

+ 7 - 0
examples/cpp/add_vectors.cpp

@@ -73,6 +73,13 @@ int main(int argc, char **argv)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+	/* StarPU may overwrite the C++ objects if NUMA transfers are made */
+	if (starpu_memory_nodes_get_numa_count() > 1)
+	{
+		starpu_shutdown();
+		return 77;
+	}
+
 	// StarPU data registering
 	starpu_data_handle_t spu_vec_A;
 	starpu_data_handle_t spu_vec_B;

+ 7 - 1
examples/cpp/add_vectors_cpp11.cpp

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009, 2010-2011, 2013-2015  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
- * Copyright (C) 2012 INRIA
+ * Copyright (C) 2012, 2017  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -78,6 +78,12 @@ int main(int argc, char **argv)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+	if (starpu_memory_nodes_get_numa_count() > 1)
+	{
+		starpu_shutdown();
+		return 77;
+	}
+
 	// StarPU data registering
 	starpu_data_handle_t spu_vec_A;
 	starpu_data_handle_t spu_vec_B;

+ 656 - 0
examples/cpp/add_vectors_interface.cpp

@@ -0,0 +1,656 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010-2011, 2013-2015, 2017  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
+ * Copyright (C) 2012, 2017  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This is a small example of a C++ program using STL and starpu.  We here just
+ * add two std::vector with duplicating vectors. StarPU achieves data
+ * transfers between objects.
+ */
+
+#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 9))
+int main(int argc, char **argv)
+{
+	return 77;
+}
+#else
+#include <cassert>
+#include <vector>
+
+#ifdef PRINT_OUTPUT
+#include <iostream>
+#endif
+
+#include <starpu.h>
+
+#define MY_TYPE char, my_allocator<char>
+
+/* create an allocator to put data on the correct NUMA node */
+template <class T>
+class my_allocator
+{
+	public:
+
+	typedef size_t    size_type;
+	typedef ptrdiff_t difference_type;
+	typedef T*        pointer;
+	typedef const T*  const_pointer;
+	typedef T&        reference;
+	typedef const T&  const_reference;
+	typedef T         value_type;
+
+	my_allocator()
+	{
+		this->node = STARPU_MAIN_RAM;
+	}
+
+	my_allocator(const my_allocator& a)
+	{
+		node = a.get_node();
+	}
+
+	explicit my_allocator(const unsigned node)
+	{
+		this->node = node;
+	}
+
+	pointer allocate(size_type n, const void * = 0)
+	{
+		T* t = (T*) starpu_malloc_on_node(this->node, n * sizeof(T));
+		return t;
+	}
+
+	void      deallocate(void* p, size_type n)
+	{
+		if (p)
+		{
+			starpu_free_on_node(this->node, (uintptr_t) p, n * sizeof(T));
+		}
+	}
+
+	unsigned get_node() const
+	{
+		return node;
+	}
+
+	pointer address(reference x) const
+	{
+		return &x;
+	}
+
+	const_pointer address(const_reference x) const
+	{
+		return &x;
+	}
+
+	my_allocator<T>&  operator=(const my_allocator&ref)
+	{
+		node = ref.node;
+		return *this;
+	}
+
+	void construct(pointer p, const T& val)
+	{
+		new ((T*) p) T(val);
+	}
+
+	void destroy(pointer p)
+	{
+		p->~T();
+	}
+
+	size_type max_size() const
+	{
+		return size_type(-1);
+	}
+
+
+	template <class U>
+		struct rebind
+		{
+			typedef my_allocator<U> other;
+		};
+
+	template <class U>
+		explicit my_allocator(const my_allocator<U>&ref)
+		{
+			node = ref.node;
+		}
+
+	template <class U>
+		my_allocator<U>& operator=(const my_allocator<U>&ref)
+		{
+			node = ref.node;
+			return *this;
+		}
+
+	private:
+	unsigned node;
+};
+
+/*
+ * Create a new interface to catch C++ vector and make appropriate data transfers
+ */
+struct vector_cpp_interface
+{
+	enum starpu_data_interface_id id;
+
+	uintptr_t ptr;
+	uintptr_t dev_handle;
+	size_t offset;
+	uint32_t nx;
+	size_t elemsize;
+	std::vector<MY_TYPE>* vec;
+
+	uint32_t slice_base;
+};
+
+#define VECTOR_CPP_GET_VEC(interface)	({ (((struct vector_cpp_interface *)(interface))->vec); })
+
+static int vector_interface_copy_any_to_any(void *src_interface, unsigned src_node,
+                           void *dst_interface, unsigned dst_node, void *async_data);
+
+#if __cplusplus >= 201103L
+static const struct starpu_data_copy_methods vector_cpp_copy_data_methods_s =
+{
+
+	.can_copy = NULL,
+
+	.ram_to_ram = NULL,
+	.ram_to_cuda = NULL,
+	.ram_to_opencl = NULL,
+	.ram_to_mic = NULL,
+
+	.cuda_to_ram = NULL,
+	.cuda_to_cuda = NULL,
+	.cuda_to_opencl = NULL,
+
+	.opencl_to_ram = NULL,
+	.opencl_to_cuda = NULL,
+	.opencl_to_opencl = NULL,
+
+	.mic_to_ram = NULL,
+
+	.scc_src_to_sink = NULL,
+	.scc_sink_to_src = NULL,
+	.scc_sink_to_sink = NULL,
+
+	.ram_to_mpi_ms = NULL,
+	.mpi_ms_to_ram = NULL,
+	.mpi_ms_to_mpi_ms = NULL,
+
+	.ram_to_cuda_async = NULL,
+	.cuda_to_ram_async = NULL,
+	.cuda_to_cuda_async = NULL,
+
+	.ram_to_opencl_async = NULL,
+	.opencl_to_ram_async = NULL,
+	.opencl_to_opencl_async = NULL,
+
+	.ram_to_mpi_ms_async = NULL,
+	.mpi_ms_to_ram_async = NULL,
+	.mpi_ms_to_mpi_ms_async = NULL,
+
+	.ram_to_mic_async = NULL,
+	.mic_to_ram_async = NULL,
+
+	.any_to_any = vector_interface_copy_any_to_any,
+};
+#else
+static const struct starpu_data_copy_methods vector_cpp_copy_data_methods_s =
+{
+	NULL,
+
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+
+	NULL,
+	NULL,
+	NULL,
+
+	NULL,
+	NULL,
+	NULL,
+
+	NULL,
+
+	NULL,
+	NULL,
+	NULL,
+
+	NULL,
+	NULL,
+	NULL,
+
+	NULL,
+	NULL,
+	NULL,
+
+	NULL,
+	NULL,
+	NULL,
+
+	NULL,
+	NULL,
+	NULL,
+
+	NULL,
+	NULL,
+
+	vector_interface_copy_any_to_any,
+};
+#endif
+
+static void register_vector_cpp_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface);
+static starpu_ssize_t allocate_vector_cpp_buffer_on_node(void *data_interface_, unsigned dst_node);
+static void *vector_cpp_handle_to_pointer(starpu_data_handle_t handle, unsigned node);
+static void free_vector_cpp_buffer_on_node(void *data_interface, unsigned node);
+static size_t vector_cpp_interface_get_size(starpu_data_handle_t handle);
+static uint32_t footprint_vector_cpp_interface_crc32(starpu_data_handle_t handle);
+static int vector_cpp_compare(void *data_interface_a, void *data_interface_b);
+static void display_vector_cpp_interface(starpu_data_handle_t handle, FILE *f);
+static int pack_vector_cpp_handle(starpu_data_handle_t handle, unsigned node, void **ptr, starpu_ssize_t *count);
+static int unpack_vector_cpp_handle(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count);
+static starpu_ssize_t vector_cpp_describe(void *data_interface, char *buf, size_t size);
+
+#if __cplusplus >= 201103L
+static struct starpu_data_interface_ops interface_vector_cpp_ops =
+{
+	.register_data_handle = register_vector_cpp_handle,
+	.allocate_data_on_node = allocate_vector_cpp_buffer_on_node,
+	.free_data_on_node = free_vector_cpp_buffer_on_node,
+	.copy_methods = &vector_cpp_copy_data_methods_s,
+	.handle_to_pointer = vector_cpp_handle_to_pointer,
+	.get_size = vector_cpp_interface_get_size,
+	.footprint = footprint_vector_cpp_interface_crc32,
+	.compare = vector_cpp_compare,
+	.display = display_vector_cpp_interface,
+	.describe = vector_cpp_describe,
+	.interfaceid = STARPU_UNKNOWN_INTERFACE_ID,
+	.interface_size = sizeof(struct vector_cpp_interface),
+	.is_multiformat = 0,
+	.dontcache = 0,
+	.get_mf_ops = NULL,
+	.pack_data = pack_vector_cpp_handle,
+	.unpack_data = unpack_vector_cpp_handle,
+	.name = (char *) "VECTOR_CPP_INTERFACE"
+};
+#else
+static struct starpu_data_interface_ops interface_vector_cpp_ops =
+{
+	register_vector_cpp_handle,
+	allocate_vector_cpp_buffer_on_node,
+	free_vector_cpp_buffer_on_node,
+	&vector_cpp_copy_data_methods_s,
+	vector_cpp_handle_to_pointer,
+	vector_cpp_interface_get_size,
+	footprint_vector_cpp_interface_crc32,
+	vector_cpp_compare,
+	display_vector_cpp_interface,
+	vector_cpp_describe,
+	STARPU_UNKNOWN_INTERFACE_ID,
+	sizeof(struct vector_cpp_interface),
+	0,
+	0,
+	NULL,
+	pack_vector_cpp_handle,
+	unpack_vector_cpp_handle,
+	(char *) "VECTOR_CPP_INTERFACE"
+};
+#endif
+
+static void *vector_cpp_handle_to_pointer(starpu_data_handle_t handle, unsigned node)
+{
+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+
+	struct vector_cpp_interface *vector_interface = (struct vector_cpp_interface *)
+		starpu_data_get_interface_on_node(handle, node);
+
+	return (void*) vector_interface->ptr;
+}
+
+static void register_vector_cpp_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface)
+{
+	struct vector_cpp_interface *vector_interface = (struct vector_cpp_interface *) data_interface;
+
+	unsigned node;
+	for (node = 0; node < STARPU_MAXNODES; node++)
+	{
+		struct vector_cpp_interface *local_interface = (struct vector_cpp_interface *)
+			starpu_data_get_interface_on_node(handle, node);
+
+		if (node == home_node)
+		{
+			local_interface->ptr = vector_interface->ptr;
+                        local_interface->dev_handle = vector_interface->dev_handle;
+                        local_interface->offset = vector_interface->offset;
+			local_interface->vec = vector_interface->vec;
+		}
+		else
+		{
+			local_interface->ptr = 0;
+                        local_interface->dev_handle = 0;
+                        local_interface->offset = 0;
+			local_interface->vec = NULL;
+		}
+
+		local_interface->id = vector_interface->id;
+		local_interface->nx = vector_interface->nx;
+		local_interface->elemsize = vector_interface->elemsize;
+		local_interface->slice_base = vector_interface->slice_base;
+	}
+}
+
+/* declare a new data with the vector interface */
+void vector_cpp_data_register(starpu_data_handle_t *handleptr, int home_node,
+                        std::vector<MY_TYPE>* vec, uint32_t nx, size_t elemsize)
+{
+#if __cplusplus >= 201103L
+	struct vector_cpp_interface vector =
+	{
+		.id = STARPU_UNKNOWN_INTERFACE_ID,
+		.ptr = (uintptr_t) &(*vec)[0],
+                .dev_handle = (uintptr_t) &(*vec)[0],
+                .offset = 0,
+		.nx = nx,
+		.elemsize = elemsize,
+		.vec = vec,
+		.slice_base = 0
+	};
+#else
+	struct vector_cpp_interface vector =
+	{
+		STARPU_UNKNOWN_INTERFACE_ID,
+		(uintptr_t) &(*vec)[0],
+                (uintptr_t) &(*vec)[0],
+                0,
+		nx,
+		elemsize,
+		vec,
+		0
+	};
+#endif
+
+	starpu_data_register(handleptr, home_node, &vector, &interface_vector_cpp_ops);
+}
+
+/* offer an access to the data parameters */
+uint32_t vector_cpp_get_nx(starpu_data_handle_t handle)
+{
+	struct vector_cpp_interface *vector_interface = (struct vector_cpp_interface *)
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+
+	return vector_interface->nx;
+}
+
+
+static uint32_t footprint_vector_cpp_interface_crc32(starpu_data_handle_t handle)
+{
+	return starpu_hash_crc32c_be(vector_cpp_get_nx(handle), 0);
+}
+
+static int vector_cpp_compare(void *data_interface_a, void *data_interface_b)
+{
+	struct vector_cpp_interface *vector_a = (struct vector_cpp_interface *) data_interface_a;
+	struct vector_cpp_interface *vector_b = (struct vector_cpp_interface *) data_interface_b;
+
+	/* Two vectors are considered compatible if they have the same size */
+	return ((vector_a->nx == vector_b->nx)
+			&& (vector_a->elemsize == vector_b->elemsize));
+}
+
+static void display_vector_cpp_interface(starpu_data_handle_t handle, FILE *f)
+{
+	struct vector_cpp_interface *vector_interface = (struct vector_cpp_interface *)
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+
+	fprintf(f, "%u\t", vector_interface->nx);
+}
+
+static int pack_vector_cpp_handle(starpu_data_handle_t handle, unsigned node, void **ptr, starpu_ssize_t *count)
+{
+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+
+	struct vector_cpp_interface *vector_interface = (struct vector_cpp_interface *)
+		starpu_data_get_interface_on_node(handle, node);
+
+	*count = vector_interface->nx*vector_interface->elemsize;
+
+	if (ptr != NULL)
+	{
+		starpu_malloc_flags(ptr, *count, 0);
+		memcpy(*ptr, (void*)vector_interface->ptr, vector_interface->elemsize*vector_interface->nx);
+	}
+
+	return 0;
+}
+
+static int unpack_vector_cpp_handle(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count)
+{
+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+
+	struct vector_cpp_interface *vector_interface = (struct vector_cpp_interface *)
+		starpu_data_get_interface_on_node(handle, node);
+
+	STARPU_ASSERT(count == vector_interface->elemsize * vector_interface->nx);
+	memcpy((void*)vector_interface->ptr, ptr, count);
+
+	return 0;
+}
+
+static size_t vector_cpp_interface_get_size(starpu_data_handle_t handle)
+{
+	size_t size;
+	struct vector_cpp_interface *vector_interface = (struct vector_cpp_interface *)
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+
+	size = vector_interface->nx*vector_interface->elemsize;
+
+	return size;
+}
+
+size_t vector_cpp_get_elemsize(starpu_data_handle_t handle)
+{
+	struct vector_cpp_interface *vector_interface = (struct vector_cpp_interface *)
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+
+	return vector_interface->elemsize;
+}
+
+/* memory allocation/deallocation primitives for the vector interface */
+
+/* returns the size of the allocated area */
+static starpu_ssize_t allocate_vector_cpp_buffer_on_node(void *data_interface_, unsigned dst_node)
+{
+	struct vector_cpp_interface *vector_interface = (struct vector_cpp_interface *) data_interface_;
+
+	uint32_t nx = vector_interface->nx;
+	size_t elemsize = vector_interface->elemsize;
+
+	starpu_ssize_t allocated_memory;
+
+	const my_allocator<char> allocator(dst_node);
+	std::vector<MY_TYPE> * vec = new std::vector<MY_TYPE>(nx, 0, allocator);
+
+	vector_interface->vec = vec;
+	if (!vector_interface->vec)
+		return -ENOMEM;
+
+	allocated_memory = nx*elemsize;
+
+	/* update the data properly in consequence */
+	vector_interface->ptr = (uintptr_t) &((*vec)[0]);
+	vector_interface->dev_handle = (uintptr_t) &((*vec)[0]);
+        vector_interface->offset = 0;
+
+	return allocated_memory;
+}
+
+static void free_vector_cpp_buffer_on_node(void *data_interface, unsigned node)
+{
+	struct vector_cpp_interface *vector_interface = (struct vector_cpp_interface *) data_interface;
+
+	delete vector_interface->vec;
+}
+
+static int vector_interface_copy_any_to_any(void *src_interface, unsigned src_node,
+                           void *dst_interface, unsigned dst_node, void *async_data)
+{
+	struct vector_cpp_interface *src_vector = (struct vector_cpp_interface *) src_interface;
+	struct vector_cpp_interface *dst_vector = (struct vector_cpp_interface *) dst_interface;
+	int ret;
+
+	ret = starpu_interface_copy(src_vector->dev_handle, src_vector->offset, src_node,
+				    dst_vector->dev_handle, dst_vector->offset, dst_node,
+				    src_vector->nx*src_vector->elemsize, async_data);
+
+	return ret;
+}
+
+static starpu_ssize_t vector_cpp_describe(void *data_interface, char *buf, size_t size)
+{
+	struct vector_cpp_interface *vector = (struct vector_cpp_interface *) data_interface;
+	return snprintf(buf, size, "V%ux%u",
+			(unsigned) vector->nx,
+			(unsigned) vector->elemsize);
+}
+
+/*
+ * End of interface
+ */
+
+
+
+/* Kernel using STL objects */
+
+void cpu_kernel_add_vectors(void *buffers[], void *cl_arg)
+{
+	std::vector<MY_TYPE>* vec_A = VECTOR_CPP_GET_VEC(buffers[0]);
+	std::vector<MY_TYPE>* vec_B = VECTOR_CPP_GET_VEC(buffers[1]);
+	std::vector<MY_TYPE>* vec_C = VECTOR_CPP_GET_VEC(buffers[2]);
+
+	// all the std::vector have to have the same size
+	assert(vec_A->size() == vec_B->size() && vec_B->size() == vec_C->size());
+
+	// performs the vector addition (vec_C[] = vec_A[] + vec_B[])
+	for (size_t i = 0; i < vec_C->size(); i++)
+		(*vec_C)[i] = (*vec_A)[i] + (*vec_B)[i];
+}
+
+#define VEC_SIZE 1024
+
+int main(int argc, char **argv)
+{
+	struct starpu_conf conf;
+	starpu_conf_init(&conf);
+	conf.nmic = 0;
+	conf.nscc = 0;
+	conf.nmpi_ms = 0;
+
+	// initialize StarPU with default configuration
+	int ret = starpu_init(&conf);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	/* Test data transfers between NUMA nodes if available */
+	unsigned last_numa_node = starpu_memory_nodes_get_numa_count() - 1;
+
+	const my_allocator<char> allocator_main_ram(STARPU_MAIN_RAM);
+	const my_allocator<char> allocator_last_numa(last_numa_node);
+	std::vector<MY_TYPE> vec_A(VEC_SIZE, 2, allocator_main_ram); // all the vector is initialized to 2
+	std::vector<MY_TYPE> vec_B(VEC_SIZE, 3, allocator_main_ram); // all the vector is initialized to 3
+	std::vector<MY_TYPE> vec_C(VEC_SIZE, 0, allocator_last_numa); // all the vector is initialized to 0
+
+	// StarPU data registering
+	starpu_data_handle_t spu_vec_A;
+	starpu_data_handle_t spu_vec_B;
+	starpu_data_handle_t spu_vec_C;
+
+	// give the data of the vector to StarPU (C array)
+	vector_cpp_data_register(&spu_vec_A, STARPU_MAIN_RAM, &vec_A, vec_A.size(), sizeof(char));
+	vector_cpp_data_register(&spu_vec_B, STARPU_MAIN_RAM, &vec_B, vec_B.size(), sizeof(char));
+	vector_cpp_data_register(&spu_vec_C, last_numa_node, &vec_C, vec_C.size(), sizeof(char));
+
+	// create the StarPU codelet
+	starpu_codelet cl;
+	starpu_codelet_init(&cl);
+	cl.cpu_funcs     [0] = cpu_kernel_add_vectors;
+	cl.cpu_funcs_name[0] = "cpu_kernel_add_vectors";
+	cl.nbuffers          = 3;
+	cl.modes         [0] = STARPU_R;
+	cl.modes         [1] = STARPU_R;
+	cl.modes         [2] = STARPU_W;
+	cl.name              = "add_vectors";
+
+	// submit a new StarPU task to execute
+	ret = starpu_task_insert(&cl,
+	                         STARPU_R, spu_vec_A,
+	                         STARPU_R, spu_vec_B,
+	                         STARPU_W, spu_vec_C,
+	                         0);
+	if (ret == -ENODEV)
+	{
+		// StarPU data unregistering
+		starpu_data_unregister(spu_vec_C);
+		starpu_data_unregister(spu_vec_B);
+		starpu_data_unregister(spu_vec_A);
+
+		// terminate StarPU, no task can be submitted after
+		starpu_shutdown();
+
+		return 77;
+	}
+
+	STARPU_CHECK_RETURN_VALUE(ret, "task_submit::add_vectors");
+
+	// wait the task
+	starpu_task_wait_for_all();
+
+	// StarPU data unregistering
+	starpu_data_unregister(spu_vec_C);
+	starpu_data_unregister(spu_vec_B);
+	starpu_data_unregister(spu_vec_A);
+
+	// terminate StarPU, no task can be submitted after
+	starpu_shutdown();
+
+	// check results
+	bool fail = false;
+	int i = 0;
+	while (!fail && i < VEC_SIZE)
+		fail = vec_C[i++] != 5;
+
+	if (fail)
+	{
+#ifdef PRINT_OUTPUT
+		std::cout << "Example failed..." << std::endl;
+#endif
+		return EXIT_FAILURE;
+	}
+	else
+	{
+#ifdef PRINT_OUTPUT
+		std::cout << "Example successfully passed!" << std::endl;
+#endif
+		return EXIT_SUCCESS;
+	}
+}
+#endif

+ 8 - 8
examples/heat/dw_factolu_kernels.c

@@ -64,8 +64,8 @@ void display_stat_heat(void)
 	{
 		if (count_total_per_worker[worker])
 		{
-			char name[32];
-			starpu_worker_get_name(worker, name, 32);
+			char name[64];
+			starpu_worker_get_name(worker, name, sizeof(name));
 
 			FPRINTF(stderr, "\t\t%s -> %u / %u (%2.2f %%)\n", name, count_11_per_worker[worker], count_11_total, (100.0*count_11_per_worker[worker])/count_11_total);
 		}
@@ -76,8 +76,8 @@ void display_stat_heat(void)
 	{
 		if (count_total_per_worker[worker])
 		{
-			char name[32];
-			starpu_worker_get_name(worker, name, 32);
+			char name[64];
+			starpu_worker_get_name(worker, name, sizeof(name));
 
 			FPRINTF(stderr, "\t\t%s -> %u / %u (%2.2f %%)\n", name, count_12_per_worker[worker], count_12_total, (100.0*count_12_per_worker[worker])/count_12_total);
 		}
@@ -89,8 +89,8 @@ void display_stat_heat(void)
 	{
 		if (count_total_per_worker[worker])
 		{
-			char name[32];
-			starpu_worker_get_name(worker, name, 32);
+			char name[64];
+			starpu_worker_get_name(worker, name, sizeof(name));
 
 			FPRINTF(stderr, "\t\t%s -> %u / %u (%2.2f %%)\n", name, count_21_per_worker[worker], count_21_total, (100.0*count_21_per_worker[worker])/count_21_total);
 		}
@@ -101,8 +101,8 @@ void display_stat_heat(void)
 	{
 		if (count_total_per_worker[worker])
 		{
-			char name[32];
-			starpu_worker_get_name(worker, name, 32);
+			char name[64];
+			starpu_worker_get_name(worker, name, sizeof(name));
 
 			FPRINTF(stderr, "\t\t%s -> %u / %u (%2.2f %%)\n", name, count_22_per_worker[worker], count_22_total, (100.0*count_22_per_worker[worker])/count_22_total);
 		}

+ 21 - 10
examples/heat/dw_sparse_cg.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010, 2011, 2015, 2017  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -127,21 +127,24 @@ void init_problem(void)
  *	cg initialization phase
  */
 
-static struct starpu_codelet cl1 = {
+static struct starpu_codelet cl1 =
+{
 	.cpu_funcs = { cpu_codelet_func_1 },
 	.cpu_funcs_name = { "cpu_codelet_func_1" },
 	.nbuffers = 4,
 	.modes = { STARPU_R, STARPU_R, STARPU_W, STARPU_R },
 };
 
-static struct starpu_codelet cl2 = {
+static struct starpu_codelet cl2 =
+{
 	.cpu_funcs = { cpu_codelet_func_2 },
 	.cpu_funcs_name = { "cpu_codelet_func_2" },
 	.nbuffers = 2,
 	.modes = { STARPU_W, STARPU_R },
 };
 
-static struct starpu_codelet cl3 = {
+static struct starpu_codelet cl3 =
+{
 	.cpu_funcs = { cpu_codelet_func_3 },
 	.cpu_funcs_name = { "cpu_codelet_func_3" },
 #ifdef STARPU_USE_CUDA
@@ -205,14 +208,16 @@ void init_cg(struct cg_problem *problem)
  *		the codelet code launcher is its own callback !
  */
 
-static struct starpu_codelet cl4 = {
+static struct starpu_codelet cl4 =
+{
 	.cpu_funcs = { cpu_codelet_func_4 },
 	.cpu_funcs_name = { "cpu_codelet_func_4" },
 	.nbuffers = 3,
 	.modes = { STARPU_R, STARPU_R, STARPU_W },
 };
 
-static struct starpu_codelet cl5 = {
+static struct starpu_codelet cl5 =
+{
 	.cpu_funcs = { cpu_codelet_func_5 },
 	.cpu_funcs_name = { "cpu_codelet_func_5" },
 #ifdef STARPU_USE_CUDA
@@ -222,7 +227,8 @@ static struct starpu_codelet cl5 = {
 	.modes = { STARPU_R, STARPU_R },
 };
 
-static struct starpu_codelet cl6 = {
+static struct starpu_codelet cl6 =
+{
 	.cpu_funcs = { cpu_codelet_func_6 },
 	.cpu_funcs_name = { "cpu_codelet_func_6" },
 #ifdef STARPU_USE_CUDA
@@ -233,7 +239,8 @@ static struct starpu_codelet cl6 = {
 	.modes = { STARPU_RW, STARPU_R },
 };
 
-static struct starpu_codelet cl7 = {
+static struct starpu_codelet cl7 =
+{
 	.cpu_funcs = { cpu_codelet_func_7 },
 	.cpu_funcs_name = { "cpu_codelet_func_7" },
 #ifdef STARPU_USE_CUDA
@@ -244,7 +251,8 @@ static struct starpu_codelet cl7 = {
 	.modes = { STARPU_RW, STARPU_R },
 };
 
-static struct starpu_codelet cl8 = {
+static struct starpu_codelet cl8 =
+{
 	.cpu_funcs = { cpu_codelet_func_8 },
 	.cpu_funcs_name = { "cpu_codelet_func_8" },
 #ifdef STARPU_USE_CUDA
@@ -254,7 +262,8 @@ static struct starpu_codelet cl8 = {
 	.modes = { STARPU_R },
 };
 
-static struct starpu_codelet cl9 = {
+static struct starpu_codelet cl9 =
+{
 	.cpu_funcs = { cpu_codelet_func_9 },
 	.cpu_funcs_name = { "cpu_codelet_func_9" },
 #ifdef STARPU_USE_CUDA
@@ -439,6 +448,8 @@ void conjugate_gradient(float *nzvalA, float *vecb, float *vecx, uint32_t nnz,
 	sem_wait(&sem);
 	sem_destroy(&sem);
 
+	starpu_task_wait_for_all();
+
 	print_results(vecx, nrow);
 
 	starpu_data_unregister(ds_matrixA);

+ 1 - 1
examples/lu/lu.sh

@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/sh
 #
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #

+ 3 - 0
examples/lu/lu_example.c

@@ -189,7 +189,10 @@ static void init_matrix(void)
 			A[i + j*size] += (TYPE)(I*starpu_drand48());
 #endif
 			if (i == j)
+			{
+				A[i + j*size] += 1;
 				A[i + j*size] *= 100;
+			}
 		}
 	}
 #endif

+ 20 - 17
examples/mlr/mlr.c

@@ -151,25 +151,28 @@ static struct starpu_codelet cl_final =
 int main(int argc, char **argv)
 {
 	/* Initialization */
-	unsigned i,j;
+	unsigned i;
 	int ret;
+
 	ret = starpu_init(NULL);
 	if (ret == -ENODEV)
 		return 77;
 
 	sum=0;
-	int m,n,k;
-	int* vector_mn = calloc( 2, sizeof(int) );
+	int* vector_mn = calloc(2, sizeof(int));
 	starpu_data_handle_t vector_mn_handle;
 
-	starpu_vector_data_register( &vector_mn_handle,
-				     STARPU_MAIN_RAM,
-				     (uintptr_t)vector_mn, 2,
-				     sizeof(int) );
+	starpu_vector_data_register(&vector_mn_handle,
+				    STARPU_MAIN_RAM,
+				    (uintptr_t)vector_mn, 2,
+				    sizeof(int));
 
 	/* Giving pseudo-random values to the M,N,K parameters and inserting tasks */
-	for ( i = 0; i < 42; i++)
+	for (i = 0; i < 42; i++)
 	{
+		int j;
+		int m,n,k;
+
 		m = (int) ((rand() % 10)+1);
 		n = (int) ((rand() % 10)+1);
 		k = (int) ((rand() % 10)+1);
@@ -180,16 +183,16 @@ int main(int argc, char **argv)
 		vector_mn[1] = n;
 		starpu_data_release(vector_mn_handle);
 
-		for ( j = 0; j < 42; j++)
+		for (j = 0; j < 42; j++)
 		{
-			starpu_insert_task( &cl_init,
-					    STARPU_R, vector_mn_handle,
-					    STARPU_VALUE, &k, sizeof(int),
-					    0 );
-			starpu_insert_task( &cl_final,
-					    STARPU_R, vector_mn_handle,
-					    STARPU_VALUE, &k, sizeof(int),
-					    0 );
+			starpu_insert_task(&cl_init,
+					   STARPU_R, vector_mn_handle,
+					   STARPU_VALUE, &k, sizeof(int),
+					   0);
+			starpu_insert_task(&cl_final,
+					   STARPU_R, vector_mn_handle,
+					   STARPU_VALUE, &k, sizeof(int),
+					   0);
 		}
 	}
 

+ 3 - 3
examples/ppm_downscaler/yuv_downscaler.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2010-2011, 2013-2015, 2017  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -48,8 +48,8 @@ void parse_args(int argc, char **argv)
 	}
 	else
 	{
-		snprintf(filename_in, 1024, "%s/examples/ppm_downscaler/%s", STARPU_BUILD_DIR, filename_in_default);
-		snprintf(filename_out, 1024, "%s/examples/ppm_downscaler/%s", STARPU_BUILD_DIR, filename_out_default);
+		snprintf(filename_in, sizeof(filename_in), "%s/examples/ppm_downscaler/%s", STARPU_BUILD_DIR, filename_in_default);
+		snprintf(filename_out, sizeof(filename_out), "%s/examples/ppm_downscaler/%s", STARPU_BUILD_DIR, filename_out_default);
 	}
 }
 

+ 5 - 1
examples/sched_ctx/parallel_tasks_with_cluster_api.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2015 Université de Bordeaux
+ * Copyright (C) 2015, 2017 Université de Bordeaux
  * Copyright (C) 2015 INRIA
  * Copyright (C) 2015 CNRS
  *
@@ -61,6 +61,10 @@ int main(int argc, char **argv)
 	int ret, i;
 	struct starpu_cluster_machine *clusters;
 
+	setenv("STARPU_NMIC","0",1);
+	setenv("STARPU_NSCC","0",1);
+	setenv("STARPU_NMPI_MS","0",1);
+
 	ret = starpu_init(NULL);
 	if (ret == -ENODEV)
 		return 77;

+ 13 - 13
examples/sched_ctx_utils/sched_ctx_utils.c

@@ -75,12 +75,12 @@ void init()
 
 	p1.id = 0;
 	p2.id = 1;
-	starpu_pthread_key_create(&key, NULL);
+	STARPU_PTHREAD_KEY_CREATE(&key, NULL);
 }
 
 void update_sched_ctx_timing_results(double flops, double avg_timing)
 {
-	unsigned *id = starpu_pthread_getspecific(key);
+	unsigned *id = STARPU_PTHREAD_GETSPECIFIC(key);
 	rv[*id].flops += flops;
 	rv[*id].avg_timing += avg_timing;
 }
@@ -90,7 +90,7 @@ void* start_bench(void *val)
 	struct params *p = (struct params*)val;
 	int i;
 
-	starpu_pthread_setspecific(key, &p->id);
+	STARPU_PTHREAD_SETSPECIFIC(key, &p->id);
 
 	if(p->ctx != 0)
 		starpu_sched_ctx_set_context(&p->ctx);
@@ -100,14 +100,14 @@ void* start_bench(void *val)
 
 	if(p->ctx != 0)
 	{
-		starpu_pthread_mutex_lock(&mut);
+		STARPU_PTHREAD_MUTEX_LOCK(&mut);
 		if(first)
 		{
 			starpu_sched_ctx_delete(p->ctx);
 		}
 
 		first = 0;
-		starpu_pthread_mutex_unlock(&mut);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&mut);
 	}
 
 	rv[p->id].flops /= NSAMPLES;
@@ -129,22 +129,22 @@ void start_2benchs(void (*bench)(unsigned, unsigned))
 	p2.nblocks = nblocks2;
 
 	starpu_pthread_t tid[2];
-	starpu_pthread_mutex_init(&mut, NULL);
+	STARPU_PTHREAD_MUTEX_INIT(&mut, NULL);
 
 	double start;
 	double end;
 
 	start = starpu_timing_now();
 
-	starpu_pthread_create(&tid[0], NULL, (void*)start_bench, (void*)&p1);
-	starpu_pthread_create(&tid[1], NULL, (void*)start_bench, (void*)&p2);
+	STARPU_PTHREAD_CREATE(&tid[0], NULL, (void*)start_bench, (void*)&p1);
+	STARPU_PTHREAD_CREATE(&tid[1], NULL, (void*)start_bench, (void*)&p2);
 
-	starpu_pthread_join(tid[0], NULL);
-	starpu_pthread_join(tid[1], NULL);
+	STARPU_PTHREAD_JOIN(tid[0], NULL);
+	STARPU_PTHREAD_JOIN(tid[1], NULL);
 
 	end = starpu_timing_now();
 
-	starpu_pthread_mutex_destroy(&mut);
+	STARPU_PTHREAD_MUTEX_DESTROY(&mut);
 
 	double timing = end - start;
 	timing /= 1000000;
@@ -169,7 +169,7 @@ void start_1stbench(void (*bench)(unsigned, unsigned))
 
 	end = starpu_timing_now();
 
-	starpu_pthread_mutex_destroy(&mut);
+	STARPU_PTHREAD_MUTEX_DESTROY(&mut);
 
 	double timing = end - start;
 	timing /= 1000000;
@@ -193,7 +193,7 @@ void start_2ndbench(void (*bench)(unsigned, unsigned))
 
 	end = starpu_timing_now();
 
-	starpu_pthread_mutex_destroy(&mut);
+	STARPU_PTHREAD_MUTEX_DESTROY(&mut);
 
 	double timing = end - start;
 	timing /= 1000000;

+ 1 - 1
examples/scheduler/schedulers.sh

@@ -29,7 +29,7 @@ then
     exit 77
 fi
 
-SCHEDULERS=`STARPU_SCHED="help" ./basic_examples/hello_world 2>&1 | awk '/\t->/ {print $1}'`
+SCHEDULERS=`STARPU_SCHED="help" ./basic_examples/hello_world 2>&1 | awk '/\t->/ {print $1}' | grep -v heteroprio`
 
 for sched in $SCHEDULERS
 do

+ 1 - 1
examples/scheduler/schedulers_context.sh

@@ -29,7 +29,7 @@ then
     exit 77
 fi
 
-SCHEDULERS=`STARPU_SCHED="help" ./basic_examples/hello_world 2>&1 | awk '/\t->/ {print $1}'`
+SCHEDULERS=`STARPU_SCHED="help" ./basic_examples/hello_world 2>&1 | awk '/\t->/ {print $1}' | grep -v pheft | grep -v peager | grep -v heteroprio`
 
 for sched in $SCHEDULERS
 do

+ 2 - 2
examples/spmv/dw_block_spmv.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009-2012, 2014-2015  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -176,7 +176,6 @@ void launch_spmv_codelets(void)
 {
 	struct starpu_task *task_tab;
 	uint8_t *is_entry_tab;
-	int ret;
 
 	/* we call one codelet per block */
 	unsigned nblocks = starpu_bcsr_get_nnz(sparse_matrix);
@@ -263,6 +262,7 @@ void launch_spmv_codelets(void)
 	unsigned task;
 	for (task = 0; task < totaltasks; task++)
 	{
+		int ret;
 		if (is_entry_tab[task])
 		{
 			nchains++;

+ 2 - 2
examples/spmv/matrix_market/mmio.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2013, 2014, 2016  CNRS
+ * Copyright (C) 2010, 2013, 2014, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -482,6 +482,6 @@ char  *mm_typecode_to_str(MM_typecode matcode)
 	else
 		return NULL;
 
-	sprintf(buffer,"%s %s %s %s", types[0], types[1], types[2], types[3]);
+	snprintf(buffer, sizeof(buffer), "%s %s %s %s", types[0], types[1], types[2], types[3]);
 	return mm_strdup(buffer);
 }

+ 2 - 2
examples/stencil/implicit-stencil.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2016, 2017  CNRS
  * Copyright (C) 2010-2012, 2014  Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -184,7 +184,7 @@ void f(unsigned task_per_worker[STARPU_NMAXWORKERS])
 	{
 		if (task_per_worker[worker])
 		{
-			char name[32];
+			char name[64];
 			starpu_worker_get_name(worker, name, sizeof(name));
 			FPRINTF(stderr,"\t%s -> %u (%2.2f%%)\n", name, task_per_worker[worker], (100.0*task_per_worker[worker])/total);
 		}

+ 2 - 2
examples/stencil/stencil.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2016, 2017  CNRS
  * Copyright (C) 2010-2012, 2014, 2017  Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -179,7 +179,7 @@ void f(unsigned task_per_worker[STARPU_NMAXWORKERS])
 	{
 		if (task_per_worker[worker])
 		{
-			char name[32];
+			char name[64];
 			starpu_worker_get_name(worker, name, sizeof(name));
 			FPRINTF(stderr,"\t%s -> %u (%2.2f%%)\n", name, task_per_worker[worker], (100.0*task_per_worker[worker])/total);
 		}

+ 11 - 0
include/fstarpu_mod.f90

@@ -1298,6 +1298,17 @@ module fstarpu_mod
                         type(c_ptr), value, intent(in) :: nparts
                 end subroutine fstarpu_vector_filter_block_shadow
 
+                ! void starpu_vector_filter_list_long(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+                subroutine fstarpu_vector_filter_list_long (father_interface,child_interface,filter,id,nparts) &
+                                bind(C,name="starpu_vector_filter_list_long")
+                        use iso_c_binding, only: c_ptr
+                        type(c_ptr), value, intent(in) :: father_interface
+                        type(c_ptr), value, intent(in) :: child_interface
+                        type(c_ptr), value, intent(in) :: filter
+                        type(c_ptr), value, intent(in) :: id
+                        type(c_ptr), value, intent(in) :: nparts
+                end subroutine fstarpu_vector_filter_list_long
+
                 ! void starpu_vector_filter_list(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
                 subroutine fstarpu_vector_filter_list (father_interface,child_interface,filter,id,nparts) &
                                 bind(C,name="starpu_vector_filter_list")

+ 2 - 2
include/pthread_win32/pthread.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010 Université Bordeaux
+ * Copyright (C) 2010, 2017 Université Bordeaux
  * Copyright (C) 2010, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -226,7 +226,7 @@ again:
   switch (WaitForSingleObject(*mutex, INFINITE)) {
     default:
     case WAIT_FAILED:
-      return unixErrno();;
+      return unixErrno();
     case WAIT_ABANDONED:
     case WAIT_OBJECT_0:
       return 0;

+ 1 - 1
include/schedulers/starpu_heteroprio.h

@@ -42,8 +42,8 @@ enum starpu_heteroprio_types
 	STARPU_CUDA_IDX,
 	STARPU_OPENCL_IDX,
 	STARPU_MIC_IDX,
-	STARPU_MPI_MS_IDX,
 	STARPU_SCC_IDX,
+	STARPU_MPI_MS_IDX,
 // This will be the number of archs
 	STARPU_NB_TYPES
 };

+ 2 - 2
include/starpu.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2014, 2016-2017  Université de Bordeaux
- * Copyright (C) 2010-2015  CNRS
+ * Copyright (C) 2010-2015, 2017  CNRS
  * Copyright (C) 2014, 2016  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -145,8 +145,8 @@ struct starpu_conf
 int starpu_conf_init(struct starpu_conf *conf);
 
 int starpu_init(struct starpu_conf *conf) STARPU_WARN_UNUSED_RESULT;
-
 int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv);
+int starpu_is_initialized(void);
 
 void starpu_pause(void);
 void starpu_resume(void);

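starpu_is_initialized() is new here. A short sketch of how a library layered on top of StarPU might use it to initialize lazily, assuming it returns non-zero once starpu_init() has completed:

    int my_lib_init(void)
    {
        if (!starpu_is_initialized())
        {
            int ret = starpu_init(NULL);
            if (ret != 0)
                return ret;   /* includes -ENODEV when no worker is available */
        }
        /* StarPU is now usable. */
        return 0;
    }
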
+ 3 - 1
include/starpu_config.h.in

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009-2016  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016, 2017  CNRS
- * Copyright (C) 2014  INRIA
+ * Copyright (C) 2014, 2017  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -88,6 +88,7 @@
 #undef STARPU_MAXNODES
 #undef STARPU_NMAXBUFS
 #undef STARPU_MAXCPUS
+#undef STARPU_MAXNUMANODES
 #undef STARPU_MAXCUDADEVS
 #undef STARPU_MAXOPENCLDEVS
 #undef STARPU_MAXMICDEVS
@@ -107,6 +108,7 @@
 #undef STARPU_HAVE_SETENV
 #undef STARPU_HAVE_UNSETENV
 #undef STARPU_HAVE_UNISTD_H
+#undef STARPU_HAVE_HDF5
 
 #undef STARPU_FXT_LOCK_TRACES
 

+ 5 - 1
include/starpu_data.h

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2010-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  CNRS
- * Copyright (C) 2016  Inria
+ * Copyright (C) 2016, 2017  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -132,6 +132,10 @@ enum starpu_node_kind
 
 unsigned starpu_worker_get_memory_node(unsigned workerid);
 unsigned starpu_memory_nodes_get_count(void);
+int starpu_memory_nodes_get_numa_count(void);
+int starpu_memory_nodes_numa_id_to_devid(int osid);
+int starpu_memory_nodes_numa_devid_to_id(unsigned id);
+
 enum starpu_node_kind starpu_node_get_kind(unsigned node);
 
 void starpu_data_set_wt_mask(starpu_data_handle_t handle, uint32_t wt_mask);

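The three new calls expose the NUMA memory nodes StarPU discovered. A heavily hedged sketch, assuming starpu_memory_nodes_numa_id_to_devid() maps an OS NUMA node id to StarPU's device id and starpu_memory_nodes_numa_devid_to_id() is its inverse:

    /* How many NUMA memory nodes did StarPU detect? */
    int nnuma = starpu_memory_nodes_get_numa_count();
    fprintf(stderr, "%d NUMA memory node(s)\n", nnuma);

    /* Assumed semantics: translate OS NUMA node 0 into StarPU's device id,
     * then back again (the two calls are presumed to be inverses). */
    int devid = starpu_memory_nodes_numa_id_to_devid(0);
    int osid  = starpu_memory_nodes_numa_devid_to_id(devid);
    fprintf(stderr, "OS node 0 -> devid %d -> OS node %d\n", devid, osid);
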
+ 2 - 1
include/starpu_data_filters.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012, 2015  Université de Bordeaux
+ * Copyright (C) 2010-2012, 2015, 2017  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013  CNRS
  *
@@ -69,6 +69,7 @@ void starpu_matrix_filter_vertical_block_shadow(void *father_interface, void *ch
 
 void starpu_vector_filter_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 void starpu_vector_filter_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+void starpu_vector_filter_list_long(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 void starpu_vector_filter_list(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 void starpu_vector_filter_divide_in_2(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 

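starpu_vector_filter_list_long() mirrors starpu_vector_filter_list() but is assumed to read the per-child lengths from filter_arg_ptr as an array of long rather than uint32_t. A sketch of partitioning a previously registered vector handle into unevenly sized pieces with it:

    long child_len[3] = { 1000, 500, 2500 };   /* lengths of the three children */

    struct starpu_data_filter f =
    {
        .filter_func = starpu_vector_filter_list_long,
        .nchildren = 3,
        .filter_arg_ptr = child_len,   /* assumed: array of long child lengths */
    };

    starpu_data_partition(vector_handle, &f);  /* vector_handle: a registered vector */
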
+ 4 - 0
include/starpu_data_interfaces.h

@@ -137,6 +137,8 @@ struct starpu_data_interface_ops
 
 	int (*pack_data) (starpu_data_handle_t handle, unsigned node, void **ptr, starpu_ssize_t *count);
 	int (*unpack_data) (starpu_data_handle_t handle, unsigned node, void *ptr, size_t count);
+
+	char *name;
 };
 
 int starpu_data_interface_get_next_id(void);
@@ -481,6 +483,8 @@ size_t starpu_data_get_size(starpu_data_handle_t handle);
 
 starpu_data_handle_t starpu_data_lookup(const void *ptr);
 
+int starpu_data_get_home_node(starpu_data_handle_t handle);
+
 #ifdef __cplusplus
 }
 #endif

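Two additions here: data interface operation tables gain a human-readable name field, and starpu_data_get_home_node() returns the node a handle was registered on. A hedged sketch for a user-defined interface (struct my_interface and the omitted callbacks are hypothetical):

    static struct starpu_data_interface_ops my_interface_ops =
    {
        /* .register_data_handle, .allocate_data_on_node, ... (omitted) */
        .interface_size = sizeof(struct my_interface),
        .name = "MY_INTERFACE",   /* new field: a name for traces and debug output */
    };

    /* Later, given a registered handle: */
    int home = starpu_data_get_home_node(handle);
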
+ 5 - 4
include/starpu_disk.h

@@ -27,7 +27,7 @@ struct starpu_disk_ops
 	 void *  (*plug)   (void *parameter, starpu_ssize_t size);
 	 void    (*unplug) (void *base);
 
-	 int    (*bandwidth)    (unsigned node);
+	 int    (*bandwidth)    (unsigned node, void *base);
 
 	 void *  (*alloc)  (void *base, size_t size);
 	 void    (*free)   (void *base, void *obj, size_t size);
@@ -38,13 +38,13 @@ struct starpu_disk_ops
 	 int     (*read)   (void *base, void *obj, void *buf, off_t offset, size_t size);
 	 int     (*write)  (void *base, void *obj, const void *buf, off_t offset, size_t size);
 
-	 int	(*full_read)    (void * base, void * obj, void ** ptr, size_t * size);
+	 int	(*full_read)    (void * base, void * obj, void ** ptr, size_t * size, unsigned dst_node);
 	 int 	(*full_write)   (void * base, void * obj, void * ptr, size_t size);
 
 	 void *  (*async_write)  (void *base, void *obj, void *buf, off_t offset, size_t size);
 	 void *  (*async_read)   (void *base, void *obj, void *buf, off_t offset, size_t size);
 
-	 void *	(*async_full_read)    (void * base, void * obj, void ** ptr, size_t * size);
+	 void *	(*async_full_read)    (void * base, void * obj, void ** ptr, size_t * size, unsigned dst_node);
 	 void *	(*async_full_write)   (void * base, void * obj, void * ptr, size_t size);
 
 	 void *  (*copy)   (void *base_src, void* obj_src, off_t offset_src,  void *base_dst, void* obj_dst, off_t offset_dst, size_t size);
@@ -57,6 +57,7 @@ struct starpu_disk_ops
 
 /* Posix functions to use disk memory */
 extern struct starpu_disk_ops starpu_disk_stdio_ops;
+extern struct starpu_disk_ops starpu_disk_hdf5_ops;
 extern struct starpu_disk_ops starpu_disk_unistd_ops;
 extern struct starpu_disk_ops starpu_disk_unistd_o_direct_ops;
 extern struct starpu_disk_ops starpu_disk_leveldb_ops;
@@ -67,7 +68,7 @@ void *starpu_disk_open(unsigned node, void *pos, size_t size);
 
 int starpu_disk_register(struct starpu_disk_ops *func, void *parameter, starpu_ssize_t size);
 
-#define STARPU_DISK_SIZE_MIN (64*1024*1024)
+#define STARPU_DISK_SIZE_MIN (16*1024*1024)
 
 extern int starpu_disk_swap_node;
 

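The disk ops now receive the base pointer in bandwidth() and the destination node in the full-read methods, an HDF5 backend is declared, and the minimum disk size drops from 64 MiB to 16 MiB. Registering a disk node itself is unchanged; a minimal sketch using the unistd backend:

    /* Use /tmp as backing storage; the size (200 MiB here) must be at least
     * STARPU_DISK_SIZE_MIN, which this change lowers to 16 MiB. */
    int disk_node = starpu_disk_register(&starpu_disk_unistd_ops,
                                         (void *) "/tmp", 200 * 1024 * 1024);
    if (disk_node < 0)   /* assumption: a negative value signals an error */
        fprintf(stderr, "could not register the disk node\n");
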
+ 2 - 2
include/starpu_mpi_ms.h

@@ -36,5 +36,5 @@ starpu_mpi_ms_kernel_t starpu_mpi_ms_get_kernel(starpu_mpi_ms_func_symbol_t symb
 }
 #endif
 
-#endif /* STARPU_USE_MIC */
-#endif /* __STARPU_MIC_H__ */
+#endif /* STARPU_USE_MPI_MASTER_SLAVE */
+#endif /* __STARPU_MPI_MS_H__ */

+ 2 - 0
include/starpu_perfmodel.h

@@ -170,6 +170,7 @@ struct starpu_perfmodel_arch *starpu_worker_get_perf_archtype(int workerid, unsi
 int starpu_perfmodel_get_narch_combs();
 int starpu_perfmodel_arch_comb_add(int ndevices, struct starpu_perfmodel_device* devices);
 int starpu_perfmodel_arch_comb_get(int ndevices, struct starpu_perfmodel_device *devices);
+struct starpu_perfmodel_arch *starpu_perfmodel_arch_comb_fetch(int comb);
 
 struct starpu_perfmodel_per_arch *starpu_perfmodel_get_model_per_arch(struct starpu_perfmodel *model, struct starpu_perfmodel_arch *arch, unsigned impl);
 struct starpu_perfmodel_per_arch *starpu_perfmodel_get_model_per_devices(struct starpu_perfmodel *model, int impl, ...);
@@ -182,6 +183,7 @@ char* starpu_perfmodel_get_archtype_name(enum starpu_worker_archtype archtype);
 void starpu_perfmodel_get_arch_name(struct starpu_perfmodel_arch *arch, char *archname, size_t maxlen, unsigned nimpl);
 
 double starpu_perfmodel_history_based_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, uint32_t footprint);
+void starpu_perfmodel_initialize(void);
 int starpu_perfmodel_list(FILE *output);
 void starpu_perfmodel_print(struct starpu_perfmodel *model, struct starpu_perfmodel_arch *arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output);
 int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char *parameter, uint32_t *footprint, FILE *output);

+ 4 - 4
include/starpu_sched_component.h

@@ -46,9 +46,9 @@ struct starpu_sched_component
 	struct starpu_bitmap *workers_in_ctx;
 	void *data;
 	char *name;
-	int nchildren;
+	unsigned nchildren;
 	struct starpu_sched_component **children;
-	int nparents;
+	unsigned nparents;
 	struct starpu_sched_component **parents;
 
 	void (*add_child)(struct starpu_sched_component *component, struct starpu_sched_component *child);
@@ -60,7 +60,7 @@ struct starpu_sched_component
 	struct starpu_task *(*pull_task)(struct starpu_sched_component *);
 
 	int (*can_push)(struct starpu_sched_component *component);
-	void (*can_pull)(struct starpu_sched_component *component);
+	int (*can_pull)(struct starpu_sched_component *component);
 
 	double (*estimated_load)(struct starpu_sched_component *component);
 	double (*estimated_end)(struct starpu_sched_component *component);
@@ -94,7 +94,7 @@ int starpu_sched_component_push_task(struct starpu_sched_component *from, struct
 struct starpu_task *starpu_sched_tree_pop_task(unsigned sched_ctx);
 struct starpu_task *starpu_sched_component_pull_task(struct starpu_sched_component *from, struct starpu_sched_component *to);
 struct starpu_task* starpu_sched_component_pump_downstream(struct starpu_sched_component *component, int* success);
-void starpu_sched_component_send_can_push_to_parents(struct starpu_sched_component * component);
+int starpu_sched_component_send_can_push_to_parents(struct starpu_sched_component * component);
 
 void starpu_sched_tree_add_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers);
 void starpu_sched_tree_remove_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers);

+ 0 - 3
include/starpu_scheduler.h

@@ -60,10 +60,7 @@ struct starpu_sched_policy **starpu_sched_get_predefined_policies();
 void starpu_worker_get_sched_condition(int workerid, starpu_pthread_mutex_t **sched_mutex, starpu_pthread_cond_t **sched_cond);
 unsigned long starpu_task_get_job_id(struct starpu_task *task);
 
-/* This function must be called to wake up a worker that is sleeping on the cond. 
- * It returns 0 whenever the worker is not in a sleeping state or has the state_keep_awake flag on */
 int starpu_wake_worker_no_relax(int workerid);
-/* This is a version of starpu_wake_worker which assumes that the sched mutex is locked */
 int starpu_wake_worker_locked(int workerid);
 
 int starpu_worker_can_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl);

+ 13 - 6
include/starpu_task.h

@@ -182,16 +182,19 @@ struct starpu_task
 	unsigned destroy:1;
 	unsigned regenerate:1;
 
-	unsigned scheduled:1;
+	unsigned mf_skip:1;
 
-	unsigned int mf_skip:1;
+	unsigned no_submitorder:1; /* do not allocate a submitorder id for this task */
 
 	unsigned workerid;
 	unsigned workerorder;
+	uint32_t *workerids;
+	unsigned workerids_len;
 
 	int priority;
 
 	enum starpu_task_status status;
+	unsigned char scheduled:1;
 
 	int magic;
 
@@ -219,15 +222,18 @@ struct starpu_task
 #endif
 };
 
+/* Note: remember to update starpu_task_init as well */
 #define STARPU_TASK_INITIALIZER 			\
 {							\
 	.cl = NULL,					\
+	.where = -1,					\
 	.cl_arg = NULL,					\
 	.cl_arg_size = 0,				\
 	.callback_func = NULL,				\
 	.callback_arg = NULL,				\
 	.priority = STARPU_DEFAULT_PRIO,		\
 	.use_tag = 0,					\
+	.sequential_consistency = 1,			\
 	.synchronous = 0,				\
 	.execute_on_a_specific_worker = 0,		\
 	.workerorder = 0,				\
@@ -237,15 +243,16 @@ struct starpu_task
 	.regenerate = 0,				\
 	.status = STARPU_TASK_INVALID,			\
 	.profiling_info = NULL,				\
-	.predicted = -1.0,				\
-	.predicted_transfer = -1.0,			\
+	.predicted = NAN,				\
+	.predicted_transfer = NAN,			\
+	.predicted_start = NAN,				\
 	.starpu_private = NULL,				\
 	.magic = 42,                  			\
-	.sched_ctx = 0,					\
+	.sched_ctx = STARPU_NMAX_SCHED_CTXS,		\
 	.hypervisor_tag = 0,				\
 	.flops = 0.0,					\
 	.scheduled = 0,					\
-	.prefetched = 0,					\
+	.prefetched = 0,				\
 	.dyn_handles = NULL,				\
 	.dyn_interfaces = NULL,				\
 	.dyn_modes = NULL,				\

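The initializer now matches starpu_task_init() again: predictions default to NAN, sched_ctx to STARPU_NMAX_SCHED_CTXS, sequential_consistency to 1 and where to -1. A sketch of initializing a stack-allocated task (my_codelet and my_handle are hypothetical):

    struct starpu_task task;
    starpu_task_init(&task);        /* same defaults as STARPU_TASK_INITIALIZER */
    task.cl = &my_codelet;
    task.handles[0] = my_handle;
    task.synchronous = 1;           /* block in starpu_task_submit() until completion */

    int ret = starpu_task_submit(&task);
    STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");

    starpu_task_clean(&task);       /* release resources of a non-dynamically allocated task */
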
+ 11 - 1
include/starpu_thread_util.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2012-2014, 2016-2017  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -47,6 +47,16 @@
 	}									\
 } while (0)
 
+#define STARPU_PTHREAD_JOIN(thread, retval) do {		    	\
+	int p_ret =  starpu_pthread_join((thread), (retval)); \
+	if (STARPU_UNLIKELY(p_ret != 0)) {					\
+		fprintf(stderr,							\
+			"%s:%d starpu_pthread_join: %s\n",			\
+			__FILE__, __LINE__, strerror(p_ret));			\
+		STARPU_ABORT();							\
+	}									\
+} while (0)
+
 /*
  * Encapsulation of the starpu_pthread_mutex_* functions.
  */

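STARPU_PTHREAD_JOIN completes the family of checked wrappers: it aborts with file, line and strerror output instead of silently ignoring a failed join. Typical use, paired with STARPU_PTHREAD_CREATE (worker_thread and arg are hypothetical):

    starpu_pthread_t tid;
    STARPU_PTHREAD_CREATE(&tid, NULL, worker_thread, &arg);
    /* ... */
    STARPU_PTHREAD_JOIN(tid, NULL);   /* aborts with a message if starpu_pthread_join() fails */
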
+ 35 - 4
include/starpu_util.h

@@ -159,13 +159,44 @@ extern "C"
 } while(0)
 
 #if defined(STARPU_HAVE_STRERROR_R)
+#if (! defined(__GLIBC__) || !__GLIBC__) || ((_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600) && (! defined(_GNU_SOURCE)))
+/* XSI-compliant version of strerror_r returns an int */
+#define starpu_strerror_r(errnum, buf, buflen) \
+	do \
+	{ \
+		int _ret = strerror_r((errnum), (buf), (buflen)); \
+		STARPU_ASSERT(_ret == 0); \
+	} \
+	while (0)
+#else
+/* GNU-specific version of strerror_r returns a char * */
+#define starpu_strerror_r(errnum, buf, buflen) \
+	do \
+	{ \
+		char * const _user_buf = (buf); \
+		const size_t _user_buflen = (buflen); \
+		/* the GNU-specific behaviour when 'buf' == NULL cannot be emulated with the XSI-compliant version */ \
+		STARPU_ASSERT((buf) != NULL); \
+		char * _tmp_buf = strerror_r((errnum), _user_buf, _user_buflen); \
+		if (_tmp_buf != _user_buf) \
+		{ \
+			if (_user_buflen > 0) \
+			{ \
+				strncpy(_user_buf, _tmp_buf, _user_buflen); \
+				_user_buf[_user_buflen-1] = '\0'; \
+			} \
+		} \
+	} \
+	while (0)
+#endif /* strerror_r ABI version */
+
 #  define STARPU_CHECK_RETURN_VALUE(err, message, ...) {if (STARPU_UNLIKELY(err != 0)) { \
-			char xmessage[256]; char *_strerror = strerror_r(-err, xmessage, 256); \
-			fprintf(stderr, "[starpu] Unexpected value: <%d:%s> returned for " message "\n", err, _strerror==NULL?"":xmessage, ## __VA_ARGS__); \
+			char xmessage[256]; starpu_strerror_r(-err, xmessage, 256); \
+			fprintf(stderr, "[starpu] Unexpected value: <%d:%s> returned for " message "\n", err, xmessage, ## __VA_ARGS__); \
 			STARPU_ABORT(); }}
 #  define STARPU_CHECK_RETURN_VALUE_IS(err, value, message, ...) {if (STARPU_UNLIKELY(err != value)) { \
-			char xmessage[256]; char *_strerror=strerror_r(-err, xmessage, 256); \
-			fprintf(stderr, "[starpu] Unexpected value: <%d!=%d:%s> returned for " message "\n", err, value, _strerror==NULL?"":xmessage, ## __VA_ARGS__); \
+			char xmessage[256]; starpu_strerror_r(-err, xmessage, 256); \
+			fprintf(stderr, "[starpu] Unexpected value: <%d!=%d:%s> returned for " message "\n", err, value, xmessage, ## __VA_ARGS__); \
 			STARPU_ABORT(); }}
 #else
 #  define STARPU_CHECK_RETURN_VALUE(err, message, ...) {if (STARPU_UNLIKELY(err != 0)) { \

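starpu_strerror_r() papers over the two strerror_r() ABIs: the XSI variant fills the buffer and returns an int, while the GNU variant may return a pointer to a static string, which the macro then copies back into the caller's buffer. Callers simply write:

    #include <errno.h>   /* for EINVAL in this sketch */

    char buf[256];
    starpu_strerror_r(EINVAL, buf, sizeof(buf));   /* works with either strerror_r() flavour */
    fprintf(stderr, "error: %s\n", buf);
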
+ 8 - 0
include/starpu_worker.h

@@ -25,6 +25,10 @@
 #include <starpu_thread.h>
 #include <starpu_task.h>
 
+#ifdef STARPU_HAVE_HWLOC
+#include <hwloc.h>
+#endif
+
 #ifdef __cplusplus
 extern "C"
 {
@@ -162,6 +166,10 @@ void starpu_worker_unlock_self(void);
 
 int starpu_wake_worker_relax(int workerid);
 
+#ifdef STARPU_HAVE_HWLOC
+hwloc_cpuset_t starpu_worker_get_hwloc_cpuset(int workerid);
+#endif
+
 #ifdef __cplusplus
 }
 #endif

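starpu_worker_get_hwloc_cpuset() exposes the hwloc binding of a worker. A sketch that prints it, assuming the returned cpuset is a copy the caller must free:

    #ifdef STARPU_HAVE_HWLOC
    char buf[128];
    hwloc_cpuset_t set = starpu_worker_get_hwloc_cpuset(workerid);  /* workerid: a valid worker */
    hwloc_bitmap_snprintf(buf, sizeof(buf), set);
    fprintf(stderr, "worker %d runs on cpuset %s\n", workerid, buf);
    hwloc_bitmap_free(set);   /* assumption: the cpuset is duplicated for the caller */
    #endif
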
+ 1 - 29
mpi/examples/Makefile.am

@@ -112,7 +112,7 @@ endif
 endif
 
 AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
-LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
+LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ ../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la -lm @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include -I$(top_srcdir)/mpi/include
 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
 
@@ -124,12 +124,6 @@ examplebin_PROGRAMS +=				\
 	stencil/stencil5			\
 	stencil/stencil5_lb
 
-stencil_stencil5_LDADD =		\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la -lm
-
-stencil_stencil5_lb_LDADD =		\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la -lm
-
 starpu_mpi_EXAMPLES	+=	\
 	stencil/stencil5	\
 	stencil/stencil5_lb
@@ -152,7 +146,6 @@ examplebin_PROGRAMS += 			\
 	mpi_lu/plu_outofcore_example_double
 
 mpi_lu_plu_example_float_LDADD =	\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	$(STARPU_LIBNUMA_LDFLAGS)				\
 	$(STARPU_BLAS_LDFLAGS) -lm
 
@@ -164,7 +157,6 @@ mpi_lu_plu_example_float_SOURCES =	\
 	../../examples/common/blas.c
 
 mpi_lu_plu_example_double_LDADD =	\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	$(STARPU_LIBNUMA_LDFLAGS)				\
 	$(STARPU_BLAS_LDFLAGS) -lm
 
@@ -176,7 +168,6 @@ mpi_lu_plu_example_double_SOURCES =	\
 	../../examples/common/blas.c
 
 mpi_lu_plu_implicit_example_float_LDADD =	\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	$(STARPU_LIBNUMA_LDFLAGS)				\
 	$(STARPU_BLAS_LDFLAGS) -lm
 
@@ -188,7 +179,6 @@ mpi_lu_plu_implicit_example_float_SOURCES =	\
 	../../examples/common/blas.c
 
 mpi_lu_plu_implicit_example_double_LDADD =	\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	$(STARPU_LIBNUMA_LDFLAGS)				\
 	$(STARPU_BLAS_LDFLAGS) -lm
 
@@ -200,7 +190,6 @@ mpi_lu_plu_implicit_example_double_SOURCES =	\
 	../../examples/common/blas.c
 
 mpi_lu_plu_outofcore_example_float_LDADD =	\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	$(STARPU_LIBNUMA_LDFLAGS)				\
 	$(STARPU_BLAS_LDFLAGS) -lm
 
@@ -212,7 +201,6 @@ mpi_lu_plu_outofcore_example_float_SOURCES =	\
 	../../examples/common/blas.c
 
 mpi_lu_plu_outofcore_example_double_LDADD =	\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	$(STARPU_LIBNUMA_LDFLAGS)				\
 	$(STARPU_BLAS_LDFLAGS) -lm
 
@@ -245,7 +233,6 @@ matrix_decomposition_mpi_cholesky_SOURCES	=		\
 	../../examples/common/blas.c
 
 matrix_decomposition_mpi_cholesky_LDADD =			\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	$(STARPU_BLAS_LDFLAGS) -lm
 
 matrix_decomposition_mpi_cholesky_distributed_SOURCES =	\
@@ -258,7 +245,6 @@ matrix_decomposition_mpi_cholesky_distributed_SOURCES =	\
 	../../examples/common/blas.c
 
 matrix_decomposition_mpi_cholesky_distributed_LDADD =	\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	$(STARPU_BLAS_LDFLAGS) -lm
 
 if !STARPU_SIMGRID
@@ -281,7 +267,6 @@ matrix_mult_mm_SOURCES	=		\
 	matrix_mult/mm.c
 
 matrix_mult_mm_LDADD =			\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	-lm
 
 if !STARPU_SIMGRID
@@ -308,7 +293,6 @@ native_fortran_nf_mm_SOURCES	=			\
 	native_fortran/nf_mm.f90
 
 native_fortran_nf_mm_LDADD =					\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	-lm
 
 native_fortran_nf_basic_ring_SOURCES	=			\
@@ -317,7 +301,6 @@ native_fortran_nf_basic_ring_SOURCES	=			\
 	native_fortran/nf_basic_ring.f90
 
 native_fortran_nf_basic_ring_LDADD =					\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	-lm
 
 if !STARPU_SIMGRID
@@ -341,9 +324,6 @@ complex_mpi_complex_SOURCES =		\
 	complex/mpi_complex.c		\
 	../../examples/interface/complex_interface.c
 
-complex_mpi_complex_LDADD =		\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-
 starpu_mpi_EXAMPLES	+=			\
 	complex/mpi_complex
 endif
@@ -360,9 +340,6 @@ user_datatype_user_datatype_SOURCES =		\
 	user_datatype/user_datatype.c		\
 	user_datatype/my_interface.c
 
-user_datatype_user_datatype_LDADD =		\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-
 if !STARPU_SIMGRID
 starpu_mpi_EXAMPLES	+=			\
 	user_datatype/user_datatype
@@ -378,11 +355,6 @@ examplebin_PROGRAMS +=			\
 	comm/comm			\
 	comm/mix_comm
 
-comm_comm_LDADD =		\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-comm_mix_comm_LDADD =		\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-
 if !STARPU_SIMGRID
 starpu_mpi_EXAMPLES	+=			\
 	comm/comm				\

+ 2 - 2
mpi/examples/complex/mpi_complex.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012, 2013, 2015, 2016  CNRS
+ * Copyright (C) 2012, 2013, 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -122,5 +122,5 @@ int main(int argc, char **argv)
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 
-	if (rank == 0) return !compare; else return 0;
+	return (rank == 0) ? !compare : 0;
 }

+ 3 - 2
mpi/examples/matrix_decomposition/mpi_decomposition_params.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010, 2015-2017  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -90,7 +90,8 @@ void parse_args(int argc, char **argv, int nodes)
                 }
         }
 
-        if (nblocks > size) nblocks = size;
+        if (nblocks > size)
+		nblocks = size;
 
 	if (dblockx == -1 || dblocky == -1)
 	{

+ 1 - 1
mpi/examples/mpi_lu/pxlu.c

@@ -302,7 +302,7 @@ static void create_task_11(unsigned k)
 	if (get_block_rank(k, k) == rank)
 	{
 #ifdef VERBOSE_INIT
-		fprintf(stderr, "CREATE real task 11(%u) (TAG11_SAVE(%u) = %lx) on node %d\n", k, k, TAG11_SAVE(k), rank);
+		fprintf(stderr, "CREATE real task 11(%u) (TAG11_SAVE(%u) = %llux) on node %d\n", k, k, (unsigned long long) TAG11_SAVE(k), rank);
 #endif
 		create_task_11_real(k);
 	}

+ 3 - 2
mpi/examples/mpi_lu/pxlu.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2014  Université de Bordeaux
- * Copyright (C) 2010, 2012, 2014  CNRS
+ * Copyright (C) 2010, 2012, 2014, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -31,7 +31,8 @@
 //#define SINGLE_TMP11	1
 //#define SINGLE_TMP1221	1
 
-struct debug_info {
+struct debug_info
+{
 	unsigned i;
 	unsigned j;
 	unsigned k;

+ 15 - 15
mpi/examples/user_datatype/my_interface.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2015, 2016  CNRS
+ * Copyright (C) 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -69,7 +69,7 @@ void _starpu_my_interface_datatype_allocate(MPI_Datatype *mpi_datatype)
 
 void starpu_my_interface_datatype_allocate(starpu_data_handle_t handle, MPI_Datatype *mpi_datatype)
 {
-	(handle);
+	(void)handle;
 	_starpu_my_interface_datatype_allocate(mpi_datatype);
 }
 
@@ -120,20 +120,20 @@ static void data_register_data_handle(starpu_data_handle_t handle, unsigned home
 
 static starpu_ssize_t data_allocate_data_on_node(void *data_interface, unsigned node)
 {
-	(data_interface);
-	(node);
+	(void)data_interface;
+	(void)node;
 	return 0;
 }
 
 static void data_free_data_on_node(void *data_interface, unsigned node)
 {
-	(data_interface);
-	(node);
+	(void)data_interface;
+	(void)node;
 }
 
 static size_t data_get_size(starpu_data_handle_t handle)
 {
-	(handle);
+	(void)handle;
 	return sizeof(int) + sizeof(char);
 }
 
@@ -144,20 +144,20 @@ static uint32_t data_footprint(starpu_data_handle_t handle)
 
 static int data_pack_data(starpu_data_handle_t handle, unsigned node, void **ptr, starpu_ssize_t *count)
 {
-	(handle);
-	(node);
-	(ptr);
-	(count);
+	(void)handle;
+	(void)node;
+	(void)ptr;
+	(void)count;
 	STARPU_ASSERT_MSG(0, "The data interface has been registered with starpu_mpi_datatype_register(). Calling the pack_data function should not happen\n");
 	return 0;
 }
 
 static int data_unpack_data(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count)
 {
-	(handle);
-	(node);
-	(ptr);
-	(count);
+	(void)handle;
+	(void)node;
+	(void)ptr;
+	(void)count;
 	STARPU_ASSERT_MSG(0, "The data interface has been registered with starpu_mpi_datatype_register(). Calling the unpack_data function should not happen\n");
 	return 0;
 }

+ 2 - 2
mpi/examples/user_datatype/user_datatype.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2015, 2016  CNRS
+ * Copyright (C) 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -109,5 +109,5 @@ int main(int argc, char **argv)
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 
-	if (rank == 0) return !compare; else return 0;
+	return (rank == 0) ? !compare : 0;
 }

+ 105 - 0
mpi/include/fstarpu_mpi_mod.f90

@@ -1,6 +1,7 @@
 ! StarPU --- Runtime system for heterogeneous multicore architectures.
 !
 ! Copyright (C) 2016  Inria
+! Copyright (C) 2017  Université de Bordeaux
 !
 ! StarPU is free software; you can redistribute it and/or modify
 ! it under the terms of the GNU Lesser General Public License as published by
@@ -32,6 +33,20 @@ module fstarpu_mpi_mod
                         integer(c_int), value, intent(in) :: mpi_comm
                 end function fstarpu_mpi_isend
 
+                ! == mpi/include/starpu_mpi.h ==
+                ! int starpu_mpi_isend_prio(starpu_data_handle_t data_handle, starpu_mpi_req *req, int dest, int mpi_tag, int prio, MPI_Comm comm);
+                function fstarpu_mpi_isend_prio (dh, mpi_req, dst, mpi_tag, prio, mpi_comm) bind(C)
+                        use iso_c_binding
+                        implicit none
+                        integer(c_int) :: fstarpu_mpi_isend_prio
+                        type(c_ptr), value, intent(in) :: dh
+                        type(c_ptr), value, intent(in) :: mpi_req
+                        integer(c_int), value, intent(in) :: dst
+                        integer(c_int), value, intent(in) :: mpi_tag
+                        integer(c_int), value, intent(in) :: prio
+                        integer(c_int), value, intent(in) :: mpi_comm
+                end function fstarpu_mpi_isend_prio
+
                 ! int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *req, int source, int mpi_tag, MPI_Comm comm);
                 function fstarpu_mpi_irecv (dh, mpi_req, src, mpi_tag, mpi_comm) bind(C)
                         use iso_c_binding
@@ -55,6 +70,18 @@ module fstarpu_mpi_mod
                         integer(c_int), value, intent(in) :: mpi_comm
                 end function fstarpu_mpi_send
 
+                ! int starpu_mpi_send_prio(starpu_data_handle_t data_handle, int dest, int mpi_tag, int prio, MPI_Comm comm);
+                function fstarpu_mpi_send_prio (dh, dst, mpi_tag, prio, mpi_comm) bind(C)
+                        use iso_c_binding
+                        implicit none
+                        integer(c_int) :: fstarpu_mpi_send_prio
+                        type(c_ptr), value, intent(in) :: dh
+                        integer(c_int), value, intent(in) :: dst
+                        integer(c_int), value, intent(in) :: mpi_tag
+                        integer(c_int), value, intent(in) :: prio
+                        integer(c_int), value, intent(in) :: mpi_comm
+                end function fstarpu_mpi_send_prio
+
                 ! int starpu_mpi_recv(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, MPI_Status *status);
                 function fstarpu_mpi_recv (dh, src, mpi_tag, mpi_comm, mpi_status) bind(C)
                         use iso_c_binding
@@ -80,6 +107,20 @@ module fstarpu_mpi_mod
                         type(c_ptr), value, intent(in) :: arg
                 end function fstarpu_mpi_isend_detached
 
+                ! int starpu_mpi_isend_detached_prio(starpu_data_handle_t data_handle, int dest, int mpi_tag, int prio, MPI_Comm comm, void (*callback)(void *), void *arg);
+                function fstarpu_mpi_isend_detached_prio (dh, dst, mpi_tag, prio, mpi_comm, callback, arg) bind(C)
+                        use iso_c_binding
+                        implicit none
+                        integer(c_int) :: fstarpu_mpi_isend_detached_prio
+                        type(c_ptr), value, intent(in) :: dh
+                        integer(c_int), value, intent(in) :: dst
+                        integer(c_int), value, intent(in) :: mpi_tag
+                        integer(c_int), value, intent(in) :: prio
+                        integer(c_int), value, intent(in) :: mpi_comm
+                        type(c_funptr), value, intent(in) :: callback
+                        type(c_ptr), value, intent(in) :: arg
+                end function fstarpu_mpi_isend_detached_prio
+
                 ! int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg);
                 function fstarpu_mpi_recv_detached (dh, src, mpi_tag, mpi_comm, callback, arg) bind(C)
                         use iso_c_binding
@@ -105,6 +146,19 @@ module fstarpu_mpi_mod
                         integer(c_int), value, intent(in) :: mpi_comm
                 end function fstarpu_mpi_issend
 
+                ! int starpu_mpi_issend_prio(starpu_data_handle_t data_handle, starpu_mpi_req *req, int dest, int mpi_tag, int prio, MPI_Comm comm);
+                function fstarpu_mpi_issend_prio (dh, mpi_req, dst, mpi_tag, prio, mpi_comm) bind(C)
+                        use iso_c_binding
+                        implicit none
+                        integer(c_int) :: fstarpu_mpi_issend_prio
+                        type(c_ptr), value, intent(in) :: dh
+                        type(c_ptr), value, intent(in) :: mpi_req
+                        integer(c_int), value, intent(in) :: dst
+                        integer(c_int), value, intent(in) :: mpi_tag
+                        integer(c_int), value, intent(in) :: prio
+                        integer(c_int), value, intent(in) :: mpi_comm
+                end function fstarpu_mpi_issend_prio
+
                 ! int starpu_mpi_issend_detached(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg);
                 function fstarpu_mpi_issend_detached (dh, dst, mpi_tag, mpi_comm, callback, arg) bind(C)
                         use iso_c_binding
@@ -118,6 +172,20 @@ module fstarpu_mpi_mod
                         type(c_ptr), value, intent(in) :: arg
                 end function fstarpu_mpi_issend_detached
 
+                ! int starpu_mpi_issend_detached_prio(starpu_data_handle_t data_handle, int dest, int mpi_tag, int prio, MPI_Comm comm, void (*callback)(void *), void *arg);
+                function fstarpu_mpi_issend_detached_prio (dh, dst, mpi_tag, prio, mpi_comm, callback, arg) bind(C)
+                        use iso_c_binding
+                        implicit none
+                        integer(c_int) :: fstarpu_mpi_issend_detached_prio
+                        type(c_ptr), value, intent(in) :: dh
+                        integer(c_int), value, intent(in) :: dst
+                        integer(c_int), value, intent(in) :: mpi_tag
+                        integer(c_int), value, intent(in) :: prio
+                        integer(c_int), value, intent(in) :: mpi_comm
+                        type(c_funptr), value, intent(in) :: callback
+                        type(c_ptr), value, intent(in) :: arg
+                end function fstarpu_mpi_issend_detached_prio
+
                 ! int starpu_mpi_wait(starpu_mpi_req *req, MPI_Status *status);
                 function fstarpu_mpi_wait(req,st) bind(C,name="starpu_mpi_wait")
                         use iso_c_binding
@@ -235,6 +303,15 @@ module fstarpu_mpi_mod
                         type(c_ptr), value, intent(in) :: dh
                 end subroutine fstarpu_mpi_redux_data
 
+                ! void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle, int prio);
+                subroutine fstarpu_mpi_redux_data_prio(mpi_comm,dh, prio) bind(C)
+                        use iso_c_binding
+                        implicit none
+                        integer(c_int), value, intent(in) :: mpi_comm
+                        type(c_ptr), value, intent(in) :: dh
+                        integer(c_int), value, intent(in) :: prio
+                end subroutine fstarpu_mpi_redux_data_prio
+
                 ! int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg);
                 function fstarpu_mpi_scatter_detached (dhs, cnt, root, mpi_comm, scallback, sarg, rcallback, rarg) bind(C)
                         use iso_c_binding
@@ -278,6 +355,19 @@ module fstarpu_mpi_mod
                         type(c_ptr), value, intent(in) :: starpu_tag
                 end function fstarpu_mpi_isend_detached_unlock_tag
 
+                ! int starpu_mpi_isend_detached_unlock_tag_prio(starpu_data_handle_t data_handle, int dest, int mpi_tag, int prio, MPI_Comm comm, starpu_tag_t tag);
+                function fstarpu_mpi_isend_detached_unlock_tag_prio (dh, dst, mpi_tag, prio, mpi_comm, starpu_tag) bind(C)
+                        use iso_c_binding
+                        implicit none
+                        integer(c_int) :: fstarpu_mpi_isend_detached_unlock_tag_prio
+                        type(c_ptr), value, intent(in) :: dh
+                        integer(c_int), value, intent(in) :: dst
+                        integer(c_int), value, intent(in) :: mpi_tag
+                        integer(c_int), value, intent(in) :: prio
+                        integer(c_int), value, intent(in) :: mpi_comm
+                        type(c_ptr), value, intent(in) :: starpu_tag
+                end function fstarpu_mpi_isend_detached_unlock_tag_prio
+
                 ! int starpu_mpi_irecv_detached_unlock_tag(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, starpu_tag_t tag);
                 function fstarpu_mpi_recv_detached_unlock_tag (dh, src, mpi_tag, mpi_comm, starpu_tag) bind(C)
                         use iso_c_binding
@@ -304,6 +394,21 @@ module fstarpu_mpi_mod
                         type(c_ptr), value, intent(in) :: starpu_tag
                 end function fstarpu_mpi_isend_array_detached_unlock_tag
 
+                ! int starpu_mpi_isend_array_detached_unlock_tag_prio(unsigned array_size, starpu_data_handle_t *data_handle, int *dest, int *mpi_tag, int *prio, MPI_Comm *comm, starpu_tag_t tag);
+                function fstarpu_mpi_isend_array_detached_unlock_tag_prio (array_size, dhs, dsts, mpi_tags, prio, mpi_comms, &
+                                starpu_tag) bind(C)
+                        use iso_c_binding
+                        implicit none
+                        integer(c_int) :: fstarpu_mpi_isend_array_detached_unlock_tag_prio
+                        integer(c_int), value, intent(in) :: array_size
+                        type(c_ptr), intent(in) :: dhs(*)
+                        integer(c_int), intent(in) :: dsts(*)
+                        integer(c_int), intent(in) :: mpi_tags(*)
+                        integer(c_int), intent(in) :: prio(*)
+                        integer(c_int), intent(in) :: mpi_comms(*)
+                        type(c_ptr), value, intent(in) :: starpu_tag
+                end function fstarpu_mpi_isend_array_detached_unlock_tag_prio
+
                 ! int starpu_mpi_irecv_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *source, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag);
                 function fstarpu_mpi_recv_array_detached_unlock_tag (array_size, dhs, srcs, mpi_tags, mpi_comms, starpu_tag) &
                                 bind(C)

+ 11 - 0
mpi/include/starpu_mpi.h

@@ -33,13 +33,18 @@ extern "C"
 typedef void *starpu_mpi_req;
 
 int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *req, int dest, int mpi_tag, MPI_Comm comm);
+int starpu_mpi_isend_prio(starpu_data_handle_t data_handle, starpu_mpi_req *req, int dest, int mpi_tag, int prio, MPI_Comm comm);
 int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *req, int source, int mpi_tag, MPI_Comm comm);
 int starpu_mpi_send(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm);
+int starpu_mpi_send_prio(starpu_data_handle_t data_handle, int dest, int mpi_tag, int prio, MPI_Comm comm);
 int starpu_mpi_recv(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, MPI_Status *status);
 int starpu_mpi_isend_detached(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg);
+int starpu_mpi_isend_detached_prio(starpu_data_handle_t data_handle, int dest, int mpi_tag, int prio, MPI_Comm comm, void (*callback)(void *), void *arg);
 int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg);
 int starpu_mpi_issend(starpu_data_handle_t data_handle, starpu_mpi_req *req, int dest, int mpi_tag, MPI_Comm comm);
+int starpu_mpi_issend_prio(starpu_data_handle_t data_handle, starpu_mpi_req *req, int dest, int mpi_tag, int prio, MPI_Comm comm);
 int starpu_mpi_issend_detached(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg);
+int starpu_mpi_issend_detached_prio(starpu_data_handle_t data_handle, int dest, int mpi_tag, int prio, MPI_Comm comm, void (*callback)(void *), void *arg);
 int starpu_mpi_wait(starpu_mpi_req *req, MPI_Status *status);
 int starpu_mpi_test(starpu_mpi_req *req, int *flag, MPI_Status *status);
 int starpu_mpi_barrier(MPI_Comm comm);
@@ -60,15 +65,19 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...);
 
 void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle, int node);
 void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg);
+void starpu_mpi_get_data_on_all_nodes_detached(MPI_Comm comm, starpu_data_handle_t data_handle);
 void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle);
+void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle, int prio);
 
 int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg);
 int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg);
 
 int starpu_mpi_isend_detached_unlock_tag(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm, starpu_tag_t tag);
+int starpu_mpi_isend_detached_unlock_tag_prio(starpu_data_handle_t data_handle, int dest, int mpi_tag, int prio, MPI_Comm comm, starpu_tag_t tag);
 int starpu_mpi_irecv_detached_unlock_tag(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, starpu_tag_t tag);
 
 int starpu_mpi_isend_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *dest, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag);
+int starpu_mpi_isend_array_detached_unlock_tag_prio(unsigned array_size, starpu_data_handle_t *data_handle, int *dest, int *mpi_tag, int *prio, MPI_Comm *comm, starpu_tag_t tag);
 int starpu_mpi_irecv_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *source, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag);
 
 void starpu_mpi_comm_amounts_retrieve(size_t *comm_amounts);
@@ -90,6 +99,8 @@ void starpu_mpi_set_communication_tag(int tag);
 void starpu_mpi_data_register_comm(starpu_data_handle_t data_handle, int tag, int rank, MPI_Comm comm);
 #define starpu_mpi_data_register(data_handle, tag, rank) starpu_mpi_data_register_comm(data_handle, tag, rank, MPI_COMM_WORLD)
 
+#define STARPU_MPI_PER_NODE -2
+
 void starpu_mpi_data_set_rank_comm(starpu_data_handle_t handle, int rank, MPI_Comm comm);
 #define starpu_mpi_data_set_rank(handle, rank) starpu_mpi_data_set_rank_comm(handle, rank, MPI_COMM_WORLD)
 void starpu_mpi_data_set_tag(starpu_data_handle_t handle, int tag);

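All the communication entry points gain a _prio variant, and the plain versions now forward to them with priority 0 (see the starpu_mpi.c hunk below). A sketch of a prioritized detached send matched by an ordinary receive, assuming larger prio values are served earlier by the progression thread:

    /* Sender side (rank 0): detached send with priority 10. */
    starpu_mpi_isend_detached_prio(handle, /*dest*/ 1, /*data_tag*/ 42, /*prio*/ 10,
                                   MPI_COMM_WORLD, NULL, NULL);

    /* Receiver side (rank 1): the receive API is unchanged. */
    MPI_Status status;
    starpu_mpi_recv(handle, /*source*/ 0, /*data_tag*/ 42, MPI_COMM_WORLD, &status);
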
+ 2 - 2
mpi/src/load_balancer/policy/load_data_interface.c

@@ -124,7 +124,7 @@ int load_data_wakeup_cond(starpu_data_handle_t handle)
 	struct load_data_interface *ld_interface =
 		(struct load_data_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
-	return ((ld_interface->wakeup_task_threshold > 0) && (ld_interface->nfinished_tasks == ld_interface->wakeup_task_threshold));
+	return (ld_interface->wakeup_task_threshold > 0) && (ld_interface->nfinished_tasks == ld_interface->wakeup_task_threshold);
 }
 
 static void load_data_register_data_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface)
@@ -166,7 +166,7 @@ static void load_data_free_data_on_node(void *data_interface, unsigned node)
 static size_t load_data_get_size(starpu_data_handle_t handle)
 {
 	(void) handle;
-	return (sizeof(struct load_data_interface));
+	return sizeof(struct load_data_interface);
 }
 
 static uint32_t load_data_footprint(starpu_data_handle_t handle)

+ 5 - 5
mpi/src/load_balancer/policy/load_heat_propagation.c

@@ -27,12 +27,12 @@
 
 static int TAG_LOAD(int n)
 {
-	return ((n+1) << 24);
+	return (n+1) << 24;
 }
 
 static int TAG_MOV(int n)
 {
-	return ((n+1) << 20);
+	return (n+1) << 20;
 }
 
 /* Hash table of local pieces of data that has been moved out of the local MPI
@@ -96,14 +96,14 @@ static void balance(starpu_data_handle_t load_data_cpy)
 {
 	int less_loaded = -1;
 	int n;
-	double elapsed_time, ref_elapsed_time;
+	double ref_elapsed_time;
 	double my_elapsed_time = load_data_get_elapsed_time(load_data_cpy);
 
 	/* Search for the less loaded neighbor */
 	ref_elapsed_time = my_elapsed_time;
 	for (n = 0; n < nneighbors; n++)
 	{
-		elapsed_time = load_data_get_elapsed_time(neighbor_load_data_handles[n]);
+		double elapsed_time = load_data_get_elapsed_time(neighbor_load_data_handles[n]);
 		if (ref_elapsed_time > elapsed_time)
 		{
 			//fprintf(stderr,"Node%d: ref local time %lf vs neighbour%d time %lf\n", my_rank, ref_elapsed_time, neighbor_ids[n], elapsed_time);
@@ -418,7 +418,7 @@ static int init_heat(struct starpu_mpi_lb_conf *itf)
 	}
 
 	_STARPU_MPI_MALLOC(user_itf, sizeof(struct starpu_mpi_lb_conf));
-	memcpy(user_itf, itf, sizeof(struct starpu_mpi_lb_conf));;
+	memcpy(user_itf, itf, sizeof(struct starpu_mpi_lb_conf));
 
 	/* Get the neighbors of the local MPI node */
 	user_itf->get_neighbors(&neighbor_ids, &nneighbors);

+ 175 - 49
mpi/src/starpu_mpi.c

@@ -17,6 +17,7 @@
  */
 
 #include <stdlib.h>
+#include <limits.h>
 #include <starpu_mpi.h>
 #include <starpu_mpi_datatype.h>
 #include <starpu_mpi_private.h>
@@ -37,9 +38,16 @@
 #include <datawizard/coherency.h>
 #include <core/simgrid.h>
 #include <core/task.h>
+#include <core/topology.h>
+#include <core/workers.h>
 
 /* Number of ready requests to process before polling for completed requests */
-#define NREADY_PROCESS 10
+static unsigned nready_process;
+
+/* Number of send requests to submit to MPI at the same time */
+static unsigned ndetached_send;
+
+static int mpi_thread_cpuid = -1;
 
 static void _starpu_mpi_add_sync_point_in_fxt(void);
 static void _starpu_mpi_submit_ready_request(void *arg);
@@ -50,7 +58,7 @@ static char *_starpu_mpi_request_type(enum _starpu_mpi_request_type request_type
 #endif
 static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t data_handle,
 							int dest, int data_tag, MPI_Comm comm,
-							unsigned detached, unsigned sync, void (*callback)(void *), void *arg,
+							unsigned detached, unsigned sync, int prio, void (*callback)(void *), void *arg,
 							int sequential_consistency);
 static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle,
 							int source, int data_tag, MPI_Comm comm,
@@ -61,10 +69,12 @@ static void _starpu_mpi_handle_detached_request(struct _starpu_mpi_req *req);
 static void _starpu_mpi_early_data_cb(void* arg);
 
 /* The list of ready requests */
-static struct _starpu_mpi_req_list *ready_requests;
+static struct _starpu_mpi_req_list ready_recv_requests;
+static struct _starpu_mpi_req_prio_list ready_send_requests;
 
 /* The list of detached requests that have already been submitted to MPI */
-static struct _starpu_mpi_req_list *detached_requests;
+static struct _starpu_mpi_req_list detached_requests;
+static unsigned detached_send_nrequests;
 static starpu_pthread_mutex_t detached_requests_mutex;
 
 /* Condition to wake up progression thread */
@@ -105,6 +115,7 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req **req)
 
 	/* Initialize the request structure */
 	(*req)->data_handle = NULL;
+	(*req)->prio = 0;
 
 	(*req)->datatype = 0;
 	(*req)->datatype_name = NULL;
@@ -144,6 +155,7 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req **req)
 	(*req)->size_req = 0;
 	(*req)->internal_req = NULL;
 	(*req)->is_internal_req = 0;
+	(*req)->to_destroy = 1;
 	(*req)->early_data_handle = NULL;
 	(*req)->envelope = NULL;
 	(*req)->sequential_consistency = 1;
@@ -222,7 +234,7 @@ static void _starpu_mpi_submit_ready_request(void *arg)
 			_STARPU_MPI_DEBUG(3, "Pushing internal starpu_mpi_irecv request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
 					  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.rank, req->data_handle, req->ptr,
 					  req->datatype_name, (int)req->count, req->registered_datatype);
-			_starpu_mpi_req_list_push_front(ready_requests, req);
+			_starpu_mpi_req_list_push_front(&ready_recv_requests, req);
 
 			/* inform the starpu mpi thread that the request has been pushed in the ready_requests list */
 			STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
@@ -288,7 +300,7 @@ static void _starpu_mpi_submit_ready_request(void *arg)
 						STARPU_ASSERT(req->count);
 						_STARPU_MPI_MALLOC(req->ptr, req->count);
 					}
-					_starpu_mpi_req_list_push_front(ready_requests, req);
+					_starpu_mpi_req_list_push_front(&ready_recv_requests, req);
 					_starpu_mpi_request_destroy(sync_req);
 				}
 				else
@@ -301,7 +313,10 @@ static void _starpu_mpi_submit_ready_request(void *arg)
 	}
 	else
 	{
-		_starpu_mpi_req_list_push_front(ready_requests, req);
+		if (req->request_type == SEND_REQ)
+			_starpu_mpi_req_prio_list_push_front(&ready_send_requests, req);
+		else
+			_starpu_mpi_req_list_push_front(&ready_recv_requests, req);
 		_STARPU_MPI_DEBUG(3, "Pushing new request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
 				  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.rank, req->data_handle, req->ptr,
 				  req->datatype_name, (int)req->count, req->registered_datatype);
@@ -323,7 +338,7 @@ static void nop_acquire_cb(void *arg)
 
 static struct _starpu_mpi_req *_starpu_mpi_isend_irecv_common(starpu_data_handle_t data_handle,
 							      int srcdst, int data_tag, MPI_Comm comm,
-							      unsigned detached, unsigned sync, void (*callback)(void *), void *arg,
+							      unsigned detached, unsigned sync, int prio, void (*callback)(void *), void *arg,
 							      enum _starpu_mpi_request_type request_type, void (*func)(struct _starpu_mpi_req *),
 							      enum starpu_data_access_mode mode,
 							      int sequential_consistency,
@@ -347,6 +362,8 @@ static struct _starpu_mpi_req *_starpu_mpi_isend_irecv_common(starpu_data_handle
 	/* Initialize the request structure */
 	_starpu_mpi_request_init(&req);
 	req->request_type = request_type;
+	/* prio_list is sorted by increasing values */
+	req->prio = prio;
 	req->data_handle = data_handle;
 	req->node_tag.rank = srcdst;
 	req->node_tag.data_tag = data_tag;
@@ -358,6 +375,8 @@ static struct _starpu_mpi_req *_starpu_mpi_isend_irecv_common(starpu_data_handle
 	req->func = func;
 	req->sequential_consistency = sequential_consistency;
 	req->is_internal_req = is_internal_req;
+	/* For internal requests, we wait for both the request completion and the matching application request completion */
+	req->to_destroy = !is_internal_req;
 	req->count = count;
 
 	/* Asynchronously request StarPU to fetch the data in main memory: when
@@ -534,10 +553,10 @@ static void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req)
 
 static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t data_handle,
 							int dest, int data_tag, MPI_Comm comm,
-							unsigned detached, unsigned sync, void (*callback)(void *), void *arg,
+							unsigned detached, unsigned sync, int prio, void (*callback)(void *), void *arg,
 							int sequential_consistency)
 {
-	return _starpu_mpi_isend_irecv_common(data_handle, dest, data_tag, comm, detached, sync, callback, arg, SEND_REQ, _starpu_mpi_isend_size_func,
+	return _starpu_mpi_isend_irecv_common(data_handle, dest, data_tag, comm, detached, sync, prio, callback, arg, SEND_REQ, _starpu_mpi_isend_size_func,
 #ifdef STARPU_MPI_PEDANTIC_ISEND
 					      STARPU_RW,
 #else
@@ -546,14 +565,14 @@ static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t dat
 					      sequential_consistency, 0, 0);
 }
 
-int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, int data_tag, MPI_Comm comm)
+int starpu_mpi_isend_prio(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, int data_tag, int prio, MPI_Comm comm)
 {
 	_STARPU_MPI_LOG_IN();
 	STARPU_MPI_ASSERT_MSG(public_req, "starpu_mpi_isend needs a valid starpu_mpi_req");
 
 	struct _starpu_mpi_req *req;
 	_STARPU_MPI_TRACE_ISEND_COMPLETE_BEGIN(dest, data_tag, 0);
-	req = _starpu_mpi_isend_common(data_handle, dest, data_tag, comm, 0, 0, NULL, NULL, 1);
+	req = _starpu_mpi_isend_common(data_handle, dest, data_tag, comm, 0, 0, prio, NULL, NULL, 1);
 	_STARPU_MPI_TRACE_ISEND_COMPLETE_END(dest, data_tag, 0);
 
 	STARPU_MPI_ASSERT_MSG(req, "Invalid return for _starpu_mpi_isend_common");
@@ -563,16 +582,24 @@ int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *public_re
 	return 0;
 }
 
-int starpu_mpi_isend_detached(starpu_data_handle_t data_handle,
-			      int dest, int data_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
+int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, int data_tag, MPI_Comm comm)
+{
+	return starpu_mpi_isend_prio(data_handle, public_req, dest, data_tag, 0, comm);
+}
+
+int starpu_mpi_isend_detached_prio(starpu_data_handle_t data_handle, int dest, int data_tag, int prio, MPI_Comm comm, void (*callback)(void *), void *arg)
 {
 	_STARPU_MPI_LOG_IN();
-	_starpu_mpi_isend_common(data_handle, dest, data_tag, comm, 1, 0, callback, arg, 1);
+	_starpu_mpi_isend_common(data_handle, dest, data_tag, comm, 1, 0, prio, callback, arg, 1);
 	_STARPU_MPI_LOG_OUT();
 	return 0;
 }
+int starpu_mpi_isend_detached(starpu_data_handle_t data_handle, int dest, int data_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
+{
+	return starpu_mpi_isend_detached_prio(data_handle, dest, data_tag, 0, comm, callback, arg);
+}
 
-int starpu_mpi_send(starpu_data_handle_t data_handle, int dest, int data_tag, MPI_Comm comm)
+int starpu_mpi_send_prio(starpu_data_handle_t data_handle, int dest, int data_tag, int prio, MPI_Comm comm)
 {
 	starpu_mpi_req req;
 	MPI_Status status;
@@ -580,20 +607,25 @@ int starpu_mpi_send(starpu_data_handle_t data_handle, int dest, int data_tag, MP
 	_STARPU_MPI_LOG_IN();
 	memset(&status, 0, sizeof(MPI_Status));
 
-	starpu_mpi_isend(data_handle, &req, dest, data_tag, comm);
+	starpu_mpi_isend_prio(data_handle, &req, dest, data_tag, prio, comm);
 	starpu_mpi_wait(&req, &status);
 
 	_STARPU_MPI_LOG_OUT();
 	return 0;
 }
 
-int starpu_mpi_issend(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, int data_tag, MPI_Comm comm)
+int starpu_mpi_send(starpu_data_handle_t data_handle, int dest, int data_tag, MPI_Comm comm)
+{
+	return starpu_mpi_send_prio(data_handle, dest, data_tag, 0, comm);
+}
+
+int starpu_mpi_issend_prio(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, int data_tag, int prio, MPI_Comm comm)
 {
 	_STARPU_MPI_LOG_IN();
 	STARPU_MPI_ASSERT_MSG(public_req, "starpu_mpi_issend needs a valid starpu_mpi_req");
 
 	struct _starpu_mpi_req *req;
-	req = _starpu_mpi_isend_common(data_handle, dest, data_tag, comm, 0, 1, NULL, NULL, 1);
+	req = _starpu_mpi_isend_common(data_handle, dest, data_tag, comm, 0, 1, prio, NULL, NULL, 1);
 
 	STARPU_MPI_ASSERT_MSG(req, "Invalid return for _starpu_mpi_isend_common");
 	*public_req = req;
@@ -602,16 +634,26 @@ int starpu_mpi_issend(starpu_data_handle_t data_handle, starpu_mpi_req *public_r
 	return 0;
 }
 
-int starpu_mpi_issend_detached(starpu_data_handle_t data_handle, int dest, int data_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
+int starpu_mpi_issend(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, int data_tag, MPI_Comm comm)
+{
+	return starpu_mpi_issend_prio(data_handle, public_req, dest, data_tag, 0, comm);
+}
+
+int starpu_mpi_issend_detached_prio(starpu_data_handle_t data_handle, int dest, int data_tag, int prio, MPI_Comm comm, void (*callback)(void *), void *arg)
 {
 	_STARPU_MPI_LOG_IN();
 
-	_starpu_mpi_isend_common(data_handle, dest, data_tag, comm, 1, 1, callback, arg, 1);
+	_starpu_mpi_isend_common(data_handle, dest, data_tag, comm, 1, 1, prio, callback, arg, 1);
 
 	_STARPU_MPI_LOG_OUT();
 	return 0;
 }
 
+int starpu_mpi_issend_detached(starpu_data_handle_t data_handle, int dest, int data_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
+{
+	return starpu_mpi_issend_detached_prio(data_handle, dest, data_tag, 0, comm, callback, arg);
+}
+
 /********************************************************/
 /*                                                      */
 /*  receive functionalities                             */
@@ -670,7 +712,7 @@ static void _starpu_mpi_irecv_data_func(struct _starpu_mpi_req *req)
 
 static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, int data_tag, MPI_Comm comm, unsigned detached, unsigned sync, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, starpu_ssize_t count)
 {
-	return _starpu_mpi_isend_irecv_common(data_handle, source, data_tag, comm, detached, sync, callback, arg, RECV_REQ, _starpu_mpi_irecv_data_func, STARPU_W, sequential_consistency, is_internal_req, count);
+	return _starpu_mpi_isend_irecv_common(data_handle, source, data_tag, comm, detached, sync, 0, callback, arg, RECV_REQ, _starpu_mpi_irecv_data_func, STARPU_W, sequential_consistency, is_internal_req, count);
 }
 
 int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int source, int data_tag, MPI_Comm comm)
@@ -795,6 +837,7 @@ int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 
 	/* Initialize the request structure */
 	 _starpu_mpi_request_init(&waiting_req);
+	waiting_req->prio = INT_MAX;
 	waiting_req->status = status;
 	waiting_req->other_request = req;
 	waiting_req->func = _starpu_mpi_wait_func;
@@ -886,6 +929,7 @@ int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 
 		/* Initialize the request structure */
 		_starpu_mpi_request_init(&testing_req);
+		testing_req->prio = INT_MAX;
 		testing_req->flag = flag;
 		testing_req->status = status;
 		testing_req->other_request = req;
@@ -978,6 +1022,7 @@ int _starpu_mpi_barrier(MPI_Comm comm)
 
 	/* Initialize the request structure */
 	_starpu_mpi_request_init(&barrier_req);
+	barrier_req->prio = INT_MAX;
 	barrier_req->func = _starpu_mpi_barrier_func;
 	barrier_req->request_type = BARRIER_REQ;
 	barrier_req->node_tag.comm = comm;
@@ -1013,7 +1058,7 @@ int starpu_mpi_barrier(MPI_Comm comm)
 static char *_starpu_mpi_request_type(enum _starpu_mpi_request_type request_type)
 {
 	switch (request_type)
-		{
+	{
 		case SEND_REQ: return "SEND_REQ";
 		case RECV_REQ: return "RECV_REQ";
 		case WAIT_REQ: return "WAIT_REQ";
@@ -1021,7 +1066,7 @@ static char *_starpu_mpi_request_type(enum _starpu_mpi_request_type request_type
 		case BARRIER_REQ: return "BARRIER_REQ";
 		case UNKNOWN_REQ: return "UNSET_REQ";
 		default: return "unknown request type";
-		}
+	}
 }
 #endif
 
@@ -1137,6 +1182,20 @@ static void _starpu_mpi_early_data_cb(void* arg)
 	{
 		if (args->req->detached)
 		{
+			/* have the internal request destroyed now or when completed */
+			STARPU_PTHREAD_MUTEX_LOCK(&args->req->internal_req->req_mutex);
+			if (args->req->internal_req->to_destroy)
+			{
+				/* The request completed first, can now destroy it */
+				STARPU_PTHREAD_MUTEX_UNLOCK(&args->req->internal_req->req_mutex);
+				_starpu_mpi_request_destroy(args->req->internal_req);
+			}
+			else
+			{
+				/* The request has not completed yet, mark it to be destroyed when it completes */
+				args->req->internal_req->to_destroy = 1;
+				STARPU_PTHREAD_MUTEX_UNLOCK(&args->req->internal_req->req_mutex);
+			}
 			_starpu_mpi_handle_request_termination(args->req);
 			_starpu_mpi_request_destroy(args->req);
 		}
@@ -1166,11 +1225,20 @@ static void _starpu_mpi_test_detached_requests(void)
 
 	STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
 
-	req = _starpu_mpi_req_list_begin(detached_requests);
-	while (req != _starpu_mpi_req_list_end(detached_requests))
+	if (_starpu_mpi_req_list_empty(&detached_requests))
 	{
 		STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
+		//_STARPU_MPI_LOG_OUT();
+		return;
+	}
 
+	_STARPU_MPI_TRACE_TESTING_DETACHED_BEGIN();
+	req = _starpu_mpi_req_list_begin(&detached_requests);
+	while (req != _starpu_mpi_req_list_end(&detached_requests))
+	{
+		STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
+
+		_STARPU_MPI_TRACE_TEST_BEGIN(req->node_tag.rank, req->node_tag.data_tag);
 		//_STARPU_MPI_DEBUG(3, "Test detached request %p - mpitag %d - TYPE %s %d\n", &req->data_request, req->node_tag.data_tag, _starpu_mpi_request_type(req->request_type), req->node_tag.rank);
 #ifdef STARPU_SIMGRID
 		req->ret = _starpu_mpi_simgrid_mpi_test(&req->done, &flag);
@@ -1180,6 +1248,7 @@ static void _starpu_mpi_test_detached_requests(void)
 #endif
 
 		STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Test returning %s", _starpu_mpi_get_mpi_error_code(req->ret));
+		_STARPU_MPI_TRACE_TEST_END(req->node_tag.rank, req->node_tag.data_tag);
 
 		if (!flag)
 		{
@@ -1193,14 +1262,27 @@ static void _starpu_mpi_test_detached_requests(void)
 			_STARPU_MPI_TRACE_COMPLETE_BEGIN(req->request_type, req->node_tag.rank, req->node_tag.data_tag);
 
 			STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
-			_starpu_mpi_req_list_erase(detached_requests, req);
+			if (req->request_type == SEND_REQ)
+				detached_send_nrequests--;
+			_starpu_mpi_req_list_erase(&detached_requests, req);
 			STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
 			_starpu_mpi_handle_request_termination(req);
 
 			_STARPU_MPI_TRACE_COMPLETE_END(req->request_type, req->node_tag.rank, req->node_tag.data_tag);
 
-			if (req->is_internal_req == 0)
+			STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
+			/* We don't want to free internal non-detached
+			   requests: we need to get their MPI request before
+			   destroying them */
+			if (req->is_internal_req && !req->to_destroy)
 			{
+				/* We have completed the request, let the application request destroy it */
+				req->to_destroy = 1;
+				STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
+			}
+			else
+			{
+				STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
 				_starpu_mpi_request_destroy(req);
 			}
 
@@ -1209,6 +1291,7 @@ static void _starpu_mpi_test_detached_requests(void)
 
 		STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
 	}
+	_STARPU_MPI_TRACE_TESTING_DETACHED_END();
 
 	STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
 	//_STARPU_MPI_LOG_OUT();
@@ -1221,7 +1304,9 @@ static void _starpu_mpi_handle_detached_request(struct _starpu_mpi_req *req)
 		/* put the submitted request into the list of pending requests
 		 * so that it can be handled by the progression mechanisms */
 		STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
-		_starpu_mpi_req_list_push_front(detached_requests, req);
+		if (req->request_type == SEND_REQ)
+			detached_send_nrequests++;
+		_starpu_mpi_req_list_push_back(&detached_requests, req);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
 
 		starpu_wake_all_blocked_workers();
@@ -1305,7 +1390,7 @@ static void _starpu_mpi_receive_early_data(struct _starpu_mpi_envelope *envelope
 
 	// Handle the request immediately to make sure the mpi_irecv is
 	// posted before receiving another envelope
-	_starpu_mpi_req_list_erase(ready_requests, early_data_handle->req);
+	_starpu_mpi_req_list_erase(&ready_recv_requests, early_data_handle->req);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
 	_starpu_mpi_handle_ready_request(early_data_handle->req);
 	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
@@ -1318,7 +1403,12 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	starpu_pthread_setname("MPI");
 
 #ifndef STARPU_SIMGRID
+	if (mpi_thread_cpuid >= 0)
+		_starpu_bind_thread_on_cpu(mpi_thread_cpuid, STARPU_NOWORKERID);
 	_starpu_mpi_do_initialize(argc_argv);
+	if (mpi_thread_cpuid >= 0)
+		/* In case MPI changed the binding */
+		_starpu_bind_thread_on_cpu(mpi_thread_cpuid, STARPU_NOWORKERID);
 #endif
 
 	_starpu_mpi_fake_world_size = starpu_get_env_number("STARPU_MPI_FAKE_SIZE");
@@ -1381,13 +1471,13 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
  	int envelope_request_submitted = 0;
 
-	while (running || posted_requests || !(_starpu_mpi_req_list_empty(ready_requests)) || !(_starpu_mpi_req_list_empty(detached_requests)))// || !(_starpu_mpi_early_request_count()) || !(_starpu_mpi_sync_data_count()))
+	while (running || posted_requests || !(_starpu_mpi_req_list_empty(&ready_recv_requests)) || !(_starpu_mpi_req_prio_list_empty(&ready_send_requests)) || !(_starpu_mpi_req_list_empty(&detached_requests)))// || !(_starpu_mpi_early_request_count()) || !(_starpu_mpi_sync_data_count()))
 	{
 #ifdef STARPU_SIMGRID
 		starpu_pthread_wait_reset(&wait);
 #endif
 		/* shall we block ? */
-		unsigned block = _starpu_mpi_req_list_empty(ready_requests) && _starpu_mpi_early_request_count() == 0 && _starpu_mpi_sync_data_count() == 0 && _starpu_mpi_req_list_empty(detached_requests);
+		unsigned block = _starpu_mpi_req_list_empty(&ready_recv_requests) && _starpu_mpi_req_prio_list_empty(&ready_send_requests) && _starpu_mpi_early_request_count() == 0 && _starpu_mpi_sync_data_count() == 0 && _starpu_mpi_req_list_empty(&detached_requests);
 
 		if (block)
 		{
@@ -1402,17 +1492,38 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 			_STARPU_MPI_TRACE_SLEEP_END();
 		}
 
-		/* get one request */
-		int n = 0;
-		while (!_starpu_mpi_req_list_empty(ready_requests))
+		/* get one recv request */
+		unsigned n = 0;
+		while (!_starpu_mpi_req_list_empty(&ready_recv_requests))
 		{
 			struct _starpu_mpi_req *req;
 
-			if (n++ == NREADY_PROCESS)
-				/* Already spent some time on submitting ready requests, poll before processing more ready requests */
+			if (n++ == nready_process)
+				/* Already spent some time on submitting ready recv requests, poll before processing more ready recv requests */
 				break;
 
-			req = _starpu_mpi_req_list_pop_back(ready_requests);
+			req = _starpu_mpi_req_list_pop_back(&ready_recv_requests);
+
+			/* handling a request is likely to block for a while
+			 * (on a sync_data_with_mem call), we want to let the
+			 * application submit requests in the meantime, so we
+			 * release the lock. */
+			STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
+			_starpu_mpi_handle_ready_request(req);
+			STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
+		}
+
+		/* get one send request */
+		n = 0;
+		while (!_starpu_mpi_req_prio_list_empty(&ready_send_requests) && detached_send_nrequests < ndetached_send)
+		{
+			struct _starpu_mpi_req *req;
+
+			if (n++ == nready_process)
+				/* Already spent some time on submitting ready send requests, poll before processing more ready send requests */
+				break;
+
+			req = _starpu_mpi_req_prio_list_pop_back_highest(&ready_send_requests);
 
 			/* handling a request is likely to block for a while
 			 * (on a sync_data_with_mem call), we want to let the
@@ -1573,8 +1684,10 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	starpu_pthread_wait_destroy(&wait);
 #endif
 
-	STARPU_MPI_ASSERT_MSG(_starpu_mpi_req_list_empty(detached_requests), "List of detached requests not empty");
-	STARPU_MPI_ASSERT_MSG(_starpu_mpi_req_list_empty(ready_requests), "List of ready requests not empty");
+	STARPU_MPI_ASSERT_MSG(_starpu_mpi_req_list_empty(&detached_requests), "List of detached requests not empty");
+	STARPU_MPI_ASSERT_MSG(detached_send_nrequests == 0, "Number of detached send requests is not zero");
+	STARPU_MPI_ASSERT_MSG(_starpu_mpi_req_list_empty(&ready_recv_requests), "List of ready recv requests not empty");
+	STARPU_MPI_ASSERT_MSG(_starpu_mpi_req_prio_list_empty(&ready_send_requests), "List of ready send requests not empty");
 	STARPU_MPI_ASSERT_MSG(posted_requests == 0, "Number of posted requests is not zero");
 	_starpu_mpi_early_request_check_termination();
 	_starpu_mpi_early_data_check_termination();
@@ -1636,13 +1749,17 @@ int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
         STARPU_PTHREAD_MUTEX_INIT(&progress_mutex, NULL);
         STARPU_PTHREAD_COND_INIT(&progress_cond, NULL);
         STARPU_PTHREAD_COND_INIT(&barrier_cond, NULL);
-        ready_requests = _starpu_mpi_req_list_new();
+	_starpu_mpi_req_list_init(&ready_recv_requests);
+	_starpu_mpi_req_prio_list_init(&ready_send_requests);
 
         STARPU_PTHREAD_MUTEX_INIT(&detached_requests_mutex, NULL);
-        detached_requests = _starpu_mpi_req_list_new();
+	_starpu_mpi_req_list_init(&detached_requests);
 
         STARPU_PTHREAD_MUTEX_INIT(&mutex_posted_requests, NULL);
         _starpu_mpi_comm_debug = starpu_getenv("STARPU_MPI_COMM") != NULL;
+	nready_process = starpu_get_env_number_default("STARPU_MPI_NREADY_PROCESS", 10);
+	ndetached_send = starpu_get_env_number_default("STARPU_MPI_NDETACHED_SEND", 10);
+	mpi_thread_cpuid = starpu_get_env_number_default("STARPU_MPI_THREAD_CPUID", -1);
 
 #ifdef STARPU_SIMGRID
 	STARPU_PTHREAD_MUTEX_INIT(&wait_counter_mutex, NULL);
@@ -1691,13 +1808,9 @@ void _starpu_mpi_progress_shutdown(int *value)
 	(void) value;
 	MSG_process_sleep(1);
 #else
-	starpu_pthread_join(progress_thread, (void *)value);
+	STARPU_PTHREAD_JOIN(progress_thread, (void *)value);
 #endif
 
-        /* free the request queues */
-        _starpu_mpi_req_list_delete(detached_requests);
-        _starpu_mpi_req_list_delete(ready_requests);
-
         STARPU_PTHREAD_MUTEX_DESTROY(&mutex_posted_requests);
         STARPU_PTHREAD_MUTEX_DESTROY(&progress_mutex);
         STARPU_PTHREAD_COND_DESTROY(&barrier_cond);
@@ -1776,7 +1889,8 @@ void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t da
 	}
 
 	starpu_mpi_comm_rank(comm, &me);
-	if (node == rank) return;
+	if (node == rank)
+		return;
 
 	tag = starpu_mpi_data_get_tag(data_handle);
 	if (tag == -1)
@@ -1817,7 +1931,8 @@ void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle
 	}
 
 	starpu_mpi_comm_rank(comm, &me);
-	if (node == rank) return;
+	if (node == rank)
+		return;
 
 	tag = starpu_mpi_data_get_tag(data_handle);
 	if (tag == -1)
@@ -1848,6 +1963,17 @@ void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle
 	}
 }
 
+void starpu_mpi_get_data_on_all_nodes_detached(MPI_Comm comm, starpu_data_handle_t data_handle)
+{
+	int size, i;
+	starpu_mpi_comm_size(comm, &size);
+#ifdef STARPU_DEVEL
+#warning TODO: use binary communication tree to optimize broadcast
+#endif
+	for (i = 0; i < size; i++)
+		starpu_mpi_get_data_on_node_detached(comm, data_handle, i, NULL, NULL);
+}
+
 void starpu_mpi_data_migrate(MPI_Comm comm, starpu_data_handle_t data, int new_rank)
 {
 	int old_rank = starpu_mpi_data_get_rank(data);

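The changes to starpu_mpi.c above introduce priority-aware send variants (the historical entry points become thin wrappers passing priority 0) together with three environment variables read by the progression engine at initialization time: STARPU_MPI_NREADY_PROCESS, STARPU_MPI_NDETACHED_SEND and STARPU_MPI_THREAD_CPUID. Below is a minimal usage sketch of the prioritized detached send, assuming the usual StarPU-MPI setup; the tag value, the callback and the shutdown ordering are illustrative only, not taken from the patch.

    /* Sketch: one prioritized detached send between two nodes. */
    #include <stdint.h>
    #include <starpu.h>
    #include <starpu_mpi.h>

    static void sent_cb(void *arg)
    {
        (void) arg;
        /* run by the progression thread once the send has completed */
    }

    int main(int argc, char **argv)
    {
        int rank;
        float x = 42.0f;
        starpu_data_handle_t handle;

        if (starpu_init(NULL) != 0) return 1;
        if (starpu_mpi_init(&argc, &argv, 1) != 0) return 1;
        starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);

        starpu_variable_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)&x, sizeof(x));
        starpu_mpi_data_register(handle, 42, 0);  /* tag 42, owned by node 0 */

        if (rank == 0)
            /* priority 10: popped before priority-0 sends still queued in ready_send_requests */
            starpu_mpi_isend_detached_prio(handle, 1, 42, 10, MPI_COMM_WORLD, sent_cb, NULL);
        else if (rank == 1)
            starpu_mpi_irecv_detached(handle, 0, 42, MPI_COMM_WORLD, NULL, NULL);

        starpu_mpi_barrier(MPI_COMM_WORLD);
        starpu_data_unregister(handle);
        starpu_mpi_shutdown();
        starpu_shutdown();
        return 0;
    }

Setting STARPU_MPI_NDETACHED_SEND raises the number of detached sends simultaneously handed to MPI: the progression loop above stops dequeuing ready sends once that many are pending (10 by default).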
+ 41 - 18
mpi/src/starpu_mpi_cache.c

@@ -85,7 +85,8 @@ void _starpu_mpi_cache_init(MPI_Comm comm)
 
 void _starpu_mpi_cache_shutdown()
 {
-	if (_starpu_cache_enabled == 0) return;
+	if (_starpu_cache_enabled == 0)
+		return;
 
 	struct _starpu_data_entry *entry, *tmp;
 
@@ -104,13 +105,22 @@ void _starpu_mpi_cache_shutdown()
 void _starpu_mpi_cache_data_clear(starpu_data_handle_t data_handle)
 {
 	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
+	struct _starpu_data_entry *entry;
 
-	if (_starpu_cache_enabled == 0) return;
+	if (_starpu_cache_enabled == 1)
+	{
+		STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
+		_starpu_mpi_cache_flush_nolock(data_handle);
+		HASH_FIND_PTR(_cache_data, &data_handle, entry);
+		if (entry != NULL)
+		{
+			HASH_DEL(_cache_data, entry);
+			free(entry);
+		}
+		STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
+	}
 
-	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
-	_starpu_mpi_cache_flush_nolock(data_handle);
 	free(mpi_data->cache_sent);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&_cache_mutex);
 }
 
 void _starpu_mpi_cache_data_init(starpu_data_handle_t data_handle)
@@ -118,7 +128,8 @@ void _starpu_mpi_cache_data_init(starpu_data_handle_t data_handle)
 	int i;
 	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
 
-	if (_starpu_cache_enabled == 0) return;
+	if (_starpu_cache_enabled == 0)
+		return;
 
 	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
 	mpi_data->cache_received = 0;
@@ -134,7 +145,8 @@ static void _starpu_mpi_cache_data_add_nolock(starpu_data_handle_t data_handle)
 {
 	struct _starpu_data_entry *entry;
 
-	if (_starpu_cache_enabled == 0) return;
+	if (_starpu_cache_enabled == 0)
+		return;
 
 	HASH_FIND_PTR(_cache_data, &data_handle, entry);
 	if (entry == NULL)
@@ -149,7 +161,8 @@ static void _starpu_mpi_cache_data_remove_nolock(starpu_data_handle_t data_handl
 {
 	struct _starpu_data_entry *entry;
 
-	if (_starpu_cache_enabled == 0) return;
+	if (_starpu_cache_enabled == 0)
+		return;
 
 	HASH_FIND_PTR(_cache_data, &data_handle, entry);
 	if (entry)
@@ -167,7 +180,8 @@ void _starpu_mpi_cache_received_data_clear(starpu_data_handle_t data_handle)
 	int mpi_rank = starpu_mpi_data_get_rank(data_handle);
 	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
 
-	if (_starpu_cache_enabled == 0) return;
+	if (_starpu_cache_enabled == 0)
+		return;
 
 	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
 	STARPU_ASSERT(mpi_data->magic == 42);
@@ -192,7 +206,8 @@ int _starpu_mpi_cache_received_data_set(starpu_data_handle_t data_handle)
 	int mpi_rank = starpu_mpi_data_get_rank(data_handle);
 	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
 
-	if (_starpu_cache_enabled == 0) return 0;
+	if (_starpu_cache_enabled == 0)
+		return 0;
 
 	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
 	STARPU_ASSERT(mpi_data->magic == 42);
@@ -219,7 +234,8 @@ int _starpu_mpi_cache_received_data_get(starpu_data_handle_t data_handle)
 	int already_received;
 	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
 
-	if (_starpu_cache_enabled == 0) return 0;
+	if (_starpu_cache_enabled == 0)
+		return 0;
 
 	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
 	STARPU_ASSERT(mpi_data->magic == 42);
@@ -241,7 +257,8 @@ void _starpu_mpi_cache_sent_data_clear(starpu_data_handle_t data_handle)
 	int n, size;
 	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
 
-	if (_starpu_cache_enabled == 0) return;
+	if (_starpu_cache_enabled == 0)
+		return;
 
 	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
 	starpu_mpi_comm_size(mpi_data->node_tag.comm, &size);
@@ -261,7 +278,8 @@ int _starpu_mpi_cache_sent_data_set(starpu_data_handle_t data_handle, int dest)
 {
 	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
 
-	if (_starpu_cache_enabled == 0) return 0;
+	if (_starpu_cache_enabled == 0)
+		return 0;
 
 	STARPU_MPI_ASSERT_MSG(dest < _starpu_cache_comm_size, "Node %d invalid. Max node is %d\n", dest, _starpu_cache_comm_size);
 
@@ -286,7 +304,8 @@ int _starpu_mpi_cache_sent_data_get(starpu_data_handle_t data_handle, int dest)
 	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
 	int already_sent;
 
-	if (_starpu_cache_enabled == 0) return 0;
+	if (_starpu_cache_enabled == 0)
+		return 0;
 
 	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
 	STARPU_MPI_ASSERT_MSG(dest < _starpu_cache_comm_size, "Node %d invalid. Max node is %d\n", dest, _starpu_cache_comm_size);
@@ -305,7 +324,8 @@ static void _starpu_mpi_cache_flush_nolock(starpu_data_handle_t data_handle)
 	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
 	int i, nb_nodes;
 
-	if (_starpu_cache_enabled == 0) return;
+	if (_starpu_cache_enabled == 0)
+		return;
 
 	starpu_mpi_comm_size(mpi_data->node_tag.comm, &nb_nodes);
 	for(i=0 ; i<nb_nodes ; i++)
@@ -329,7 +349,8 @@ static void _starpu_mpi_cache_flush_nolock(starpu_data_handle_t data_handle)
 
 void _starpu_mpi_cache_flush(starpu_data_handle_t data_handle)
 {
-	if (_starpu_cache_enabled == 0) return;
+	if (_starpu_cache_enabled == 0)
+		return;
 
 	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
 	_starpu_mpi_cache_flush_nolock(data_handle);
@@ -350,7 +371,8 @@ static void _starpu_mpi_cache_flush_and_invalidate_nolock(MPI_Comm comm, starpu_
 
 void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle)
 {
-	if (_starpu_cache_enabled == 0) return;
+	if (_starpu_cache_enabled == 0)
+		return;
 
 	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
 	_starpu_mpi_cache_flush_and_invalidate_nolock(comm, data_handle);
@@ -362,7 +384,8 @@ void starpu_mpi_cache_flush_all_data(MPI_Comm comm)
 {
 	struct _starpu_data_entry *entry, *tmp;
 
-	if (_starpu_cache_enabled == 0) return;
+	if (_starpu_cache_enabled == 0)
+		return;
 
 	STARPU_PTHREAD_MUTEX_LOCK(&_cache_mutex);
 	HASH_ITER(hh, _cache_data, entry, tmp)

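Most hunks above only harmonize the early-return style of the cache helpers; the public entry points for dropping cached replicas remain starpu_mpi_cache_flush() and starpu_mpi_cache_flush_all_data(). A short hedged sketch of their use (the handle is assumed to be registered already, and the staleness scenario is hypothetical):

    #include <starpu_mpi.h>

    /* Sketch: invalidate the communication cache when remote copies of
     * 'handle' are known to be stale. */
    static void drop_cached_copies(starpu_data_handle_t handle)
    {
        starpu_mpi_cache_flush(MPI_COMM_WORLD, handle);   /* this handle only */
        starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);  /* or every cached entry at once */
    }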
+ 6 - 3
mpi/src/starpu_mpi_cache_stats.c

@@ -28,7 +28,8 @@ void _starpu_mpi_cache_stats_init()
 	{
 		stats_enabled = 0;
 	}
-	if (stats_enabled == 0) return;
+	if (stats_enabled == 0)
+		return;
 
 	_STARPU_DISP("Warning: StarPU is executed with STARPU_MPI_CACHE_STATS=1, which slows execution down a bit\n");
 
@@ -36,14 +37,16 @@ void _starpu_mpi_cache_stats_init()
 
 void _starpu_mpi_cache_stats_shutdown()
 {
-	if (stats_enabled == 0) return;
+	if (stats_enabled == 0)
+		return;
 }
 
 void _starpu_mpi_cache_stats_update(unsigned dst, starpu_data_handle_t data_handle, int count)
 {
 	size_t size;
 
-	if (stats_enabled == 0) return;
+	if (stats_enabled == 0)
+		return;
 
 	size = starpu_data_get_size(data_handle);
 

+ 4 - 1
mpi/src/starpu_mpi_comm.c

@@ -181,7 +181,10 @@ int _starpu_mpi_comm_test_recv(MPI_Status *status, struct _starpu_mpi_envelope *
 			}
 		}
 		i++;
-		if (i == _starpu_mpi_comm_nb) i=0;
+		if (i == _starpu_mpi_comm_nb)
+		{
+			i=0;
+		}
 		if (i == _starpu_mpi_comm_tested)
 		{
 			// We have tested all the requests, none has completed

+ 17 - 29
mpi/src/starpu_mpi_datatype.c

@@ -41,26 +41,6 @@ void _starpu_mpi_datatype_shutdown(void)
 }
 
 /*
- * 	Bcsr
- */
-
-static void handle_to_datatype_bcsr(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
-{
-	int ret;
-
-	uint32_t r = starpu_bcsr_get_r(data_handle);
-	uint32_t c = starpu_bcsr_get_c(data_handle);
-	uint32_t nnz = starpu_bcsr_get_nnz(data_handle);
-	size_t elemsize = starpu_bcsr_get_elemsize(data_handle);
-
-	ret = MPI_Type_contiguous(r*c*nnz*elemsize, MPI_BYTE, datatype);
-	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_contiguous failed");
-
-	ret = MPI_Type_commit(datatype);
-	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_commit failed");
-}
-
-/*
  * 	Matrix
  */
 
@@ -168,8 +148,8 @@ static starpu_mpi_datatype_allocate_func_t handle_to_datatype_funcs[STARPU_MAX_I
 	[STARPU_MATRIX_INTERFACE_ID]	= handle_to_datatype_matrix,
 	[STARPU_BLOCK_INTERFACE_ID]	= handle_to_datatype_block,
 	[STARPU_VECTOR_INTERFACE_ID]	= handle_to_datatype_vector,
-	[STARPU_CSR_INTERFACE_ID]	= NULL,
-	[STARPU_BCSR_INTERFACE_ID]	= handle_to_datatype_bcsr,
+	[STARPU_CSR_INTERFACE_ID]	= NULL, /* Sent through pack/unpack operations */
+	[STARPU_BCSR_INTERFACE_ID]	= NULL, /* Sent through pack/unpack operations */
 	[STARPU_VARIABLE_INTERFACE_ID]	= handle_to_datatype_variable,
 	[STARPU_VOID_INTERFACE_ID]	= handle_to_datatype_void,
 	[STARPU_MULTIFORMAT_INTERFACE_ID] = NULL,
@@ -182,9 +162,17 @@ void _starpu_mpi_datatype_allocate(starpu_data_handle_t data_handle, struct _sta
 	if (id < STARPU_MAX_INTERFACE_ID)
 	{
 		starpu_mpi_datatype_allocate_func_t func = handle_to_datatype_funcs[id];
-		STARPU_ASSERT_MSG(func, "Handle To Datatype Function not defined for StarPU data interface %d", id);
-		func(data_handle, &req->datatype);
-		req->registered_datatype = 1;
+		if (func)
+		{
+			func(data_handle, &req->datatype);
+			req->registered_datatype = 1;
+		}
+		else
+		{
+			/* The datatype is predefined by StarPU but it will be sent as a memory area */
+			req->datatype = MPI_BYTE;
+			req->registered_datatype = 0;
+		}
 	}
 	else
 	{
@@ -256,8 +244,8 @@ static starpu_mpi_datatype_free_func_t handle_free_datatype_funcs[STARPU_MAX_INT
 	[STARPU_MATRIX_INTERFACE_ID]	= _starpu_mpi_handle_free_simple_datatype,
 	[STARPU_BLOCK_INTERFACE_ID]	= _starpu_mpi_handle_free_complex_datatype,
 	[STARPU_VECTOR_INTERFACE_ID]	= _starpu_mpi_handle_free_simple_datatype,
-	[STARPU_CSR_INTERFACE_ID]	= NULL,
-	[STARPU_BCSR_INTERFACE_ID]	= _starpu_mpi_handle_free_simple_datatype,
+	[STARPU_CSR_INTERFACE_ID]	= NULL,  /* Sent through pack/unpack operations */
+	[STARPU_BCSR_INTERFACE_ID]	= NULL,  /* Sent through pack/unpack operations */
 	[STARPU_VARIABLE_INTERFACE_ID]	= _starpu_mpi_handle_free_simple_datatype,
 	[STARPU_VOID_INTERFACE_ID]      = _starpu_mpi_handle_free_simple_datatype,
 	[STARPU_MULTIFORMAT_INTERFACE_ID] = NULL,
@@ -270,8 +258,8 @@ void _starpu_mpi_datatype_free(starpu_data_handle_t data_handle, MPI_Datatype *d
 	if (id < STARPU_MAX_INTERFACE_ID)
 	{
 		starpu_mpi_datatype_free_func_t func = handle_free_datatype_funcs[id];
-		STARPU_ASSERT_MSG(func, "Handle free datatype function not defined for StarPU data interface %d", id);
-		func(datatype);
+		if (func)
+			func(datatype);
 	}
 	else
 	{

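With the dedicated BCSR datatype removed, both the CSR and BCSR entries are now NULL, and the allocate/free paths treat a NULL entry as "no dedicated MPI datatype, transfer the data as raw bytes through the interface's pack/unpack operations". A generic sketch of that table-with-NULL-fallback idiom (names are illustrative, not StarPU's):

    #include <mpi.h>

    typedef void (*build_datatype_fn)(const void *desc, MPI_Datatype *dt);

    /* Sketch: a NULL table entry means "fall back to a raw byte transfer". */
    static void pick_datatype(build_datatype_fn table[], int id, const void *desc,
                              MPI_Datatype *dt, int *registered)
    {
        if (table[id])
        {
            table[id](desc, dt);   /* dedicated datatype (contiguous, vector, ...) */
            *registered = 1;
        }
        else
        {
            *dt = MPI_BYTE;        /* the interface packs/unpacks the data itself */
            *registered = 0;
        }
    }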
+ 7 - 7
mpi/src/starpu_mpi_early_data.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2014  Université de Bordeaux
+ * Copyright (C) 2009, 2010-2014, 2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -23,7 +23,7 @@
 
 struct _starpu_mpi_early_data_handle_hashlist
 {
-	struct _starpu_mpi_early_data_handle_list *list;
+	struct _starpu_mpi_early_data_handle_list list;
 	UT_hash_handle hh;
 	struct _starpu_mpi_node_tag node_tag;
 };
@@ -50,7 +50,7 @@ void _starpu_mpi_early_data_shutdown(void)
 	struct _starpu_mpi_early_data_handle_hashlist *current, *tmp;
 	HASH_ITER(hh, _starpu_mpi_early_data_handle_hashmap, current, tmp)
 	{
-		_starpu_mpi_early_data_handle_list_delete(current->list);
+		STARPU_ASSERT(_starpu_mpi_early_data_handle_list_empty(&current->list));
 		HASH_DEL(_starpu_mpi_early_data_handle_hashmap, current);
 		free(current);
 	}
@@ -84,14 +84,14 @@ struct _starpu_mpi_early_data_handle *_starpu_mpi_early_data_find(struct _starpu
 	}
 	else
 	{
-		if (_starpu_mpi_early_data_handle_list_empty(hashlist->list))
+		if (_starpu_mpi_early_data_handle_list_empty(&hashlist->list))
 		{
 			early_data_handle = NULL;
 		}
 		else
 		{
 			_starpu_mpi_early_data_handle_hashmap_count --;
-			early_data_handle = _starpu_mpi_early_data_handle_list_pop_front(hashlist->list);
+			early_data_handle = _starpu_mpi_early_data_handle_list_pop_front(&hashlist->list);
 		}
 	}
 	_STARPU_MPI_DEBUG(60, "Found early_data_handle %p with comm %ld source %d tag %d\n", early_data_handle, (long int)node_tag->comm, node_tag->rank, node_tag->data_tag);
@@ -110,11 +110,11 @@ void _starpu_mpi_early_data_add(struct _starpu_mpi_early_data_handle *early_data
 	if (hashlist == NULL)
 	{
 		_STARPU_MPI_MALLOC(hashlist, sizeof(struct _starpu_mpi_early_data_handle_hashlist));
-		hashlist->list = _starpu_mpi_early_data_handle_list_new();
+		_starpu_mpi_early_data_handle_list_init(&hashlist->list);
 		hashlist->node_tag = early_data_handle->node_tag;
 		HASH_ADD(hh, _starpu_mpi_early_data_handle_hashmap, node_tag, sizeof(hashlist->node_tag), hashlist);
 	}
-	_starpu_mpi_early_data_handle_list_push_back(hashlist->list, early_data_handle);
+	_starpu_mpi_early_data_handle_list_push_back(&hashlist->list, early_data_handle);
 	_starpu_mpi_early_data_handle_hashmap_count ++;
 	STARPU_PTHREAD_MUTEX_UNLOCK(&_starpu_mpi_early_data_handle_mutex);
 }

+ 7 - 7
mpi/src/starpu_mpi_early_request.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2014, 2016  Université de Bordeaux
+ * Copyright (C) 2009, 2010-2014, 2016-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -24,7 +24,7 @@
 /** stores application requests for which data have not been received yet */
 struct _starpu_mpi_early_request_hashlist
 {
-	struct _starpu_mpi_req_list *list;
+	struct _starpu_mpi_req_list list;
 	UT_hash_handle hh;
 	struct _starpu_mpi_node_tag node_tag;
 };
@@ -45,8 +45,8 @@ void _starpu_mpi_early_request_shutdown()
 	struct _starpu_mpi_early_request_hashlist *entry, *tmp;
 	HASH_ITER(hh, _starpu_mpi_early_request_hash, entry, tmp)
 	{
+		STARPU_ASSERT(_starpu_mpi_req_list_empty(&entry->list));
 		HASH_DEL(_starpu_mpi_early_request_hash, entry);
-		_starpu_mpi_req_list_delete(entry->list);
 		free(entry);
 	}
 	STARPU_PTHREAD_MUTEX_DESTROY(&_starpu_mpi_early_request_mutex);
@@ -82,13 +82,13 @@ struct _starpu_mpi_req* _starpu_mpi_early_request_dequeue(int data_tag, int sour
 	}
 	else
 	{
-		if (_starpu_mpi_req_list_empty(hashlist->list))
+		if (_starpu_mpi_req_list_empty(&hashlist->list))
 		{
 			found = NULL;
 		}
 		else
 		{
-			found = _starpu_mpi_req_list_pop_front(hashlist->list);
+			found = _starpu_mpi_req_list_pop_front(&hashlist->list);
 			_starpu_mpi_early_request_hash_count --;
 		}
 	}
@@ -107,11 +107,11 @@ void _starpu_mpi_early_request_enqueue(struct _starpu_mpi_req *req)
 	if (hashlist == NULL)
 	{
 		_STARPU_MPI_MALLOC(hashlist, sizeof(struct _starpu_mpi_early_request_hashlist));
-		hashlist->list = _starpu_mpi_req_list_new();
+		_starpu_mpi_req_list_init(&hashlist->list);
 		hashlist->node_tag = req->node_tag;
 		HASH_ADD(hh, _starpu_mpi_early_request_hash, node_tag, sizeof(hashlist->node_tag), hashlist);
 	}
-	_starpu_mpi_req_list_push_back(hashlist->list, req);
+	_starpu_mpi_req_list_push_back(&hashlist->list, req);
 	_starpu_mpi_early_request_hash_count ++;
 	STARPU_PTHREAD_MUTEX_UNLOCK(&_starpu_mpi_early_request_mutex);
 }

+ 35 - 2
mpi/src/starpu_mpi_fortran.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2016, 2017  CNRS
  * Copyright (C) 2016  Inria
+ * Copyright (C) 2017  Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -120,6 +121,10 @@ void fstarpu_mpi_redux_data(MPI_Fint comm, starpu_data_handle_t data_handle)
 {
 	starpu_mpi_redux_data(MPI_Comm_f2c(comm), data_handle);
 }
+void fstarpu_mpi_redux_data_prio(MPI_Fint comm, starpu_data_handle_t data_handle, int prio)
+{
+	starpu_mpi_redux_data_prio(MPI_Comm_f2c(comm), data_handle, prio);
+}
 
 /* scatter/gather */
 int fstarpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int cnt, int root, MPI_Fint comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg)
@@ -137,6 +142,10 @@ int fstarpu_mpi_isend_detached_unlock_tag(starpu_data_handle_t data_handle, int
 {
 	return starpu_mpi_isend_detached_unlock_tag(data_handle, dst, mpi_tag, MPI_Comm_f2c(comm), *starpu_tag);
 }
+int fstarpu_mpi_isend_detached_unlock_tag_prio(starpu_data_handle_t data_handle, int dst, int mpi_tag, int prio, MPI_Fint comm, starpu_tag_t *starpu_tag)
+{
+	return starpu_mpi_isend_detached_unlock_tag_prio(data_handle, dst, mpi_tag, prio, MPI_Comm_f2c(comm), *starpu_tag);
+}
 
 int fstarpu_mpi_irecv_detached_unlock_tag(starpu_data_handle_t data_handle, int src, int mpi_tag, MPI_Fint comm, starpu_tag_t *starpu_tag)
 {
@@ -144,7 +153,7 @@ int fstarpu_mpi_irecv_detached_unlock_tag(starpu_data_handle_t data_handle, int
 }
 
 /* isend/irecv array detached unlock tag */
-int fstarpu_mpi_isend_array_detached_unlock_tag(int array_size, starpu_data_handle_t *data_handles, int *dsts, int *mpi_tags, MPI_Fint *_comms, starpu_tag_t *starpu_tag)
+int fstarpu_mpi_isend_array_detached_unlock_tag_prio(int array_size, starpu_data_handle_t *data_handles, int *dsts, int *mpi_tags, int *prio, MPI_Fint *_comms, starpu_tag_t *starpu_tag)
 {
 	MPI_Comm comms[array_size];
 	int i;
@@ -152,9 +161,13 @@ int fstarpu_mpi_isend_array_detached_unlock_tag(int array_size, starpu_data_hand
 	{
 		comms[i] = MPI_Comm_f2c(_comms[i]);
 	}
-	int ret = starpu_mpi_isend_array_detached_unlock_tag((unsigned)array_size, data_handles, dsts, mpi_tags, comms, *starpu_tag);
+	int ret = starpu_mpi_isend_array_detached_unlock_tag_prio((unsigned)array_size, data_handles, dsts, mpi_tags, prio, comms, *starpu_tag);
 	return ret;
 }
+int fstarpu_mpi_isend_array_detached_unlock_tag(int array_size, starpu_data_handle_t *data_handles, int *dsts, int *mpi_tags, MPI_Fint *_comms, starpu_tag_t *starpu_tag)
+{
+	return fstarpu_mpi_isend_array_detached_unlock_tag_prio(array_size, data_handles, dsts, mpi_tags, NULL, _comms, starpu_tag);
+}
 
 int fstarpu_mpi_irecv_array_detached_unlock_tag(int array_size, starpu_data_handle_t *data_handles, int *srcs, int *mpi_tags, MPI_Fint *_comms, starpu_tag_t *starpu_tag)
 {
@@ -173,6 +186,10 @@ int fstarpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *req, int
 {
 	return starpu_mpi_isend(data_handle, req, dst, mpi_tag, MPI_Comm_f2c(comm));
 }
+int fstarpu_mpi_isend_prio(starpu_data_handle_t data_handle, starpu_mpi_req *req, int dst, int mpi_tag, int prio, MPI_Fint comm)
+{
+	return starpu_mpi_isend_prio(data_handle, req, dst, mpi_tag, prio, MPI_Comm_f2c(comm));
+}
 
 int fstarpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *req, int src, int mpi_tag, MPI_Fint comm)
 {
@@ -184,6 +201,10 @@ int fstarpu_mpi_send(starpu_data_handle_t data_handle, int dst, int mpi_tag, MPI
 {
 	return starpu_mpi_send(data_handle, dst, mpi_tag, MPI_Comm_f2c(comm));
 }
+int fstarpu_mpi_send_prio(starpu_data_handle_t data_handle, int dst, int mpi_tag, int prio, MPI_Fint comm)
+{
+	return starpu_mpi_send_prio(data_handle, dst, mpi_tag, prio, MPI_Comm_f2c(comm));
+}
 
 int fstarpu_mpi_recv(starpu_data_handle_t data_handle, int src, int mpi_tag, MPI_Fint comm, MPI_Status *status)
 {
@@ -195,6 +216,10 @@ int fstarpu_mpi_isend_detached(starpu_data_handle_t data_handle, int dst, int mp
 {
 	return starpu_mpi_isend_detached(data_handle, dst, mpi_tag, MPI_Comm_f2c(comm), callback, arg);
 }
+int fstarpu_mpi_isend_detached_prio(starpu_data_handle_t data_handle, int dst, int mpi_tag, int prio, MPI_Fint comm, void (*callback)(void *), void *arg)
+{
+	return starpu_mpi_isend_detached_prio(data_handle, dst, mpi_tag, prio, MPI_Comm_f2c(comm), callback, arg);
+}
 
 int fstarpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int src, int mpi_tag, MPI_Fint comm, void (*callback)(void *), void *arg)
 {
@@ -206,11 +231,19 @@ int fstarpu_mpi_issend(starpu_data_handle_t data_handle, starpu_mpi_req *req, in
 {
 	return starpu_mpi_issend(data_handle, req, dst, mpi_tag, MPI_Comm_f2c(comm));
 }
+int fstarpu_mpi_issend_prio(starpu_data_handle_t data_handle, starpu_mpi_req *req, int dst, int mpi_tag, int prio, MPI_Fint comm)
+{
+	return starpu_mpi_issend_prio(data_handle, req, dst, mpi_tag, prio, MPI_Comm_f2c(comm));
+}
 
 int fstarpu_mpi_issend_detached(starpu_data_handle_t data_handle, int dst, int mpi_tag, MPI_Fint comm, void (*callback)(void *), void *arg)
 {
 	return starpu_mpi_issend_detached(data_handle, dst, mpi_tag, MPI_Comm_f2c(comm), callback, arg);
 }
+int fstarpu_mpi_issend_detached_prio(starpu_data_handle_t data_handle, int dst, int mpi_tag, int prio, MPI_Fint comm, void (*callback)(void *), void *arg)
+{
+	return starpu_mpi_issend_detached_prio(data_handle, dst, mpi_tag, prio, MPI_Comm_f2c(comm), callback, arg);
+}
 
 /* cache */
 void fstarpu_mpi_cache_flush(MPI_Fint comm, starpu_data_handle_t data_handle)

+ 24 - 0
mpi/src/starpu_mpi_fxt.h

@@ -49,6 +49,10 @@ extern "C"
 #define _STARPU_MPI_FUT_DATA_SET_RANK			0x521a
 #define _STARPU_MPI_FUT_IRECV_TERMINATED		0x521b
 #define _STARPU_MPI_FUT_ISEND_TERMINATED		0x521c
+#define _STARPU_MPI_FUT_TESTING_DETACHED_BEGIN		0x521d
+#define _STARPU_MPI_FUT_TESTING_DETACHED_END		0x521e
+#define _STARPU_MPI_FUT_TEST_BEGIN			0x521f
+#define _STARPU_MPI_FUT_TEST_END			0x5220
 
 #ifdef STARPU_USE_FXT
 #define _STARPU_MPI_TRACE_START(rank, worldsize)	\
@@ -98,6 +102,22 @@ extern "C"
 	FUT_DO_PROBE3(_STARPU_MPI_FUT_UWAIT_END, (src), (mpi_tag), _starpu_gettid());
 #define _STARPU_MPI_TRACE_DATA_SET_RANK(handle, rank)	\
 	FUT_DO_PROBE3(_STARPU_MPI_FUT_DATA_SET_RANK, (handle), (rank), _starpu_gettid());
+#if 0
+/* This is very expensive in the trace; only enable it for debugging */
+#define _STARPU_MPI_TRACE_TESTING_DETACHED_BEGIN()	\
+	FUT_DO_PROBE1(_STARPU_MPI_FUT_TESTING_DETACHED_BEGIN, _starpu_gettid());
+#define _STARPU_MPI_TRACE_TESTING_DETACHED_END()	\
+	FUT_DO_PROBE1(_STARPU_MPI_FUT_TESTING_DETACHED_END, _starpu_gettid());
+#define _STARPU_MPI_TRACE_TEST_BEGIN(peer, mpi_tag)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_TEST_BEGIN, (peer), (mpi_tag), _starpu_gettid());
+#define _STARPU_MPI_TRACE_TEST_END(peer, mpi_tag)	\
+	FUT_DO_PROBE3(_STARPU_MPI_FUT_TEST_END, (peer), (mpi_tag), _starpu_gettid());
+#else
+#define _STARPU_MPI_TRACE_TESTING_DETACHED_BEGIN()		do {} while(0)
+#define _STARPU_MPI_TRACE_TESTING_DETACHED_END()		do {} while(0)
+#define _STARPU_MPI_TRACE_TEST_BEGIN(peer, mpi_tag)		do {} while(0)
+#define _STARPU_MPI_TRACE_TEST_END(peer, mpi_tag)		do {} while(0)
+#endif
 #define TRACE
 #else
 #define _STARPU_MPI_TRACE_START(a, b)				do {} while(0);
@@ -123,6 +143,10 @@ extern "C"
 #define _STARPU_MPI_TRACE_UWAIT_BEGIN(a, b)			do {} while(0);
 #define _STARPU_MPI_TRACE_UWAIT_END(a, b)			do {} while(0);
 #define _STARPU_MPI_TRACE_DATA_SET_RANK(a, b)			do {} while(0);
+#define _STARPU_MPI_TRACE_TESTING_DETACHED_BEGIN()		do {} while(0)
+#define _STARPU_MPI_TRACE_TESTING_DETACHED_END()		do {} while(0)
+#define _STARPU_MPI_TRACE_TEST_BEGIN(peer, mpi_tag)		do {} while(0)
+#define _STARPU_MPI_TRACE_TEST_END(peer, mpi_tag)		do {} while(0)
 #endif
 
 #ifdef __cplusplus

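The new probes follow the usual FxT pattern: when tracing is disabled, or, as here, deliberately compiled out because the probes are too costly, the macros reduce to do {} while(0) statements that stay safe in any syntactic position. A generic sketch of the idiom (names are illustrative):

    #include <stdio.h>

    /* Sketch: a compile-time switchable probe that still forms a single
     * statement when disabled, so braceless if/else bodies keep working. */
    #ifdef ENABLE_MY_PROBES
    #define MY_TRACE_TEST_BEGIN(peer, tag) printf("test_begin %d %d\n", (peer), (tag))
    #else
    #define MY_TRACE_TEST_BEGIN(peer, tag) do {} while (0)
    #endif

    static void poll_one(int peer, int tag, int flag)
    {
        (void) peer; (void) tag;
        if (flag)
            MY_TRACE_TEST_BEGIN(peer, tag); /* a no-op unless ENABLE_MY_PROBES is defined */
    }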
+ 19 - 6
mpi/src/starpu_mpi_helper.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2015  Université de Bordeaux
+ * Copyright (C) 2010, 2015, 2017  Université de Bordeaux
  * Copyright (C) 2010, 2012, 2014, 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -27,13 +27,17 @@ static void starpu_mpi_unlock_tag_callback(void *arg)
 	free(tagptr);
 }
 
-int starpu_mpi_isend_detached_unlock_tag(starpu_data_handle_t data_handle, int dest, int data_tag, MPI_Comm comm, starpu_tag_t tag)
+int starpu_mpi_isend_detached_unlock_tag_prio(starpu_data_handle_t data_handle, int dest, int data_tag, int prio, MPI_Comm comm, starpu_tag_t tag)
 {
 	starpu_tag_t *tagptr;
 	_STARPU_MPI_MALLOC(tagptr, sizeof(starpu_tag_t));
 	*tagptr = tag;
 
-	return starpu_mpi_isend_detached(data_handle, dest, data_tag, comm, starpu_mpi_unlock_tag_callback, tagptr);
+	return starpu_mpi_isend_detached_prio(data_handle, dest, data_tag, prio, comm, starpu_mpi_unlock_tag_callback, tagptr);
+}
+int starpu_mpi_isend_detached_unlock_tag(starpu_data_handle_t data_handle, int dest, int data_tag, MPI_Comm comm, starpu_tag_t tag)
+{
+	return starpu_mpi_isend_detached_unlock_tag_prio(data_handle, dest, data_tag, 0, comm, tag);
 }
 
 
@@ -65,8 +69,8 @@ static void starpu_mpi_array_unlock_callback(void *_arg)
 	}
 }
 
-int starpu_mpi_isend_array_detached_unlock_tag(unsigned array_size,
-		starpu_data_handle_t *data_handle, int *dest, int *data_tag,
+int starpu_mpi_isend_array_detached_unlock_tag_prio(unsigned array_size,
+		starpu_data_handle_t *data_handle, int *dest, int *data_tag, int *prio,
 		MPI_Comm *comm, starpu_tag_t tag)
 {
 	if (!array_size)
@@ -80,11 +84,20 @@ int starpu_mpi_isend_array_detached_unlock_tag(unsigned array_size,
 	unsigned elem;
 	for (elem = 0; elem < array_size; elem++)
 	{
-		starpu_mpi_isend_detached(data_handle[elem], dest[elem], data_tag[elem], comm[elem], starpu_mpi_array_unlock_callback, arg);
+		int p = 0;
+		if (prio)
+			p = prio[elem];
+		starpu_mpi_isend_detached_prio(data_handle[elem], dest[elem], data_tag[elem], p, comm[elem], starpu_mpi_array_unlock_callback, arg);
 	}
 
 	return 0;
 }
+int starpu_mpi_isend_array_detached_unlock_tag(unsigned array_size,
+		starpu_data_handle_t *data_handle, int *dest, int *data_tag,
+		MPI_Comm *comm, starpu_tag_t tag)
+{
+	return starpu_mpi_isend_array_detached_unlock_tag_prio(array_size, data_handle, dest, data_tag, NULL, comm, tag);
+}
 
 
 int starpu_mpi_irecv_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *source, int *data_tag, MPI_Comm *comm, starpu_tag_t tag)

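The array helper gains an optional per-element priority array; passing NULL, as the compatibility wrapper does, keeps every element at priority 0. A hedged usage sketch, assuming the three handles are already registered with StarPU-MPI (destinations, tags and priorities are illustrative):

    #include <starpu_mpi.h>

    /* Sketch: send three handles to node 1 and unlock 'tag' once all of them
     * have been handed over; the first send gets a higher priority. */
    static void send_three(starpu_data_handle_t handles[3], starpu_tag_t tag)
    {
        int dests[3]      = { 1, 1, 1 };
        int data_tags[3]  = { 10, 11, 12 };
        int prios[3]      = { 5, 0, 0 };
        MPI_Comm comms[3] = { MPI_COMM_WORLD, MPI_COMM_WORLD, MPI_COMM_WORLD };

        starpu_mpi_isend_array_detached_unlock_tag_prio(3, handles, dests, data_tags,
                                                        prios, comms, tag);
    }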
+ 7 - 2
mpi/src/starpu_mpi_private.h

@@ -24,6 +24,7 @@
 #include <starpu_mpi.h>
 #include <starpu_mpi_fxt.h>
 #include <common/list.h>
+#include <common/prio_list.h>
 #include <core/simgrid.h>
 
 #ifdef __cplusplus
@@ -91,7 +92,7 @@ int _starpu_debug_rank;
 
 #define _STARPU_MPI_MALLOC(ptr, size) do { ptr = malloc(size); STARPU_MPI_ASSERT_MSG(ptr != NULL, "Cannot allocate %ld bytes\n", (long) size); } while (0)
 #define _STARPU_MPI_CALLOC(ptr, nmemb, size) do { ptr = calloc(nmemb, size); STARPU_MPI_ASSERT_MSG(ptr != NULL, "Cannot allocate %ld bytes\n", (long) (nmemb*size)); } while (0)
-#define _STARPU_MPI_REALLOC(ptr, size) do { ptr = realloc(ptr, size); STARPU_MPI_ASSERT_MSG(ptr != NULL, "Cannot reallocate %ld bytes\n", (long) size); } while (0)
+#define _STARPU_MPI_REALLOC(ptr, size) do { void *_new_ptr = realloc(ptr, size); STARPU_MPI_ASSERT_MSG(_new_ptr != NULL, "Cannot reallocate %ld bytes\n", (long) size); ptr = _new_ptr; } while (0)
 
 #ifdef STARPU_VERBOSE
 #  define _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, node, tag, utag, comm, way) \
@@ -199,6 +200,8 @@ LIST_TYPE(_starpu_mpi_req,
 	/* description of the data at StarPU level */
 	starpu_data_handle_t data_handle;
 
+	int prio;
+
 	/* description of the data to be sent/received */
 	MPI_Datatype datatype;
 	char *datatype_name;
@@ -243,7 +246,8 @@ LIST_TYPE(_starpu_mpi_req,
 
         struct _starpu_mpi_envelope* envelope;
 
-	int is_internal_req;
+	unsigned is_internal_req:1;
+	unsigned to_destroy:1;
 	struct _starpu_mpi_req *internal_req;
 	struct _starpu_mpi_early_data_handle *early_data_handle;
 
@@ -261,6 +265,7 @@ LIST_TYPE(_starpu_mpi_req,
 #endif
 
 );
+PRIO_LIST_TYPE(_starpu_mpi_req, prio)
 
 struct _starpu_mpi_argc_argv
 {

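Requests now carry a priority, the request list gets a priority-ordered companion (sorted by increasing values, which is why the progression loop pops with _starpu_mpi_req_prio_list_pop_back_highest()), and is_internal_req/to_destroy become two one-bit flags implementing the "whoever finishes last destroys the request" handshake seen earlier. A sketch of how the priority list is consumed, assuming mpi/src/starpu_mpi_private.h is included; the push helper name is assumed by analogy with the plain list API used elsewhere in this patch:

    /* Sketch (internal API): the producer sets req->prio before queueing,
     * the progression thread dequeues the highest-priority send first. */
    static struct _starpu_mpi_req *queue_then_pick(struct _starpu_mpi_req_prio_list *sends,
                                                   struct _starpu_mpi_req *req)
    {
        req->prio = 10;
        _starpu_mpi_req_prio_list_push_back(sends, req);          /* assumed helper name */
        return _starpu_mpi_req_prio_list_pop_back_highest(sends); /* highest prio wins */
    }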
+ 3 - 2
mpi/src/starpu_mpi_select_node.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2014, 2015, 2016  CNRS
+ * Copyright (C) 2014, 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -57,7 +57,8 @@ int starpu_mpi_node_selection_register_policy(starpu_mpi_select_node_policy_func
 	// Look for an unregistered policy
 	while(i<_STARPU_MPI_NODE_SELECTION_MAX_POLICY)
 	{
-		if (_policies[i] == NULL) break;
+		if (_policies[i] == NULL)
+			break;
 		i++;
 	}
 	STARPU_ASSERT_MSG(i<_STARPU_MPI_NODE_SELECTION_MAX_POLICY, "No unused policy available. Unregister existing policies before registering a new one.");

+ 10 - 5
mpi/src/starpu_mpi_stats.c

@@ -32,7 +32,8 @@ void _starpu_mpi_comm_amounts_init(MPI_Comm comm)
 		stats_enabled = 0;
 	}
 
-	if (stats_enabled == 0) return;
+	if (stats_enabled == 0)
+		return;
 
 	_STARPU_DISP("Warning: StarPU is executed with STARPU_COMM_STATS=1, which slows execution down a bit\n");
 
@@ -44,7 +45,8 @@ void _starpu_mpi_comm_amounts_init(MPI_Comm comm)
 
 void _starpu_mpi_comm_amounts_shutdown()
 {
-	if (stats_enabled == 0) return;
+	if (stats_enabled == 0)
+		return;
 	free(comm_amount);
 }
 
@@ -52,7 +54,8 @@ void _starpu_mpi_comm_amounts_inc(MPI_Comm comm, unsigned dst, MPI_Datatype data
 {
 	int src, size;
 
-	if (stats_enabled == 0) return;
+	if (stats_enabled == 0)
+		return;
 
 	starpu_mpi_comm_rank(comm, &src);
 	MPI_Type_size(datatype, &size);
@@ -64,7 +67,8 @@ void _starpu_mpi_comm_amounts_inc(MPI_Comm comm, unsigned dst, MPI_Datatype data
 
 void starpu_mpi_comm_amounts_retrieve(size_t *comm_amounts)
 {
-	if (stats_enabled == 0) return;
+	if (stats_enabled == 0)
+		return;
 	memcpy(comm_amounts, comm_amount, world_size * sizeof(size_t));
 }
 
@@ -73,7 +77,8 @@ void _starpu_mpi_comm_amounts_display(FILE *stream, int node)
 	int dst;
 	size_t sum = 0;
 
-	if (stats_enabled == 0) return;
+	if (stats_enabled == 0)
+		return;
 
 	for (dst = 0; dst < world_size; dst++)
 	{

+ 9 - 9
mpi/src/starpu_mpi_sync_data.c

@@ -22,7 +22,7 @@
 
 struct _starpu_mpi_sync_data_handle_hashlist
 {
-	struct _starpu_mpi_req_list *list;
+	struct _starpu_mpi_req_list list;
 	UT_hash_handle hh;
 	struct _starpu_mpi_node_tag node_tag;
 };
@@ -44,7 +44,7 @@ void _starpu_mpi_sync_data_shutdown(void)
 	struct _starpu_mpi_sync_data_handle_hashlist *current, *tmp;
 	HASH_ITER(hh, _starpu_mpi_sync_data_handle_hashmap, current, tmp)
 	{
-		_starpu_mpi_req_list_delete(current->list);
+		STARPU_ASSERT(_starpu_mpi_req_list_empty(&current->list));
 		HASH_DEL(_starpu_mpi_sync_data_handle_hashmap, current);
 		free(current);
 	}
@@ -62,15 +62,15 @@ void _starpu_mpi_sync_data_handle_display_hash(struct _starpu_mpi_node_tag *node
 	{
 		_STARPU_MPI_DEBUG(60, "Hashlist for comm %ld source %d and tag %d does not exist\n", (long int)node_tag->comm, node_tag->rank, node_tag->data_tag);
 	}
-	else if (_starpu_mpi_req_list_empty(hashlist->list))
+	else if (_starpu_mpi_req_list_empty(&hashlist->list))
 	{
 		_STARPU_MPI_DEBUG(60, "Hashlist for comm %ld source %d and tag %d is empty\n", (long int)node_tag->comm, node_tag->rank, node_tag->data_tag);
 	}
 	else
 	{
 		struct _starpu_mpi_req *cur;
-		for (cur = _starpu_mpi_req_list_begin(hashlist->list) ;
-		     cur != _starpu_mpi_req_list_end(hashlist->list);
+		for (cur = _starpu_mpi_req_list_begin(&hashlist->list) ;
+		     cur != _starpu_mpi_req_list_end(&hashlist->list);
 		     cur = _starpu_mpi_req_list_next(cur))
 		{
 			_STARPU_MPI_DEBUG(60, "Element for comm %ld source %d and tag %d: %p\n", (long int)node_tag->comm, node_tag->rank, node_tag->data_tag, cur);
@@ -110,13 +110,13 @@ struct _starpu_mpi_req *_starpu_mpi_sync_data_find(int data_tag, int source, MPI
 	}
 	else
 	{
-		if (_starpu_mpi_req_list_empty(found->list))
+		if (_starpu_mpi_req_list_empty(&found->list))
 		{
 			req = NULL;
 		}
 		else
 		{
-			req = _starpu_mpi_req_list_pop_front(found->list);
+			req = _starpu_mpi_req_list_pop_front(&found->list);
 			_starpu_mpi_sync_data_handle_hashmap_count --;
 		}
 	}
@@ -136,11 +136,11 @@ void _starpu_mpi_sync_data_add(struct _starpu_mpi_req *sync_req)
 	if (hashlist == NULL)
 	{
 		_STARPU_MPI_MALLOC(hashlist, sizeof(struct _starpu_mpi_sync_data_handle_hashlist));
-		hashlist->list = _starpu_mpi_req_list_new();
+		_starpu_mpi_req_list_init(&hashlist->list);
 		hashlist->node_tag = sync_req->node_tag;
 		HASH_ADD(hh, _starpu_mpi_sync_data_handle_hashmap, node_tag, sizeof(hashlist->node_tag), hashlist);
 	}
-	_starpu_mpi_req_list_push_back(hashlist->list, sync_req);
+	_starpu_mpi_req_list_push_back(&hashlist->list, sync_req);
 	_starpu_mpi_sync_data_handle_hashmap_count ++;
 	STARPU_PTHREAD_MUTEX_UNLOCK(&_starpu_mpi_sync_data_handle_mutex);
 #ifdef STARPU_VERBOSE

+ 4 - 1
mpi/src/starpu_mpi_tag.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
- * Copyright (C) 2011-2015  Université de Bordeaux
+ * Copyright (C) 2011-2015, 2017  Université de Bordeaux
  * Copyright (C) 2014 INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -76,6 +76,9 @@ starpu_data_handle_t _starpu_mpi_tag_get_data_handle_from_tag(int tag)
 void _starpu_mpi_tag_data_register(starpu_data_handle_t handle, int tag)
 {
 	struct handle_tag_entry *entry;
+	if (tag == -1)
+		/* No tag for this data: probably temporary data that will never be communicated */
+		return;
 	_STARPU_MPI_MALLOC(entry, sizeof(*entry));
 
 	STARPU_ASSERT_MSG(!(_starpu_mpi_tag_get_data_handle_from_tag(tag)),

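The early return above lets data without a tag, typically temporary data that is never communicated, go through registration without polluting the tag/handle table. A hedged sketch of the two registration flavours (tag values and the scratch scenario are hypothetical; -1 is the convention the check above relies on):

    #include <starpu_mpi.h>

    /* Sketch: data exchanged between nodes needs a unique tag, purely local
     * scratch data can be registered with tag -1 and is simply skipped. */
    static void register_handles(starpu_data_handle_t shared,
                                 starpu_data_handle_t scratch, int owner)
    {
        starpu_mpi_data_register(shared, 1000, owner); /* will be sent/received */
        starpu_mpi_data_register(scratch, -1, owner);  /* never communicated */
    }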
+ 78 - 38
mpi/src/starpu_mpi_task_insert.c

@@ -30,11 +30,15 @@
 #include <starpu_mpi_cache.h>
 #include <starpu_mpi_select_node.h>
 
-#define _SEND_DATA(data, mode, dest, data_tag, comm, callback, arg)     \
-	if (mode & STARPU_SSEND)					\
-		starpu_mpi_issend_detached(data, dest, data_tag, comm, callback, arg); \
-	else								\
-		starpu_mpi_isend_detached(data, dest, data_tag, comm, callback, arg);
+#include "starpu_mpi_task_insert.h"
+
+#define _SEND_DATA(data, mode, dest, data_tag, prio, comm, callback, arg)     \
+	do {									\
+		if (mode & STARPU_SSEND)					\
+			starpu_mpi_issend_detached_prio(data, dest, data_tag, prio, comm, callback, arg); 	\
+		else												\
+			starpu_mpi_isend_detached_prio(data, dest, data_tag, prio, comm, callback, arg);	\
+	} while (0)
 
 static void (*pre_submit_hook)(struct starpu_task *task) = NULL;
 
@@ -80,11 +84,11 @@ int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_data_a
 			// No node has been selected yet
 			*xrank = mpi_rank;
 			_STARPU_MPI_DEBUG(100, "Codelet is going to be executed by node %d\n", *xrank);
-			*do_execute = (mpi_rank == me);
+			*do_execute = mpi_rank == STARPU_MPI_PER_NODE || (mpi_rank == me);
 		}
 		else if (mpi_rank != *xrank)
 		{
-			_STARPU_MPI_DEBUG(100, "Another node %d had already been selected to execute the codelet\n", *xrank);
+			_STARPU_MPI_DEBUG(100, "Another node %d had already been selected to execute the codelet, cannot now select node %d\n", *xrank, mpi_rank);
 			*inconsistent_execute = 1;
 		}
 	}
@@ -92,8 +96,12 @@ int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_data_a
 	return 0;
 }
 
-void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int xrank, int do_execute, MPI_Comm comm)
+void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int xrank, int do_execute, int prio, MPI_Comm comm)
 {
+	if (data && xrank == STARPU_MPI_PER_NODE)
+	{
+		STARPU_ASSERT_MSG(starpu_mpi_data_get_rank(data) == STARPU_MPI_PER_NODE, "If the task is replicated, it can only access per-node data");
+	}
 	if (data && mode & STARPU_R)
 	{
 		int mpi_rank = starpu_mpi_data_get_rank(data);
@@ -103,7 +111,7 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
 			_STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register\n");
 		}
 
-		if (do_execute && mpi_rank != me)
+		if (do_execute && mpi_rank != STARPU_MPI_PER_NODE && mpi_rank != me)
 		{
 			/* The node is going to execute the codelet, but it does not own the data, it needs to receive the data from the owner node */
 			int already_received = _starpu_mpi_cache_received_data_set(data);
@@ -126,7 +134,7 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
 				if (data_tag == -1)
 					_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
 				_STARPU_MPI_DEBUG(1, "Sending data %p to %d\n", data, xrank);
-				_SEND_DATA(data, mode, xrank, data_tag, comm, NULL, NULL);
+				_SEND_DATA(data, mode, xrank, data_tag, prio, comm, NULL, NULL);
 			}
 			// Else the data has already been sent
 		}
@@ -134,7 +142,7 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
 }
 
 static
-void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int xrank, int do_execute, MPI_Comm comm)
+void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int xrank, int do_execute, int prio, MPI_Comm comm)
 {
 	if (mode & STARPU_W)
 	{
@@ -144,9 +152,13 @@ void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum s
 		{
 			_STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register\n");
 		}
+		if (mpi_rank == STARPU_MPI_PER_NODE)
+		{
+			mpi_rank = me;
+		}
 		if (mpi_rank == me)
 		{
-			if (xrank != -1 && me != xrank)
+			if (xrank != -1 && (xrank != STARPU_MPI_PER_NODE && me != xrank))
 			{
 				_STARPU_MPI_DEBUG(1, "Receive data %p back from the task %d which executed the codelet ...\n", data, xrank);
 				if(data_tag == -1)
@@ -159,7 +171,7 @@ void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum s
 			if(data_tag == -1)
 				_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
 			_STARPU_MPI_DEBUG(1, "Send data %p back to its owner %d...\n", data, mpi_rank);
-			_SEND_DATA(data, mode, mpi_rank, data_tag, comm, NULL, NULL);
+			_SEND_DATA(data, mode, mpi_rank, data_tag, prio, comm, NULL, NULL);
 		}
 	}
 }
@@ -182,6 +194,10 @@ void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum star
 		if ((mode & STARPU_R) && do_execute)
 		{
 			int mpi_rank = starpu_mpi_data_get_rank(data);
+			if (mpi_rank == STARPU_MPI_PER_NODE)
+			{
+				mpi_rank = me;
+			}
 			if (mpi_rank != me && mpi_rank != -1)
 			{
 				starpu_data_invalidate_submit(data);
@@ -191,8 +207,9 @@ void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum star
 }
 
 static
-int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nodes, int *xrank, int *do_execute, struct starpu_data_descr **descrs_p, int *nb_data_p, va_list varg_list)
+int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nodes, int *xrank, int *do_execute, struct starpu_data_descr **descrs_p, int *nb_data_p, int *prio_p, va_list varg_list)
 {
+	/* XXX: _fstarpu_mpi_task_decode_v needs to be updated at the same time */
 	va_list varg_list_copy;
 	int inconsistent_execute = 0;
 	int arg_type;
@@ -200,6 +217,7 @@ int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nod
 	int nb_allocated_data = 16;
 	struct starpu_data_descr *descrs;
 	int nb_data;
+	int prio = 0;
 	int select_node_policy = STARPU_MPI_NODE_SELECTION_CURRENT_POLICY;
 
 	_STARPU_TRACE_TASK_MPI_DECODE_START();
@@ -348,7 +366,7 @@ int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nod
 		}
 		else if (arg_type==STARPU_PRIORITY)
 		{
-			(void)va_arg(varg_list_copy, int);
+			prio = va_arg(varg_list_copy, int);
 		}
 		/* STARPU_EXECUTE_ON_NODE handled above */
 		/* STARPU_EXECUTE_ON_DATA handled above */
@@ -431,32 +449,34 @@ int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nod
 	if (inconsistent_execute == 1 || *xrank == -1)
 	{
 		// We need to find out which node is going to execute the codelet.
-		_STARPU_MPI_DISP("Different nodes are owning W data. The node to execute the codelet is going to be selected with the current selection node policy. See starpu_mpi_node_selection_set_current_policy() to change the policy, or use STARPU_EXECUTE_ON_NODE or STARPU_EXECUTE_ON_DATA to specify the node\n");
+		_STARPU_MPI_DEBUG(100, "Different nodes are owning W data. The node to execute the codelet is going to be selected with the current selection node policy. See starpu_mpi_node_selection_set_current_policy() to change the policy, or use STARPU_EXECUTE_ON_NODE or STARPU_EXECUTE_ON_DATA to specify the node\n");
 		*xrank = _starpu_mpi_select_node(me, nb_nodes, descrs, nb_data, select_node_policy);
-		*do_execute = (me == *xrank);
+		*do_execute = *xrank == STARPU_MPI_PER_NODE || (me == *xrank);
 	}
 	else
 	{
 		_STARPU_MPI_DEBUG(100, "Inconsistent=%d - xrank=%d\n", inconsistent_execute, *xrank);
-		*do_execute = (me == *xrank);
+		*do_execute = *xrank == STARPU_MPI_PER_NODE || (me == *xrank);
 	}
 	_STARPU_MPI_DEBUG(100, "do_execute=%d\n", *do_execute);
 
 	*descrs_p = descrs;
 	*nb_data_p = nb_data;
+	*prio_p = prio;
 
 	_STARPU_TRACE_TASK_MPI_DECODE_END();
 	return 0;
 }
 
 static
-int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, struct starpu_task **task, int *xrank_p, struct starpu_data_descr **descrs_p, int *nb_data_p, va_list varg_list)
+int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, struct starpu_task **task, int *xrank_p, struct starpu_data_descr **descrs_p, int *nb_data_p, int *prio_p, va_list varg_list)
 {
 	int me, do_execute, xrank, nb_nodes;
 	int ret;
 	int i;
 	struct starpu_data_descr *descrs;
 	int nb_data;
+	int prio;
 
 	_STARPU_MPI_LOG_IN();
 
@@ -464,25 +484,36 @@ int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, stru
 	starpu_mpi_comm_size(comm, &nb_nodes);
 
 	/* Find out whether we are to execute the data because we own the data to be written to. */
-	ret = _starpu_mpi_task_decode_v(codelet, me, nb_nodes, &xrank, &do_execute, &descrs, &nb_data, varg_list);
-	if (ret < 0) return ret;
+	ret = _starpu_mpi_task_decode_v(codelet, me, nb_nodes, &xrank, &do_execute, &descrs, &nb_data, &prio, varg_list);
+	if (ret < 0)
+		return ret;
 
 	_STARPU_TRACE_TASK_MPI_PRE_START();
 	/* Send and receive data as requested */
 	for(i=0 ; i<nb_data ; i++)
 	{
-		_starpu_mpi_exchange_data_before_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, comm);
+		_starpu_mpi_exchange_data_before_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, prio, comm);
 	}
 
-	if (xrank_p) *xrank_p = xrank;
-	if (nb_data_p) *nb_data_p = nb_data;
+	if (xrank_p)
+		*xrank_p = xrank;
+	if (nb_data_p)
+		*nb_data_p = nb_data;
+	if (prio_p)
+		*prio_p = prio;
+
 	if (descrs_p)
 		*descrs_p = descrs;
 	else
 		free(descrs);
+
+
 	_STARPU_TRACE_TASK_MPI_PRE_END();
 
-	if (do_execute == 0) return 1;
+	if (do_execute == 0)
+	{
+		return 1;
+	}
 	else
 	{
 		va_list varg_list_copy;
@@ -499,7 +530,7 @@ int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, stru
 	}
 }
 
-int _starpu_mpi_task_postbuild_v(MPI_Comm comm, int xrank, int do_execute, struct starpu_data_descr *descrs, int nb_data)
+int _starpu_mpi_task_postbuild_v(MPI_Comm comm, int xrank, int do_execute, struct starpu_data_descr *descrs, int nb_data, int prio)
 {
 	int me, i;
 
@@ -508,7 +539,7 @@ int _starpu_mpi_task_postbuild_v(MPI_Comm comm, int xrank, int do_execute, struc
 
 	for(i=0 ; i<nb_data ; i++)
 	{
-		_starpu_mpi_exchange_data_after_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, comm);
+		_starpu_mpi_exchange_data_after_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, prio, comm);
 		_starpu_mpi_clear_data_after_execution(descrs[i].handle, descrs[i].mode, me, do_execute);
 	}
 
@@ -528,9 +559,11 @@ int _starpu_mpi_task_insert_v(MPI_Comm comm, struct starpu_codelet *codelet, va_
 	int do_execute = 0;
 	struct starpu_data_descr *descrs;
 	int nb_data;
+	int prio;
 
-	ret = _starpu_mpi_task_build_v(comm, codelet, &task, &xrank, &descrs, &nb_data, varg_list);
-	if (ret < 0) return ret;
+	ret = _starpu_mpi_task_build_v(comm, codelet, &task, &xrank, &descrs, &nb_data, &prio, varg_list);
+	if (ret < 0)
+		return ret;
 
 	if (ret == 0)
 	{
@@ -550,7 +583,7 @@ int _starpu_mpi_task_insert_v(MPI_Comm comm, struct starpu_codelet *codelet, va_
 		}
 	}
 
-	int val = _starpu_mpi_task_postbuild_v(comm, xrank, do_execute, descrs, nb_data);
+	int val = _starpu_mpi_task_postbuild_v(comm, xrank, do_execute, descrs, nb_data, prio);
 
 	if (ret == 0 && pre_submit_hook)
 		pre_submit_hook(task);
@@ -587,10 +620,10 @@ struct starpu_task *starpu_mpi_task_build(MPI_Comm comm, struct starpu_codelet *
 	int ret;
 
 	va_start(varg_list, codelet);
-	ret = _starpu_mpi_task_build_v(comm, codelet, &task, NULL, NULL, NULL, varg_list);
+	ret = _starpu_mpi_task_build_v(comm, codelet, &task, NULL, NULL, NULL, NULL, varg_list);
 	va_end(varg_list);
 	STARPU_ASSERT(ret >= 0);
-	if (ret > 0) return NULL; else return task;
+	return (ret > 0) ? NULL : task;
 }
 
 int starpu_mpi_task_post_build(MPI_Comm comm, struct starpu_codelet *codelet, ...)
@@ -600,17 +633,19 @@ int starpu_mpi_task_post_build(MPI_Comm comm, struct starpu_codelet *codelet, ..
 	va_list varg_list;
 	struct starpu_data_descr *descrs;
 	int nb_data;
+	int prio;
 
 	starpu_mpi_comm_rank(comm, &me);
 	starpu_mpi_comm_size(comm, &nb_nodes);
 
 	va_start(varg_list, codelet);
 	/* Find out whether we are to execute the data because we own the data to be written to. */
-	ret = _starpu_mpi_task_decode_v(codelet, me, nb_nodes, &xrank, &do_execute, &descrs, &nb_data, varg_list);
+	ret = _starpu_mpi_task_decode_v(codelet, me, nb_nodes, &xrank, &do_execute, &descrs, &nb_data, &prio, varg_list);
 	va_end(varg_list);
-	if (ret < 0) return ret;
+	if (ret < 0)
+		return ret;
 
-	return _starpu_mpi_task_postbuild_v(comm, xrank, do_execute, descrs, nb_data);
+	return _starpu_mpi_task_postbuild_v(comm, xrank, do_execute, descrs, nb_data, prio);
 }
 
 struct _starpu_mpi_redux_data_args
@@ -685,7 +720,7 @@ void _starpu_mpi_redux_data_recv_callback(void *callback_arg)
 
 /* TODO: this should rather be implicitly called by starpu_mpi_task_insert when
  * a data previously accessed in REDUX mode gets accessed in R mode. */
-void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
+void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle, int prio)
 {
 	int me, rank, tag, nb_nodes;
 
@@ -747,7 +782,8 @@ void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
 				args->taskB->cl = args->data_handle->redux_cl;
 				args->taskB->sequential_consistency = 0;
 				STARPU_TASK_SET_HANDLE(args->taskB, args->data_handle, 0);
-				taskBs[j] = args->taskB; j++;
+				taskBs[j] = args->taskB;
+				j++;
 
 				// Submit taskA
 				starpu_task_insert(&_starpu_mpi_redux_data_read_cl,
@@ -768,7 +804,7 @@ void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
 	else
 	{
 		_STARPU_MPI_DEBUG(1, "Sending redux handle to %d ...\n", rank);
-		starpu_mpi_isend_detached(data_handle, rank, tag, comm, NULL, NULL);
+		starpu_mpi_isend_detached_prio(data_handle, rank, tag, prio, comm, NULL, NULL);
 		starpu_task_insert(data_handle->init_cl, STARPU_W, data_handle, 0);
 	}
 	/* FIXME: In order to prevent simultaneous receive submissions
@@ -779,3 +815,7 @@ void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
 	starpu_task_wait_for_all();
 
 }
+void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
+{
+	return starpu_mpi_redux_data_prio(comm, data_handle, 0);
+}
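
The hunks above decode STARPU_PRIORITY into a prio value and thread it through _starpu_mpi_task_decode_v, _starpu_mpi_task_build_v and _starpu_mpi_task_postbuild_v, so the priority now also reaches the MPI transfers issued for the task's data, not only the task itself; they also special-case data registered with rank STARPU_MPI_PER_NODE (considered owned by every node), and introduce the starpu_mpi_redux_data_prio() variant. A minimal caller-side sketch, assuming hypothetical codelets cl, init_cl and redux_cl and arbitrary tag/rank/priority values:

	#include <starpu_mpi.h>

	/* Hypothetical codelets: a regular task codelet plus the reduction
	 * initialisation/accumulation codelets required for STARPU_REDUX. */
	extern struct starpu_codelet cl, init_cl, redux_cl;

	void submit_with_priority(void)
	{
		float v = 0.0f;
		starpu_data_handle_t h;

		starpu_variable_data_register(&h, STARPU_MAIN_RAM, (uintptr_t)&v, sizeof(v));
		starpu_mpi_data_register(h, 42 /* tag */, 0 /* owner rank */);
		starpu_data_set_reduction_methods(h, &redux_cl, &init_cl);

		/* STARPU_PRIORITY is now forwarded to the sends/receives that
		 * starpu_mpi_task_insert() issues for h, in addition to the task. */
		starpu_mpi_task_insert(MPI_COMM_WORLD, &cl,
				       STARPU_PRIORITY, 10,
				       STARPU_REDUX, h,
				       0);

		/* Flush the reduction with the same priority; the historical
		 * starpu_mpi_redux_data() now simply calls this with prio 0. */
		starpu_mpi_redux_data_prio(MPI_COMM_WORLD, h, 10);

		starpu_data_unregister(h);
	}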

+ 2 - 2
mpi/src/starpu_mpi_task_insert.h

@@ -23,8 +23,8 @@ extern "C"
 #endif
 
 int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int *do_execute, int *inconsistent_execute, int *xrank);
-void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int xrank, int do_execute, MPI_Comm comm);
-int _starpu_mpi_task_postbuild_v(MPI_Comm comm, int xrank, int do_execute, struct starpu_data_descr *descrs, int nb_data);
+void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int xrank, int do_execute, int prio, MPI_Comm comm);
+int _starpu_mpi_task_postbuild_v(MPI_Comm comm, int xrank, int do_execute, struct starpu_data_descr *descrs, int nb_data, int prio);
 
 #ifdef __cplusplus
 }

+ 35 - 18
mpi/src/starpu_mpi_task_insert_fortran.c

@@ -25,7 +25,7 @@
 
 #ifdef HAVE_MPI_COMM_F2C
 static
-int _fstarpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nodes, int *xrank, int *do_execute, struct starpu_data_descr **descrs_p, int *nb_data_p, void **arglist)
+int _fstarpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nodes, int *xrank, int *do_execute, struct starpu_data_descr **descrs_p, int *nb_data_p, int *prio_p, void **arglist)
 {
 	int arg_i = 0;
 	int inconsistent_execute = 0;
@@ -33,6 +33,7 @@ int _fstarpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_no
 	int nb_allocated_data = 16;
 	struct starpu_data_descr *descrs;
 	int nb_data;
+	int prio = 0;
 	int select_node_policy = STARPU_MPI_NODE_SELECTION_CURRENT_POLICY;
 
 	_STARPU_TRACE_TASK_MPI_DECODE_START();
@@ -194,6 +195,7 @@ int _fstarpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_no
 		}
 		else if (arg_type==STARPU_PRIORITY)
 		{
+			prio = *(int *)arglist[arg_i];
 			arg_i++;
 			/* int* */
 		}
@@ -289,30 +291,32 @@ int _fstarpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_no
 		// We need to find out which node is going to execute the codelet.
 		_STARPU_MPI_DISP("Different nodes are owning W data. The node to execute the codelet is going to be selected with the current selection node policy. See starpu_mpi_node_selection_set_current_policy() to change the policy, or use STARPU_EXECUTE_ON_NODE or STARPU_EXECUTE_ON_DATA to specify the node\n");
 		*xrank = _starpu_mpi_select_node(me, nb_nodes, descrs, nb_data, select_node_policy);
-		*do_execute = (me == *xrank);
+		*do_execute = *xrank == STARPU_MPI_PER_NODE || (me == *xrank);
 	}
 	else
 	{
 		_STARPU_MPI_DEBUG(100, "Inconsistent=%d - xrank=%d\n", inconsistent_execute, *xrank);
-		*do_execute = (me == *xrank);
+		*do_execute = *xrank == STARPU_MPI_PER_NODE || (me == *xrank);
 	}
 	_STARPU_MPI_DEBUG(100, "do_execute=%d\n", *do_execute);
 
 	*descrs_p = descrs;
 	*nb_data_p = nb_data;
+	*prio_p = prio;
 
 	_STARPU_TRACE_TASK_MPI_DECODE_END();
 	return 0;
 }
 
 static
-int _fstarpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, struct starpu_task **task, int *xrank_p, struct starpu_data_descr **descrs_p, int *nb_data_p, void **arglist)
+int _fstarpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, struct starpu_task **task, int *xrank_p, struct starpu_data_descr **descrs_p, int *nb_data_p, int *prio_p, void **arglist)
 {
 	int me, do_execute, xrank, nb_nodes;
 	int ret;
 	int i;
 	struct starpu_data_descr *descrs;
 	int nb_data;
+	int prio;
 
 	_STARPU_MPI_LOG_IN();
 
@@ -320,25 +324,34 @@ int _fstarpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, str
 	starpu_mpi_comm_size(comm, &nb_nodes);
 
 	/* Find out whether we are to execute the data because we own the data to be written to. */
-	ret = _fstarpu_mpi_task_decode_v(codelet, me, nb_nodes, &xrank, &do_execute, &descrs, &nb_data, arglist);
-	if (ret < 0) return ret;
+	ret = _fstarpu_mpi_task_decode_v(codelet, me, nb_nodes, &xrank, &do_execute, &descrs, &nb_data, &prio, arglist);
+	if (ret < 0)
+		return ret;
 
 	_STARPU_TRACE_TASK_MPI_PRE_START();
 	/* Send and receive data as requested */
 	for(i=0 ; i<nb_data ; i++)
 	{
-		_starpu_mpi_exchange_data_before_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, comm);
+		_starpu_mpi_exchange_data_before_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, prio, comm);
 	}
 
-	if (xrank_p) *xrank_p = xrank;
-	if (nb_data_p) *nb_data_p = nb_data;
+	if (xrank_p)
+		*xrank_p = xrank;
+	if (nb_data_p)
+		*nb_data_p = nb_data;
+	if (prio_p)
+		*prio_p = prio;
+
 	if (descrs_p)
 		*descrs_p = descrs;
 	else
 		free(descrs);
 	_STARPU_TRACE_TASK_MPI_PRE_END();
 
-	if (do_execute == 0) return 1;
+	if (do_execute == 0)
+	{
+		return 1;
+	}
 	else
 	{
 		_STARPU_MPI_DEBUG(100, "Execution of the codelet %p (%s)\n", codelet, codelet?codelet->name:NULL);
@@ -360,9 +373,11 @@ int _fstarpu_mpi_task_insert_v(MPI_Comm comm, struct starpu_codelet *codelet, vo
 	int do_execute = 0;
 	struct starpu_data_descr *descrs;
 	int nb_data;
+	int prio;
 
-	ret = _fstarpu_mpi_task_build_v(comm, codelet, &task, &xrank, &descrs, &nb_data, arglist);
-	if (ret < 0) return ret;
+	ret = _fstarpu_mpi_task_build_v(comm, codelet, &task, &xrank, &descrs, &nb_data, &prio, arglist);
+	if (ret < 0)
+		return ret;
 
 	if (ret == 0)
 	{
@@ -381,7 +396,7 @@ int _fstarpu_mpi_task_insert_v(MPI_Comm comm, struct starpu_codelet *codelet, vo
 			starpu_task_destroy(task);
 		}
 	}
-	return _starpu_mpi_task_postbuild_v(comm, xrank, do_execute, descrs, nb_data);
+	return _starpu_mpi_task_postbuild_v(comm, xrank, do_execute, descrs, nb_data, prio);
 }
 
 int fstarpu_mpi_task_insert(MPI_Fint comm, void ***_arglist)
@@ -411,9 +426,9 @@ struct starpu_task *fstarpu_mpi_task_build(MPI_Fint comm, void ***_arglist)
 	struct starpu_task *task;
 	int ret;
 
-	ret = _fstarpu_mpi_task_build_v(MPI_Comm_f2c(comm), codelet, &task, NULL, NULL, NULL, arglist+1);
+	ret = _fstarpu_mpi_task_build_v(MPI_Comm_f2c(comm), codelet, &task, NULL, NULL, NULL, NULL, arglist+1);
 	STARPU_ASSERT(ret >= 0);
-	if (ret > 0) return NULL; else return task;
+	return (ret > 0) ? NULL : task;
 }
 
 int fstarpu_mpi_task_post_build(MPI_Fint _comm, void ***_arglist)
@@ -429,15 +444,17 @@ int fstarpu_mpi_task_post_build(MPI_Fint _comm, void ***_arglist)
 	int ret, me, nb_nodes;
 	struct starpu_data_descr *descrs;
 	int nb_data;
+	int prio;
 
 	starpu_mpi_comm_rank(comm, &me);
 	starpu_mpi_comm_size(comm, &nb_nodes);
 
 	/* Find out whether we are to execute the data because we own the data to be written to. */
-	ret = _fstarpu_mpi_task_decode_v(codelet, me, nb_nodes, &xrank, &do_execute, &descrs, &nb_data, arglist);
-	if (ret < 0) return ret;
+	ret = _fstarpu_mpi_task_decode_v(codelet, me, nb_nodes, &xrank, &do_execute, &descrs, &nb_data, &prio, arglist);
+	if (ret < 0)
+		return ret;
 
-	return _starpu_mpi_task_postbuild_v(comm, xrank, do_execute, descrs, nb_data);
+	return _starpu_mpi_task_postbuild_v(comm, xrank, do_execute, descrs, nb_data, prio);
 }
 
 #endif /* HAVE_MPI_COMM_F2C */

+ 4 - 107
mpi/tests/Makefile.am

@@ -91,7 +91,7 @@ endif
 endif
 
 AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
-LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
+LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ ../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include -I$(top_srcdir)/mpi/include -I$(top_srcdir)/mpi/src -I$(top_srcdir)/src -I$(top_builddir)/src -I$(top_srcdir)/examples/
 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
 
@@ -126,7 +126,8 @@ starpu_mpi_TESTS +=				\
 	policy_register_many			\
 	policy_selection			\
 	policy_selection2			\
-	ring_async_implicit
+	ring_async_implicit			\
+	temporary
 
 if !STARPU_SIMGRID
 starpu_mpi_TESTS +=				\
@@ -182,6 +183,7 @@ noinst_PROGRAMS =				\
 	ring_sync_detached			\
 	ring_async				\
 	ring_async_implicit			\
+	temporary				\
 	block_interface				\
 	block_interface_pinned			\
 	cache					\
@@ -224,111 +226,6 @@ XFAIL_TESTS=					\
 	policy_unregister			\
 	starpu_redefine
 
-mpi_isend_LDADD =					\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-mpi_earlyrecv_LDADD =					\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-mpi_earlyrecv2_LDADD =					\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-mpi_earlyrecv2_sync_LDADD =					\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-mpi_irecv_LDADD =					\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-mpi_isend_detached_LDADD =			\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-mpi_irecv_detached_LDADD =			\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-mpi_detached_tag_LDADD =				\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-mpi_redux_LDADD =					\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-datatypes_LDADD =					\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-pingpong_LDADD =					\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-mpi_test_LDADD =					\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-ring_LDADD =					\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-ring_sync_LDADD =					\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-ring_sync_detached_LDADD =				\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-ring_async_LDADD =				\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-ring_async_implicit_LDADD =			\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-block_interface_LDADD =				\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-block_interface_pinned_LDADD =			\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-cache_LDADD =					\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-cache_disable_LDADD =					\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-callback_LDADD =				\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-matrix_LDADD =					\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-matrix2_LDADD =					\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-insert_task_LDADD =				\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-insert_task_compute_LDADD =				\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-insert_task_sent_cache_LDADD =				\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-insert_task_recv_cache_LDADD =				\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-insert_task_block_LDADD =				\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-insert_task_owner_LDADD =				\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-insert_task_owner2_LDADD =			\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-insert_task_owner_data_LDADD =			\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-insert_task_node_choice_LDADD =			\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-insert_task_count_LDADD =				\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-insert_task_dyn_handles_LDADD =				\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-multiple_send_LDADD =				\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-mpi_scatter_gather_LDADD =			\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-mpi_reduction_LDADD =			\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-user_defined_datatype_LDADD =			\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-tags_checking_LDADD =			\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-sync_LDADD =			\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-gather_LDADD =			\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-gather2_LDADD =			\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-policy_register_LDADD =			\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-policy_register_many_LDADD =			\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-policy_register_toomany_LDADD =			\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-policy_unregister_LDADD =			\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-policy_selection_LDADD =			\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-policy_selection2_LDADD =			\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-early_request_LDADD =					\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-starpu_redefine_LDADD =					\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-load_balancer_LDADD =					\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-
 ring_SOURCES = ring.c
 ring_sync_SOURCES = ring_sync.c
 ring_sync_detached_SOURCES = ring_sync_detached.c

+ 1 - 1
mpi/tests/block_interface.c

@@ -56,7 +56,7 @@ int main(int argc, char **argv)
 	 * their blocks. */
 
 	float *block = NULL;
-	starpu_data_handle_t block_handle;
+	starpu_data_handle_t block_handle = NULL;
 
 	if (rank == 0)
 	{

+ 2 - 2
mpi/tests/block_interface_pinned.c

@@ -56,8 +56,8 @@ int main(int argc, char **argv)
 	 * register it directly. Node 0 and 1 will then exchange the content of
 	 * their blocks. */
 
-	float *block;
-	starpu_data_handle_t block_handle;
+	float *block = NULL;
+	starpu_data_handle_t block_handle = NULL;
 
 	if (rank == 0)
 	{

+ 2 - 1
mpi/tests/cache.c

@@ -94,7 +94,8 @@ int main(int argc, char **argv)
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
 
-	if (starpu_mpi_cache_is_enabled() == 0) goto skip;
+	if (starpu_mpi_cache_is_enabled() == 0)
+		goto skip;
 
 	if (rank == 0)
 		starpu_variable_data_register(&data, STARPU_MAIN_RAM, (uintptr_t)&val, sizeof(unsigned));

+ 2 - 1
mpi/tests/cache_disable.c

@@ -58,7 +58,8 @@ int main(int argc, char **argv)
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
 
 	cache = starpu_mpi_cache_is_enabled();
-	if (cache == 0) goto skip;
+	if (cache == 0)
+		goto skip;
 
 	val = malloc(sizeof(*val));
 	*val = 12;

+ 11 - 6
mpi/tests/callback.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2013, 2014, 2015  CNRS
+ * Copyright (C) 2013, 2014, 2015, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -72,7 +72,8 @@ int main(int argc, char **argv)
 	int rank, size;
 
 	ret = starpu_initialize(NULL, &argc, &argv);
-	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	if (ret == -ENODEV)
+		return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	ret = starpu_mpi_init(&argc, &argv, 1);
@@ -87,7 +88,8 @@ int main(int argc, char **argv)
 				     0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_task_insert");
 
-	if (rank == 0) expected_x ++;
+	if (rank == 0)
+		expected_x ++;
 	ret = starpu_mpi_task_insert(MPI_COMM_WORLD,
 				     NULL,
 				     STARPU_EXECUTE_ON_NODE, 0,
@@ -96,7 +98,8 @@ int main(int argc, char **argv)
 				     0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
-	if (rank == 0) expected_x ++;
+	if (rank == 0)
+		expected_x ++;
 	STARPU_ASSERT_MSG(x == expected_x, "x should be equal to %d and not %d\n", expected_x, x);
 
 	ret = starpu_mpi_task_insert(MPI_COMM_WORLD,
@@ -107,7 +110,8 @@ int main(int argc, char **argv)
 				     0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
-	if (rank == 0) expected_y ++;
+	if (rank == 0)
+		expected_y ++;
 	ret = starpu_mpi_task_insert(MPI_COMM_WORLD,
 				     &my_codelet,
 				     STARPU_EXECUTE_ON_NODE, 0,
@@ -117,7 +121,8 @@ int main(int argc, char **argv)
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
 	starpu_task_wait_for_all();
-	if (rank == 0) expected_y ++;
+	if (rank == 0)
+		expected_y ++;
 	STARPU_ASSERT_MSG(y == expected_y, "y should be equal to %d and not %d\n", expected_y, y);
 
 	starpu_mpi_shutdown();

+ 327 - 180
mpi/tests/datatypes.c

@@ -20,178 +20,6 @@
 
 typedef void (*check_func)(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error);
 
-void check_void(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
-{
-	FPRINTF_MPI(stderr, "Success with void value\n");
-}
-
-void check_variable(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
-{
-	float *v_s, *v_r;
-
-	STARPU_ASSERT(starpu_variable_get_elemsize(handle_s) == starpu_variable_get_elemsize(handle_r));
-
-	v_s = (float *)starpu_variable_get_local_ptr(handle_s);
-	v_r = (float *)starpu_variable_get_local_ptr(handle_r);
-
-	if (*v_s == *v_r)
-	{
-		FPRINTF_MPI(stderr, "Success with variable value: %f == %f\n", *v_s, *v_r);
-	}
-	else
-	{
-		*error = 1;
-		FPRINTF_MPI(stderr, "Error with variable value: %f != %f\n", *v_s, *v_r);
-	}
-}
-
-void check_vector(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
-{
-	int i;
-	int nx;
-	int *v_r, *v_s;
-
-	STARPU_ASSERT(starpu_vector_get_elemsize(handle_s) == starpu_vector_get_elemsize(handle_r));
-	STARPU_ASSERT(starpu_vector_get_nx(handle_s) == starpu_vector_get_nx(handle_r));
-
-	nx = starpu_vector_get_nx(handle_r);
-	v_r = (int *)starpu_vector_get_local_ptr(handle_r);
-	v_s = (int *)starpu_vector_get_local_ptr(handle_s);
-
-	for(i=0 ; i<nx ; i++)
-	{
-		if (v_s[i] == v_r[i])
-		{
-			FPRINTF_MPI(stderr, "Success with vector[%d] value: %d == %d\n", i, v_s[i], v_r[i]);
-		}
-		else
-		{
-			*error = 1;
-			FPRINTF_MPI(stderr, "Error with vector[%d] value: %d != %d\n", i, v_s[i], v_r[i]);
-		}
-	}
-}
-
-void check_matrix(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
-{
-	STARPU_ASSERT(starpu_matrix_get_elemsize(handle_s) == starpu_matrix_get_elemsize(handle_r));
-	STARPU_ASSERT(starpu_matrix_get_nx(handle_s) == starpu_matrix_get_nx(handle_r));
-	STARPU_ASSERT(starpu_matrix_get_ny(handle_s) == starpu_matrix_get_ny(handle_r));
-	STARPU_ASSERT(starpu_matrix_get_local_ld(handle_s) == starpu_matrix_get_local_ld(handle_r));
-
-	char *matrix_s = (char *)starpu_matrix_get_local_ptr(handle_s);
-	char *matrix_r = (char *)starpu_matrix_get_local_ptr(handle_r);
-
-	int nx = starpu_matrix_get_nx(handle_s);
-	int ny = starpu_matrix_get_ny(handle_s);
-	int ldy = starpu_matrix_get_local_ld(handle_s);
-
-	int x, y;
-
-	for(y=0 ; y<ny ; y++)
-	{
-		for(x=0 ; x<nx ; x++)
-		{
-			int index=(y*ldy)+x;
-			if (matrix_s[index] == matrix_r[index])
-			{
-				FPRINTF_MPI(stderr, "Success with matrix[%d,%d --> %d] value: %c == %c\n", x, y, index, matrix_s[index], matrix_r[index]);
-			}
-			else
-			{
-				*error = 1;
-				FPRINTF_MPI(stderr, "Error with matrix[%d,%d --> %d] value: %c != %c\n", x, y, index, matrix_s[index], matrix_r[index]);
-			}
-		}
-	}
-}
-
-void check_block(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
-{
-	STARPU_ASSERT(starpu_block_get_elemsize(handle_s) == starpu_block_get_elemsize(handle_r));
-	STARPU_ASSERT(starpu_block_get_nx(handle_s) == starpu_block_get_nx(handle_r));
-	STARPU_ASSERT(starpu_block_get_ny(handle_s) == starpu_block_get_ny(handle_r));
-	STARPU_ASSERT(starpu_block_get_nz(handle_s) == starpu_block_get_nz(handle_r));
-	STARPU_ASSERT(starpu_block_get_local_ldy(handle_s) == starpu_block_get_local_ldy(handle_r));
-	STARPU_ASSERT(starpu_block_get_local_ldz(handle_s) == starpu_block_get_local_ldz(handle_r));
-
-	starpu_data_acquire(handle_s, STARPU_R);
-	starpu_data_acquire(handle_r, STARPU_R);
-
-	float *block_s = (float *)starpu_block_get_local_ptr(handle_s);
-	float *block_r = (float *)starpu_block_get_local_ptr(handle_r);
-
-	int nx = starpu_block_get_nx(handle_s);
-	int ny = starpu_block_get_ny(handle_s);
-	int nz = starpu_block_get_nz(handle_s);
-
-	int ldy = starpu_block_get_local_ldy(handle_s);
-	int ldz = starpu_block_get_local_ldz(handle_s);
-
-	int x, y, z;
-
-	for(z=0 ; z<nz ; z++)
-	{
-		for(y=0 ; y<ny ; y++)
-			for(x=0 ; x<nx ; x++)
-			{
-				int index=(z*ldz)+(y*ldy)+x;
-				if (block_s[index] == block_r[index])
-				{
-					FPRINTF_MPI(stderr, "Success with block[%d,%d,%d --> %d] value: %f == %f\n", x, y, z, index, block_s[index], block_r[index]);
-				}
-				else
-				{
-					*error = 1;
-					FPRINTF_MPI(stderr, "Error with block[%d,%d,%d --> %d] value: %f != %f\n", x, y, z, index, block_s[index], block_r[index]);
-				}
-			}
-	}
-
-	starpu_data_release(handle_s);
-	starpu_data_release(handle_r);
-}
-
-void check_bcsr(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
-{
-	STARPU_ASSERT(starpu_bcsr_get_elemsize(handle_s) == starpu_bcsr_get_elemsize(handle_r));
-	STARPU_ASSERT(starpu_bcsr_get_nnz(handle_s) == starpu_bcsr_get_nnz(handle_r));
-	STARPU_ASSERT(starpu_bcsr_get_nrow(handle_s) == starpu_bcsr_get_nrow(handle_r));
-	STARPU_ASSERT(starpu_bcsr_get_firstentry(handle_s) == starpu_bcsr_get_firstentry(handle_r));
-	STARPU_ASSERT(starpu_bcsr_get_r(handle_s) == starpu_bcsr_get_r(handle_r));
-	STARPU_ASSERT(starpu_bcsr_get_c(handle_s) == starpu_bcsr_get_c(handle_r));
-	//	STARPU_ASSERT(starpu_bcsr_get_local_colind(handle_s) == starpu_bcsr_get_local_colind(handle_r));
-	//	STARPU_ASSERT(starpu_bcsr_get_local_rowptr(handle_s) == starpu_bcsr_get_local_rowptr(handle_r));
-
-	starpu_data_acquire(handle_s, STARPU_R);
-	starpu_data_acquire(handle_r, STARPU_R);
-
-	int *bcsr_s = (int *)starpu_bcsr_get_local_nzval(handle_s);
-	int *bcsr_r = (int *)starpu_bcsr_get_local_nzval(handle_r);
-
-	int r = starpu_bcsr_get_r(handle_s);
-	int c = starpu_bcsr_get_c(handle_s);
-	int nnz = starpu_bcsr_get_nnz(handle_s);
-
-	int x;
-
-	for(x=0 ; x<r*c*nnz ; x++)
-	{
-		if (bcsr_s[x] == bcsr_r[x])
-		{
-			FPRINTF_MPI(stderr, "Success with bcsr[%d] value: %d == %d\n", x, bcsr_s[x], bcsr_r[x]);
-		}
-		else
-		{
-			*error = 1;
-			FPRINTF_MPI(stderr, "Error with bcsr[%d] value: %d != %d\n", x, bcsr_s[x], bcsr_r[x]);
-		}
-	}
-
-	starpu_data_release(handle_s);
-	starpu_data_release(handle_r);
-}
-
 void send_recv_and_check(int rank, int node, starpu_data_handle_t handle_s, int tag_s, starpu_data_handle_t handle_r, int tag_r, int *error, check_func func)
 {
 	int ret;
@@ -216,6 +44,14 @@ void send_recv_and_check(int rank, int node, starpu_data_handle_t handle_s, int
 	}
 }
 
+/*
+ * Void
+ */
+void check_void(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
+{
+	FPRINTF_MPI(stderr, "Success with void value\n");
+}
+
 void exchange_void(int rank, int *error)
 {
 	STARPU_SKIP_IF_VALGRIND;
@@ -240,6 +76,29 @@ void exchange_void(int rank, int *error)
 	}
 }
 
+/*
+ * Variable
+ */
+void check_variable(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
+{
+	float *v_s, *v_r;
+
+	STARPU_ASSERT(starpu_variable_get_elemsize(handle_s) == starpu_variable_get_elemsize(handle_r));
+
+	v_s = (float *)starpu_variable_get_local_ptr(handle_s);
+	v_r = (float *)starpu_variable_get_local_ptr(handle_r);
+
+	if (*v_s == *v_r)
+	{
+		FPRINTF_MPI(stderr, "Success with variable value: %f == %f\n", *v_s, *v_r);
+	}
+	else
+	{
+		*error = 1;
+		FPRINTF_MPI(stderr, "Error with variable value: %f != %f\n", *v_s, *v_r);
+	}
+}
+
 void exchange_variable(int rank, int *error)
 {
 	if (rank == 0)
@@ -263,6 +122,36 @@ void exchange_variable(int rank, int *error)
 	}
 }
 
+/*
+ * Vector
+ */
+void check_vector(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
+{
+	int i;
+	int nx;
+	int *v_r, *v_s;
+
+	STARPU_ASSERT(starpu_vector_get_elemsize(handle_s) == starpu_vector_get_elemsize(handle_r));
+	STARPU_ASSERT(starpu_vector_get_nx(handle_s) == starpu_vector_get_nx(handle_r));
+
+	nx = starpu_vector_get_nx(handle_r);
+	v_r = (int *)starpu_vector_get_local_ptr(handle_r);
+	v_s = (int *)starpu_vector_get_local_ptr(handle_s);
+
+	for(i=0 ; i<nx ; i++)
+	{
+		if (v_s[i] == v_r[i])
+		{
+			FPRINTF_MPI(stderr, "Success with vector[%d] value: %d == %d\n", i, v_s[i], v_r[i]);
+		}
+		else
+		{
+			*error = 1;
+			FPRINTF_MPI(stderr, "Error with vector[%d] value: %d != %d\n", i, v_s[i], v_r[i]);
+		}
+	}
+}
+
 void exchange_vector(int rank, int *error)
 {
 	if (rank == 0)
@@ -287,6 +176,43 @@ void exchange_vector(int rank, int *error)
 	}
 }
 
+/*
+ * Matrix
+ */
+void check_matrix(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
+{
+	STARPU_ASSERT(starpu_matrix_get_elemsize(handle_s) == starpu_matrix_get_elemsize(handle_r));
+	STARPU_ASSERT(starpu_matrix_get_nx(handle_s) == starpu_matrix_get_nx(handle_r));
+	STARPU_ASSERT(starpu_matrix_get_ny(handle_s) == starpu_matrix_get_ny(handle_r));
+	STARPU_ASSERT(starpu_matrix_get_local_ld(handle_s) == starpu_matrix_get_local_ld(handle_r));
+
+	char *matrix_s = (char *)starpu_matrix_get_local_ptr(handle_s);
+	char *matrix_r = (char *)starpu_matrix_get_local_ptr(handle_r);
+
+	int nx = starpu_matrix_get_nx(handle_s);
+	int ny = starpu_matrix_get_ny(handle_s);
+	int ldy = starpu_matrix_get_local_ld(handle_s);
+
+	int x, y;
+
+	for(y=0 ; y<ny ; y++)
+	{
+		for(x=0 ; x<nx ; x++)
+		{
+			int index=(y*ldy)+x;
+			if (matrix_s[index] == matrix_r[index])
+			{
+				FPRINTF_MPI(stderr, "Success with matrix[%d,%d --> %d] value: %c == %c\n", x, y, index, matrix_s[index], matrix_r[index]);
+			}
+			else
+			{
+				*error = 1;
+				FPRINTF_MPI(stderr, "Error with matrix[%d,%d --> %d] value: %c != %c\n", x, y, index, matrix_s[index], matrix_r[index]);
+			}
+		}
+	}
+}
+
 void exchange_matrix(int rank, int *error)
 {
 	int nx=3;
@@ -326,6 +252,55 @@ void exchange_matrix(int rank, int *error)
 	}
 }
 
+/*
+ * Block
+ */
+void check_block(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
+{
+	STARPU_ASSERT(starpu_block_get_elemsize(handle_s) == starpu_block_get_elemsize(handle_r));
+	STARPU_ASSERT(starpu_block_get_nx(handle_s) == starpu_block_get_nx(handle_r));
+	STARPU_ASSERT(starpu_block_get_ny(handle_s) == starpu_block_get_ny(handle_r));
+	STARPU_ASSERT(starpu_block_get_nz(handle_s) == starpu_block_get_nz(handle_r));
+	STARPU_ASSERT(starpu_block_get_local_ldy(handle_s) == starpu_block_get_local_ldy(handle_r));
+	STARPU_ASSERT(starpu_block_get_local_ldz(handle_s) == starpu_block_get_local_ldz(handle_r));
+
+	starpu_data_acquire(handle_s, STARPU_R);
+	starpu_data_acquire(handle_r, STARPU_R);
+
+	float *block_s = (float *)starpu_block_get_local_ptr(handle_s);
+	float *block_r = (float *)starpu_block_get_local_ptr(handle_r);
+
+	int nx = starpu_block_get_nx(handle_s);
+	int ny = starpu_block_get_ny(handle_s);
+	int nz = starpu_block_get_nz(handle_s);
+
+	int ldy = starpu_block_get_local_ldy(handle_s);
+	int ldz = starpu_block_get_local_ldz(handle_s);
+
+	int x, y, z;
+
+	for(z=0 ; z<nz ; z++)
+	{
+		for(y=0 ; y<ny ; y++)
+			for(x=0 ; x<nx ; x++)
+			{
+				int index=(z*ldz)+(y*ldy)+x;
+				if (block_s[index] == block_r[index])
+				{
+					FPRINTF_MPI(stderr, "Success with block[%d,%d,%d --> %d] value: %f == %f\n", x, y, z, index, block_s[index], block_r[index]);
+				}
+				else
+				{
+					*error = 1;
+					FPRINTF_MPI(stderr, "Error with block[%d,%d,%d --> %d] value: %f != %f\n", x, y, z, index, block_s[index], block_r[index]);
+				}
+			}
+	}
+
+	starpu_data_release(handle_s);
+	starpu_data_release(handle_r);
+}
+
 void exchange_block(int rank, int *error)
 {
 	int nx=3;
@@ -369,6 +344,79 @@ void exchange_block(int rank, int *error)
 	}
 }
 
+/*
+ * BCSR
+ */
+void check_bcsr(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
+{
+	STARPU_ASSERT(starpu_bcsr_get_elemsize(handle_s) == starpu_bcsr_get_elemsize(handle_r));
+	STARPU_ASSERT(starpu_bcsr_get_nnz(handle_s) == starpu_bcsr_get_nnz(handle_r));
+	STARPU_ASSERT(starpu_bcsr_get_nrow(handle_s) == starpu_bcsr_get_nrow(handle_r));
+	STARPU_ASSERT(starpu_bcsr_get_firstentry(handle_s) == starpu_bcsr_get_firstentry(handle_r));
+	STARPU_ASSERT(starpu_bcsr_get_r(handle_s) == starpu_bcsr_get_r(handle_r));
+	STARPU_ASSERT(starpu_bcsr_get_c(handle_s) == starpu_bcsr_get_c(handle_r));
+
+	starpu_data_acquire(handle_s, STARPU_R);
+	starpu_data_acquire(handle_r, STARPU_R);
+
+	uint32_t *colind_s = starpu_bcsr_get_local_colind(handle_s);
+	uint32_t *colind_r = starpu_bcsr_get_local_colind(handle_r);
+	uint32_t *rowptr_s = starpu_bcsr_get_local_rowptr(handle_s);
+	uint32_t *rowptr_r = starpu_bcsr_get_local_rowptr(handle_r);
+
+	int *bcsr_s = (int *)starpu_bcsr_get_local_nzval(handle_s);
+	int *bcsr_r = (int *)starpu_bcsr_get_local_nzval(handle_r);
+
+	int r = starpu_bcsr_get_r(handle_s);
+	int c = starpu_bcsr_get_c(handle_s);
+	int nnz = starpu_bcsr_get_nnz(handle_s);
+	int nrows = starpu_bcsr_get_nrow(handle_s);
+
+	int x;
+
+	for(x=0 ; x<nnz ; x++)
+	{
+		if (colind_s[x] == colind_r[x])
+		{
+			FPRINTF_MPI(stderr, "Success with colind[%d] value: %u == %u\n", x, colind_s[x], colind_r[x]);
+		}
+		else
+		{
+			*error = 1;
+			FPRINTF_MPI(stderr, "Error with colind[%d] value: %u != %u\n", x, colind_s[x], colind_r[x]);
+		}
+	}
+
+	for(x=0 ; x<nrows+1 ; x++)
+	{
+		if (rowptr_s[x] == rowptr_r[x])
+		{
+			FPRINTF_MPI(stderr, "Success with rowptr[%d] value: %u == %u\n", x, rowptr_s[x], rowptr_r[x]);
+		}
+		else
+		{
+			*error = 1;
+			FPRINTF_MPI(stderr, "Error with rowptr[%d] value: %u != %u\n", x, rowptr_s[x], rowptr_r[x]);
+		}
+	}
+
+	for(x=0 ; x<r*c*nnz ; x++)
+	{
+		if (bcsr_s[x] == bcsr_r[x])
+		{
+			FPRINTF_MPI(stderr, "Success with bcsr[%d] value: %d == %d\n", x, bcsr_s[x], bcsr_r[x]);
+		}
+		else
+		{
+			*error = 1;
+			FPRINTF_MPI(stderr, "Error with bcsr[%d] value: %d != %d\n", x, bcsr_s[x], bcsr_r[x]);
+		}
+	}
+
+	starpu_data_release(handle_s);
+	starpu_data_release(handle_r);
+}
+
 void exchange_bcsr(int rank, int *error)
 {
 	/*
@@ -383,23 +431,22 @@ void exchange_bcsr(int rank, int *error)
 	 *
 	 * nzval  = [0, 1, 2, 3] ++ [4, 5, 6, 7] ++ [8, 9, 10, 11]
 	 * colind = [0, 0, 1]
-	 * rowptr = [0, 1 ]
+	 * rowptr = [0, 1, 3]
 	 * r = c = 2
 	 */
 
 	/* Size of the blocks */
 #define BCSR_R 2
 #define BCSR_C 2
-#define BCSR_NROW 2
+#define BCSR_NROWS 2
 #define BCSR_NNZ_BLOCKS 3     /* out of 4 */
 #define BCSR_NZVAL_SIZE (BCSR_R*BCSR_C*BCSR_NNZ_BLOCKS)
 
-	uint32_t colind[BCSR_NNZ_BLOCKS] = {0, 0, 1};
-	uint32_t rowptr[BCSR_NROW] = {0, 1};
-
 	if (rank == 0)
 	{
 		starpu_data_handle_t bcsr_handle[2];
+		uint32_t colind[BCSR_NNZ_BLOCKS] = {0, 0, 1};
+		uint32_t rowptr[BCSR_NROWS+1] = {0, 1, BCSR_NNZ_BLOCKS};
 		int nzval[BCSR_NZVAL_SIZE]  =
 		{
 			0, 1, 2, 3,    /* First block  */
@@ -407,8 +454,8 @@ void exchange_bcsr(int rank, int *error)
 			8, 9, 10, 11   /* Third block  */
 		};
 
-		starpu_bcsr_data_register(&bcsr_handle[0], STARPU_MAIN_RAM, BCSR_NNZ_BLOCKS, BCSR_NROW, (uintptr_t) nzval, colind, rowptr, 0, BCSR_R, BCSR_C, sizeof(nzval[0]));
-		starpu_bcsr_data_register(&bcsr_handle[1], -1, BCSR_NNZ_BLOCKS, BCSR_NROW, (uintptr_t) NULL, colind, rowptr, 0, BCSR_R, BCSR_C, sizeof(nzval[0]));
+		starpu_bcsr_data_register(&bcsr_handle[0], STARPU_MAIN_RAM, BCSR_NNZ_BLOCKS, BCSR_NROWS, (uintptr_t) nzval, colind, rowptr, 0, BCSR_R, BCSR_C, sizeof(nzval[0]));
+		starpu_bcsr_data_register(&bcsr_handle[1], -1, BCSR_NNZ_BLOCKS, BCSR_NROWS, (uintptr_t) NULL, (uint32_t *) NULL, (uint32_t *) NULL, 0, BCSR_R, BCSR_C, sizeof(nzval[0]));
 
 		send_recv_and_check(rank, 1, bcsr_handle[0], 0x73, bcsr_handle[1], 0x8337, error, check_bcsr);
 
@@ -418,12 +465,111 @@ void exchange_bcsr(int rank, int *error)
 	else if (rank == 1)
 	{
 		starpu_data_handle_t bcsr_handle;
-		starpu_bcsr_data_register(&bcsr_handle, -1, BCSR_NNZ_BLOCKS, BCSR_NROW, (uintptr_t) NULL, colind, rowptr, 0, BCSR_R, BCSR_C, sizeof(int));
+		starpu_bcsr_data_register(&bcsr_handle, -1, BCSR_NNZ_BLOCKS, BCSR_NROWS, (uintptr_t) NULL, (uint32_t *) NULL, (uint32_t *) NULL, 0, BCSR_R, BCSR_C, sizeof(int));
 		send_recv_and_check(rank, 0, bcsr_handle, 0x73, NULL, 0x8337, NULL, NULL);
 		starpu_data_unregister(bcsr_handle);
 	}
 }
 
+/*
+ * CSR
+ */
+void check_csr(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
+{
+	STARPU_ASSERT(starpu_csr_get_elemsize(handle_s) == starpu_csr_get_elemsize(handle_r));
+	STARPU_ASSERT(starpu_csr_get_nnz(handle_s) == starpu_csr_get_nnz(handle_r));
+	STARPU_ASSERT(starpu_csr_get_nrow(handle_s) == starpu_csr_get_nrow(handle_r));
+	STARPU_ASSERT(starpu_csr_get_firstentry(handle_s) == starpu_csr_get_firstentry(handle_r));
+
+	starpu_data_acquire(handle_s, STARPU_R);
+	starpu_data_acquire(handle_r, STARPU_R);
+
+	uint32_t *colind_s = starpu_csr_get_local_colind(handle_s);
+	uint32_t *colind_r = starpu_csr_get_local_colind(handle_r);
+	uint32_t *rowptr_s = starpu_csr_get_local_rowptr(handle_s);
+	uint32_t *rowptr_r = starpu_csr_get_local_rowptr(handle_r);
+
+	int *csr_s = (int *)starpu_csr_get_local_nzval(handle_s);
+	int *csr_r = (int *)starpu_csr_get_local_nzval(handle_r);
+
+	int nnz = starpu_csr_get_nnz(handle_s);
+	int nrows = starpu_csr_get_nrow(handle_s);
+
+	int x;
+
+	for(x=0 ; x<nnz ; x++)
+	{
+		if (colind_s[x] == colind_r[x])
+		{
+			FPRINTF_MPI(stderr, "Success with colind[%d] value: %u == %u\n", x, colind_s[x], colind_r[x]);
+		}
+		else
+		{
+			*error = 1;
+			FPRINTF_MPI(stderr, "Error with colind[%d] value: %u != %u\n", x, colind_s[x], colind_r[x]);
+		}
+	}
+
+	for(x=0 ; x<nrows+1 ; x++)
+	{
+		if (rowptr_s[x] == rowptr_r[x])
+		{
+			FPRINTF_MPI(stderr, "Success with rowptr[%d] value: %u == %u\n", x, rowptr_s[x], rowptr_r[x]);
+		}
+		else
+		{
+			*error = 1;
+			FPRINTF_MPI(stderr, "Error with rowptr[%d] value: %u != %u\n", x, rowptr_s[x], rowptr_r[x]);
+		}
+	}
+
+	for(x=0 ; x<nnz ; x++)
+	{
+		if (csr_s[x] == csr_r[x])
+		{
+			FPRINTF_MPI(stderr, "Success with csr[%d] value: %d == %d\n", x, csr_s[x], csr_r[x]);
+		}
+		else
+		{
+			*error = 1;
+			FPRINTF_MPI(stderr, "Error with csr[%d] value: %d != %d\n", x, csr_s[x], csr_r[x]);
+		}
+	}
+
+	starpu_data_release(handle_s);
+	starpu_data_release(handle_r);
+}
+
+void exchange_csr(int rank, int *error)
+{
+	// the values are completely wrong, we just want to test that the communication is done correctly
+#define CSR_NROWS 2
+#define CSR_NNZ   5
+
+	if (rank == 0)
+	{
+		starpu_data_handle_t csr_handle[2];
+		uint32_t colind[CSR_NNZ] = {0, 1, 2, 3, 4};
+		uint32_t rowptr[CSR_NROWS+1] = {0, 1, CSR_NNZ};
+		int nzval[CSR_NNZ] = { 11, 22, 33, 44, 55 };
+
+		starpu_csr_data_register(&csr_handle[0], STARPU_MAIN_RAM, CSR_NNZ, CSR_NROWS, (uintptr_t) nzval, colind, rowptr, 0, sizeof(nzval[0]));
+		starpu_csr_data_register(&csr_handle[1], -1, CSR_NNZ, CSR_NROWS, (uintptr_t) NULL, (uint32_t *) NULL, (uint32_t *) NULL, 0, sizeof(nzval[0]));
+
+		send_recv_and_check(rank, 1, csr_handle[0], 0x84, csr_handle[1], 0x8765, error, check_csr);
+
+		starpu_data_unregister(csr_handle[0]);
+		starpu_data_unregister(csr_handle[1]);
+	}
+	else if (rank == 1)
+	{
+		starpu_data_handle_t csr_handle;
+		starpu_csr_data_register(&csr_handle, -1, CSR_NNZ, CSR_NROWS, (uintptr_t) NULL, (uint32_t *) NULL, (uint32_t *) NULL, 0, sizeof(int));
+		send_recv_and_check(rank, 0, csr_handle, 0x84, NULL, 0x8765, NULL, NULL);
+		starpu_data_unregister(csr_handle);
+	}
+}
+
 int main(int argc, char **argv)
 {
 	int ret, rank, size;
@@ -458,6 +604,7 @@ int main(int argc, char **argv)
 	exchange_matrix(rank, &error);
 	exchange_block(rank, &error);
 	exchange_bcsr(rank, &error);
+	exchange_csr(rank, &error);
 
 	starpu_mpi_shutdown();
 	starpu_shutdown();

+ 0 - 0
mpi/tests/insert_task_compute.c

