瀏覽代碼

Merge remote-tracking branch 'gitlab/master' into ft_checkpoint

Romain LION 4 年之前
父節點
當前提交
dd612b3ad2
共有 86 個文件被更改,包括 740 次插入337 次删除
  1. 38 51
      configure.ac
  2. 4 3
      doc/doxygen/chapters/370_online_performance_tools.doxy
  3. 4 4
      doc/doxygen/chapters/510_configure_options.doxy
  4. 65 0
      doc/doxygen_dev/dev/starpu_check_missing.sh
  5. 26 1
      doc/doxygen_dev/doxygen-config.cfg.in
  6. 2 0
      doc/doxygen_dev/doxygen.cfg
  7. 30 1
      doc/doxygen_dev/refman.tex
  8. 8 5
      examples/cholesky/cholesky_implicit.c
  9. 3 2
      min-dgels/Makefile.in
  10. 8 0
      min-dgels/base/F2CLIBS/libf2c/Makefile
  11. 0 1
      mpi/examples/Makefile.am
  12. 35 11
      mpi/examples/benchs/abstract_sendrecv_bench.c
  13. 1 1
      mpi/examples/benchs/abstract_sendrecv_bench.h
  14. 17 12
      mpi/examples/benchs/burst_helper.c
  15. 10 5
      mpi/examples/benchs/gemm_helper.c
  16. 23 7
      mpi/examples/benchs/sendrecv_bench.c
  17. 1 1
      mpi/examples/benchs/sendrecv_gemm_bench.c
  18. 10 7
      mpi/examples/benchs/sendrecv_parallel_tasks_bench.c
  19. 39 30
      mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
  20. 6 4
      mpi/src/load_balancer/policy/data_movements_interface.h
  21. 4 2
      mpi/src/load_balancer/policy/load_balancer_policy.h
  22. 11 9
      mpi/src/load_balancer/policy/load_data_interface.h
  23. 2 0
      mpi/src/mpi/starpu_mpi_comm.h
  24. 2 0
      mpi/src/mpi/starpu_mpi_driver.h
  25. 2 0
      mpi/src/mpi/starpu_mpi_early_data.h
  26. 2 0
      mpi/src/mpi/starpu_mpi_early_request.h
  27. 2 0
      mpi/src/mpi/starpu_mpi_mpi.h
  28. 3 1
      mpi/src/mpi/starpu_mpi_mpi_backend.h
  29. 2 0
      mpi/src/mpi/starpu_mpi_sync_data.h
  30. 2 0
      mpi/src/mpi/starpu_mpi_tag.h
  31. 2 0
      mpi/src/nmad/starpu_mpi_nmad.h
  32. 3 1
      mpi/src/nmad/starpu_mpi_nmad_backend.h
  33. 2 0
      mpi/src/nmad/starpu_mpi_nmad_unknown_datatype.h
  34. 2 0
      mpi/src/starpu_mpi_cache.h
  35. 2 0
      mpi/src/starpu_mpi_cache_stats.h
  36. 2 0
      mpi/src/starpu_mpi_datatype.h
  37. 2 0
      mpi/src/starpu_mpi_fxt.h
  38. 2 0
      mpi/src/starpu_mpi_init.h
  39. 8 3
      mpi/src/starpu_mpi_private.h
  40. 2 0
      mpi/src/starpu_mpi_select_node.h
  41. 2 0
      mpi/src/starpu_mpi_stats.h
  42. 2 0
      mpi/src/starpu_mpi_task_insert.h
  43. 5 3
      mpi/tests/pingpong.c
  44. 1 1
      src/common/fxt.h
  45. 29 14
      src/common/graph.h
  46. 4 6
      src/common/knobs.h
  47. 2 0
      src/common/list.h
  48. 4 5
      src/common/timing.h
  49. 2 0
      src/common/uthash.h
  50. 1 1
      src/common/utils.h
  51. 5 5
      src/core/debug.h
  52. 14 9
      src/core/dependencies/cg.h
  53. 14 12
      src/core/dependencies/tags.h
  54. 3 0
      src/core/perfmodel/multiple_regression.c
  55. 4 2
      src/core/perfmodel/perfmodel.h
  56. 16 16
      src/core/perfmodel/perfmodel_bus.c
  57. 1 1
      src/core/simgrid.h
  58. 6 3
      src/core/topology.h
  59. 6 6
      src/core/workers.h
  60. 3 1
      src/datawizard/coherency.h
  61. 8 4
      src/datawizard/copy_driver.h
  62. 2 1
      src/datawizard/data_request.h
  63. 9 0
      src/datawizard/filters.c
  64. 3 0
      src/datawizard/interfaces/data_interface.c
  65. 2 2
      src/datawizard/memalloc.h
  66. 3 3
      src/datawizard/memory_nodes.h
  67. 1 1
      src/datawizard/memstats.h
  68. 28 0
      src/datawizard/user_interactions.c
  69. 1 1
      src/drivers/mic/driver_mic_source.h
  70. 2 2
      src/drivers/mp_common/mp_common.h
  71. 30 25
      src/sched_policies/component_work_stealing.c
  72. 1 1
      src/sched_policies/helper_mct.h
  73. 1 1
      src/sched_policies/prio_deque.h
  74. 1 1
      src/sched_policies/sched_component.h
  75. 15 10
      src/sched_policies/work_stealing_policy.c
  76. 5 5
      src/util/starpu_clusters_create.h
  77. 2 1
      starpurm/examples/Makefile.am
  78. 29 26
      starpurm/src/starpurm_private.h
  79. 6 4
      tests/Makefile.am
  80. 45 0
      tests/datawizard/partition_wontuse.c
  81. 3 0
      tests/datawizard/temporary_partition.c
  82. 3 0
      tests/datawizard/temporary_partition_implicit.c
  83. 1 0
      tests/microbenchs/bandwidth.c
  84. 2 2
      tests/microbenchs/bandwidth_scheds.sh
  85. 1 0
      tools/Makefile.am
  86. 23 0
      tools/dev/valgrind/glpk.suppr

+ 38 - 51
configure.ac

@@ -151,8 +151,18 @@ AC_ARG_WITH(simgrid-lib-dir,
 	], [simgrid_lib_dir=no])
 
 if test x$enable_simgrid = xyes ; then
-	PKG_CHECK_MODULES([SIMGRID], [simgrid])
+	PKG_CHECK_MODULES([SIMGRID], [simgrid], [], [:])
 
+	if test "$simgrid_include_dir" != "no" ; then
+	   	SIMGRID_CFLAGS="$SIMGRID_CFLAGS -I$simgrid_include_dir"
+	fi
+	if test "$simgrid_lib_dir" != "no" ; then
+	   	SIMGRID_LIBS="$SIMGRID_LIBS -L$simgrid_lib_dir"
+	fi
+	if test "$simgrid_dir" != "no" ; then
+	   	SIMGRID_CFLAGS="$SIMGRID_CFLAGS -I$simgrid_dir/include"
+	   	SIMGRID_LIBS="$SIMGRID_LIBS -L$simgrid_dir/lib"
+	fi
    	if test -n "$SIMGRID_CFLAGS" ; then
 	   	CFLAGS="$SIMGRID_CFLAGS $CFLAGS"
 	   	CXXFLAGS="$SIMGRID_CFLAGS $CXXFLAGS"
@@ -161,20 +171,6 @@ if test x$enable_simgrid = xyes ; then
 	if test -n "$SIMGRID_LIBS" ; then
 		LDFLAGS="$SIMGRID_LIBS $LDFLAGS"
 	fi
-	if test "$simgrid_dir" != "no" ; then
-	   	CFLAGS="-I$simgrid_dir/include $CFLAGS"
-	   	CXXFLAGS="-I$simgrid_dir/include $CXXFLAGS"
-	   	NVCCFLAGS="-I$simgrid_dir/include $NVCCFLAGS"
-	   	LDFLAGS="-L$simgrid_dir/lib $LDFLAGS"
-	fi
-	if test "$simgrid_include_dir" != "no" ; then
-	   	CFLAGS="-I$simgrid_include_dir $CFLAGS"
-	   	CXXFLAGS="-I$simgrid_include_dir $CXXFLAGS"
-	   	NVCCFLAGS="-I$simgrid_include_dir $NVCCFLAGS"
-	fi
-	if test "$simgrid_lib_dir" != "no" ; then
-	   	LDFLAGS="-L$simgrid_lib_dir $LDFLAGS"
-	fi
 	AC_HAVE_LIBRARY([simgrid], [],
 		[
 			AC_MSG_ERROR(Simgrid support needs simgrid installed)
@@ -232,9 +228,9 @@ if test x$enable_simgrid = xyes ; then
 		LIBS="$LIBS -lstdc++"
 	fi
 
-	case \ $CXXFLAGS\  in 
+	case \ $CXXFLAGS\  in
 	*\ -std=*\ *) ;;
-	*) 
+	*)
 		AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
 				  #ifdef STARPU_HAVE_SIMGRID_MSG_H
 				  #include <simgrid/msg.h>
@@ -771,7 +767,10 @@ if test x"$enable_native_winthreads" = xyes ; then
 		AC_DEFINE(STARPU_NATIVE_WINTHREADS,[1],[Using native windows threads]),
 		AC_MSG_ERROR([pthread_create unavailable]))
 else
-    AC_CHECK_LIB([pthread], [pthread_create])
+    AC_CHECK_LIB([pthread], [pthread_create], [
+        LIBS="$LIBS -lpthread"
+        STARPU_EXPORTED_LIBS="$STARPU_EXPORTED_LIBS -lpthread"
+    ])
 fi
 
 AC_SEARCH_LIBS([sqrt],[m],,AC_MSG_ERROR([math library unavailable]))
@@ -1332,9 +1331,9 @@ if test x$enable_cuda = xyes; then
 	STARPU_CUFFT_LDFLAGS="-lcufft"
 
 	AC_LANG_PUSH([C++])
-	case \ $NVCCFLAGS\  in 
+	case \ $NVCCFLAGS\  in
 	*\ -std=*\ *) ;;
-	*) 
+	*)
 		SAVED_CXX="$CXX"
 		CXX="$NVCC"
 		AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
@@ -1357,7 +1356,7 @@ if test x$enable_cuda = xyes; then
 
 	#in case this is a 64bit setup, we tell nvcc to use a -m64 flag, if missing from existing flags
 	if test x$SIZEOF_VOID_P = x8; then
-		case \ $NVCCFLAGS\  in 
+		case \ $NVCCFLAGS\  in
 			*\ -m64\ *) ;;
 			*) NVCCFLAGS="${NVCCFLAGS} -m64" ;;
 		esac
@@ -2739,7 +2738,7 @@ if test "x$use_mpi_master_slave" = "xyes" ; then
       AC_MSG_ERROR([MPI Master-Slave and SOCL can not be used at the same time !])
    fi
    if test "x$enable_socl" = "xmaybe" ; then
-     enable_socl=no 
+     enable_socl=no
    fi
 fi
 
@@ -3031,15 +3030,9 @@ AC_SUBST(BLAS_LIB,$blas_lib)
 #			 Multiple linear regression			      #
 #                                                                             #
 ###############################################################################
-if test x$enable_simgrid = xyes ; then
-	# There is no need for building mlr models in simgrid mode
-	default_enable_mlr=no
-else
-	default_enable_mlr=yes
-fi
-AC_ARG_ENABLE(mlr, [AS_HELP_STRING([--disable-mlr],
-			[Disable multiple linear regression models])],
-			enable_mlr=$enableval, enable_mlr=$default_enable_mlr)
+AC_ARG_ENABLE(mlr, [AS_HELP_STRING([--enable-mlr],
+			[Enable multiple linear regression models])],
+			enable_mlr=$enableval, enable_mlr=no)
 AC_ARG_ENABLE(mlr-system-blas, [AS_HELP_STRING([--enable-mlr-system-blas],
 			[Make the multiple linear regression models use the system BLAS instead of min-dgels])],
 			enable_mlr_blas=$enableval, enable_mlr_blas=no)
@@ -3053,11 +3046,11 @@ if test x$enable_mlr = xyes -a "$starpu_windows" != "yes" ; then
 	if test x$blas_lib = xnone ; then
 	   use_system_lapack=no
 	fi
-	if test x$enable_mlr_blas = xyes -a test x$use_system_lapack = xyes; then
+	if test x$enable_mlr_blas = xyes -a x$use_system_lapack = xyes; then
 	   	AC_DEFINE(STARPU_MLR_MODEL, [1], [use reflapack library])
 		LDFLAGS="-llapack $LDFLAGS"
 	else
-		if test x$enable_mlr_blas=xyes -a test x$blas_lib = xmkl; then
+		# Keep spaces around "=": without them test sees one non-empty word and always succeeds.
+		if test x$enable_mlr_blas = xyes -a x$blas_lib = xmkl; then
 		   	AC_DEFINE(STARPU_MLR_MODEL, [1], [use mkl library])
 		else
 			AC_MSG_CHECKING(whether min-dgels is linked)
@@ -3074,10 +3067,6 @@ if test x$enable_mlr = xyes -a "$starpu_windows" != "yes" ; then
 					install_min_dgels=no
 					support_mlr=no
 				else
-					if test ! -d $PWD/min-dgels; then
-						cp -r $srcdir/min-dgels $PWD/
-						chmod +rwX -R $PWD/min-dgels
-					fi
 					AC_MSG_RESULT(yes)
 					DGELS_LIBS="-Wl,--start-group $STARPU_BUILD_DIR/min-dgels/build/minlibblas.a $STARPU_BUILD_DIR/min-dgels/build/minlibdgels.a $STARPU_BUILD_DIR/min-dgels/build/minlibf2c.a -Wl,--end-group"
 					AC_DEFINE(STARPU_MLR_MODEL, [1], [use user defined library])
@@ -3315,9 +3304,9 @@ then
 			[AS_HELP_STRING([--with-dlb-include-dir=<path>],
 			[specify where DLB headers are installed])],
 			[dlb_inc_dirs="$withval"], [dlb_inc_dirs=""])
-	
+
 		dlb_inc_dirs="${dlb_inc_dirs} /usr/include/dlb"
-	
+
 		dlb_incdir_found=no
 		for dlb_incdir in $dlb_inc_dirs
 		do
@@ -3338,18 +3327,18 @@ then
 				unset ac_cv_header_dlb_h
 			fi
 		done
-	
+
 		AC_ARG_WITH(dlb-lib-dir,
 			[AS_HELP_STRING([--with-dlb-lib-dir=<path>],
 			[specify where DLB libraries are installed])],
 			[dlb_lib_dirs="$withval"], [dlb_lib_dirs=""])
-	
+
 		dlb_lib_dirs="${dlb_lib_dirs} /usr/lib/dlb"
-	
+
 		dlb_libdir_found=no
 		for dlb_libdir in $dlb_lib_dirs
 		do
-			if test -n "$dlb_libdir" 
+			if test -n "$dlb_libdir"
 			then
 				SAVED_LDFLAGS="${LDFLAGS}"
 				LDFLAGS=-L${dlb_libdir}
@@ -3366,7 +3355,7 @@ then
 				unset ac_cv_lib_dlb_DLB_Init
 			fi
 		done
-	
+
 		SAVED_CPPFLAGS="${CPPFLAGS}"
 		SAVED_CFLAGS="${CFLAGS}"
 		SAVED_LDFLAGS="${LDFLAGS}"
@@ -3378,7 +3367,7 @@ then
 		CPPFLAGS="$SAVED_CPPFLAGS"
 		CFLAGS="$SAVED_CFLAGS"
 		LIBS="$SAVED_LIBS"
-	
+
 		SAVED_CPPFLAGS="${CPPFLAGS}"
 		SAVED_CFLAGS="${CFLAGS}"
 		SAVED_LDFLAGS="${LDFLAGS}"
@@ -3394,13 +3383,13 @@ then
 		CPPFLAGS="$SAVED_CPPFLAGS"
 		CFLAGS="$SAVED_CFLAGS"
 		LIBS="$SAVED_LIBS"
-	
+
 		if test "x$dlb_incdir_found" != "xyes" -o "x$dlb_libdir_found" != "xyes"
 		then
 			enable_dlb=no
 		fi
 	fi
-	
+
 	AC_MSG_CHECKING(whether DLB support should be enabled)
 	AC_MSG_RESULT($enable_dlb)
 	if test "x$enable_dlb" != "xno"
@@ -3430,8 +3419,6 @@ AC_ARG_ENABLE(starpurm-examples, [AS_HELP_STRING([--enable-starpurm-examples],
 			enable_starpurm_examples=$enableval, enable_starpurm_examples=no)
 AM_CONDITIONAL(STARPU_BUILD_STARPURM_EXAMPLES, [test x$enable_starpurm_examples = xyes])
 
-
-
 ##########################################
 # Documentation                          #
 ##########################################
@@ -3523,7 +3510,7 @@ if test "x$enable_shared" = xno; then
         # No .so, so applications will unexpectedly have to know which -l to
         # use. Give them in .pc file.
 	AC_DEFINE(STARPU_STATIC_ONLY, [1], [Only static compilation was made])
-	STARPU_EXPORTED_LIBS="$LDFLAGS $LIBS $LIBSTARPU_LDFLAGS"
+	STARPU_EXPORTED_LIBS="$STARPU_EXPORTED_LIBS $LDFLAGS $LIBS $LIBSTARPU_LDFLAGS"
 fi
 AC_SUBST(STARPU_EXPORTED_LIBS)
 
@@ -3732,7 +3719,7 @@ AC_MSG_NOTICE([
 	       MPI test suite:                                $running_mpi_check
 	       Master-Slave MPI enabled:                      $use_mpi_master_slave
 	       FFT Support:                                   $fft_support
-	       Resource Management enable:                    $starpurm_support
+	       Resource Management enabled:                   $starpurm_support
 	       OpenMP runtime support enabled:                $enable_openmp
 	       Cluster support enabled:                       $enable_cluster
 	       SOCL enabled:                                  $build_socl

+ 4 - 3
doc/doxygen/chapters/370_online_performance_tools.doxy

@@ -366,7 +366,8 @@ are computed automatically by the StarPU at the end of the execution, using leas
 squares method of the <c>dgels_</c> LAPACK function.
 
 <c>examples/mlr/mlr.c</c> example provides more details on
-the usage of ::STARPU_MULTIPLE_REGRESSION_BASED models.
+the usage of ::STARPU_MULTIPLE_REGRESSION_BASED models. The \ref enable-mlr
+"--enable-mlr" configure option needs to be set to calibrate the model.
 
 Coefficients computation is done at the end of the execution, and the
 results are stored in standard codelet perfmodel files. Additional
@@ -379,8 +380,8 @@ executions. By default StarPU uses a lightweight dgels implementation, but the
 \ref enable-mlr-system-blas "--enable-mlr-system-blas" configure option can be
 used to make StarPU use a system-provided dgels BLAS.
 
-Additionally, when multiple linear regression models are
-disabled (using \ref disable-mlr "--disable-mlr" configure option) or when the
+Additionally, when multiple linear regression models are not enabled through 
+\ref enable-mlr "--enable-mlr" or when the
 <c>model->combinations</c> are not defined, StarPU will still write
 output files into <c>.starpu/sampling/codelets/tmp/</c> to allow
 performing an analysis. This analysis typically aims at finding the

+ 4 - 4
doc/doxygen/chapters/510_configure_options.doxy

@@ -761,11 +761,11 @@ of this parameter must be in [0..100]. The default value of
 this parameter is 10. Experimental.
 </dd>
 
-<dt>--disable-mlr</dt>
+<dt>--enable-mlr</dt>
 <dd>
-\anchor disable-mlr
-\addindex __configure__--disable-mlr
-Allow to disable multiple linear regression models (see \ref PerformanceModelExample)
+\anchor enable-mlr
+\addindex __configure__--enable-mlr
+Enable multiple linear regression models (see \ref PerformanceModelExample)
 </dd>
 
 <dt>--enable-mlr-system-blas</dt>

+ 65 - 0
doc/doxygen_dev/dev/starpu_check_missing.sh

@@ -0,0 +1,65 @@
+#!/bin/bash
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+stcolor=$(tput sgr0)
+redcolor=$(tput setaf 1)
+greencolor=$(tput setaf 2)
+
+dirname=$(realpath $(dirname $0))
+
+ok()
+{
+    type=$1
+    name=$2
+    echo "$type ${greencolor}${name}${stcolor} is in doxygen-config.cfg.in"
+}
+
+ko()
+{
+    type=$1
+    name=$2
+    #echo "$type ${redcolor}${name}${stcolor} is missing from doxygen-config.cfg.in"
+    echo $name
+}
+
+for d in src mpi/src starpurm/src
+do
+    cd $dirname/../../../$d
+    for f in $(find -name "*.h")
+    do
+	ff=$(echo $f | cut -b3-)
+	x=$(grep -c $ff $dirname/../doxygen-config.cfg.in)
+	if test "$x" == "0"
+	then
+	    ko file $d/$ff
+	#else
+	#    ok file $d/$ff
+	fi
+    done
+done
+
+cd $dirname/../../../build/doc/doxygen_dev/latex
+for f in $(find -name "*8h.tex")
+do
+    ff=$(basename $(echo $f | cut -b3-) ".tex")
+    x=$(grep -c $ff refman.tex)
+    if test "$x" == "0"
+    then
+	ko file $ff
+    fi
+done
+

+ 26 - 1
doc/doxygen_dev/doxygen-config.cfg.in

@@ -99,7 +99,32 @@ INPUT                  = @top_srcdir@/doc/doxygen_dev/chapters         \
 			 @top_srcdir@/src/core/errorcheck.h \
 			 @top_srcdir@/src/core/progress_hook.h \
 			 @top_srcdir@/src/core/drivers.h \
-			 @top_srcdir@/src/core/workers.h
+			 @top_srcdir@/src/core/workers.h \
+			 @top_srcdir@/mpi/src/starpu_mpi_init.h \
+			 @top_srcdir@/mpi/src/starpu_mpi_datatype.h \
+			 @top_srcdir@/mpi/src/starpu_mpi_task_insert.h \
+			 @top_srcdir@/mpi/src/starpu_mpi_select_node.h \
+			 @top_srcdir@/mpi/src/starpu_mpi_fxt.h \
+			 @top_srcdir@/mpi/src/starpu_mpi_cache.h \
+			 @top_srcdir@/mpi/src/mpi/starpu_mpi_mpi_backend.h \
+			 @top_srcdir@/mpi/src/mpi/starpu_mpi_driver.h \
+			 @top_srcdir@/mpi/src/mpi/starpu_mpi_early_data.h \
+			 @top_srcdir@/mpi/src/mpi/starpu_mpi_comm.h \
+			 @top_srcdir@/mpi/src/mpi/starpu_mpi_tag.h \
+			 @top_srcdir@/mpi/src/mpi/starpu_mpi_mpi.h \
+			 @top_srcdir@/mpi/src/mpi/starpu_mpi_early_request.h \
+			 @top_srcdir@/mpi/src/mpi/starpu_mpi_sync_data.h \
+			 @top_srcdir@/mpi/src/load_balancer/policy/load_data_interface.h \
+			 @top_srcdir@/mpi/src/load_balancer/policy/load_balancer_policy.h \
+			 @top_srcdir@/mpi/src/load_balancer/policy/data_movements_interface.h \
+			 @top_srcdir@/mpi/src/nmad/starpu_mpi_nmad_unknown_datatype.h \
+			 @top_srcdir@/mpi/src/nmad/starpu_mpi_nmad_backend.h \
+			 @top_srcdir@/mpi/src/nmad/starpu_mpi_nmad.h \
+			 @top_srcdir@/mpi/src/starpu_mpi_stats.h \
+			 @top_srcdir@/mpi/src/starpu_mpi_private.h \
+			 @top_srcdir@/mpi/src/starpu_mpi_cache_stats.h \
+			 @top_srcdir@/starpurm/src/starpurm_private.h
+
 
 EXAMPLE_PATH           = @top_srcdir@/doc/doxygen_dev \
 		       	 @top_srcdir@/doc/doxygen/chapters

+ 2 - 0
doc/doxygen_dev/doxygen.cfg

@@ -1621,6 +1621,8 @@ PREDEFINED             = STARPU_USE_OPENCL=1 \
 			 STARPU_USE_SC_HYPERVISOR=1 \
 			 STARPU_SIMGRID=1 \
 			 STARPU_OPENMP=1 \
+			 STARPU_USE_MPI_MPI=1 \
+			 STARPU_USE_MPI_NMAD=1 \
                          __GCC__
 
 # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then

+ 30 - 1
doc/doxygen_dev/refman.tex

@@ -76,7 +76,7 @@ Documentation License”.
 \chapter{File Index}
 \input{files}
 
-\chapter{File Documentation}
+\chapter{StarPU File Documentation}
 \input{barrier_8h}
 \input{barrier__counter_8h}
 \input{bound_8h}
@@ -158,6 +158,35 @@ Documentation License”.
 \input{timing_8h}
 \input{topology_8h}
 \input{utils_8h}
+\input{uthash_8h}
 \input{write__back_8h}
 
+\chapter{StarPU MPI File Documentation}
+\input{starpu__mpi__cache_8h}
+\input{starpu__mpi__driver_8h}
+\input{starpu__mpi__init_8h}
+\input{starpu__mpi__nmad__backend_8h}
+\input{starpu__mpi__stats_8h}
+\input{starpu__mpi__cache__stats_8h}
+\input{starpu__mpi__early__data_8h}
+\input{starpu__mpi__mpi_8h}
+\input{starpu__mpi__nmad__unknown__datatype_8h}
+\input{starpu__mpi__sync__data_8h}
+\input{starpu__mpi__comm_8h}
+\input{starpu__mpi__early__request_8h}
+\input{starpu__mpi__mpi__backend_8h}
+\input{starpu__mpi__private_8h}
+\input{starpu__mpi__tag_8h}
+\input{starpu__mpi__datatype_8h}
+\input{starpu__mpi__fxt_8h}
+\input{starpu__mpi__nmad_8h}
+\input{starpu__mpi__select__node_8h}
+\input{starpu__mpi__task__insert_8h}
+\input{load__balancer__policy_8h}
+\input{load__data__interface_8h}
+\input{data__movements__interface_8h}
+
+\chapter{StarPU Resource Manager File Documentation}
+\input{starpurm__private_8h}
+
 \end{document}

+ 8 - 5
examples/cholesky/cholesky_implicit.c

@@ -206,6 +206,14 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 {
 	float *mat = NULL;
 
+	/*
+	 * Create a simple symmetric positive-definite matrix example
+	 *
+	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
+	 *
+	 * and make it better conditioned by adding one on the diagonal.
+	 */
+
 #ifndef STARPU_SIMGRID
 	unsigned m,n;
 	starpu_malloc_flags((void **)&mat, (size_t)size*size*sizeof(float), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
@@ -324,11 +332,6 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 
 int main(int argc, char **argv)
 {
-	/* create a simple definite positive symetric matrix example
-	 *
-	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
-	 * */
-
 #ifdef STARPU_HAVE_MAGMA
 	magma_init();
 #endif

+ 3 - 2
min-dgels/Makefile.in

@@ -7,10 +7,10 @@ ADDITIONAL=additional
 
 all:
 	mkdir -p build
-	[ -d "$(CLAPACK)" ] || cp -a $(srcdir)/$(CLAPACK) .
+	[ -d "$(CLAPACK)" ] || ( cp -a $(srcdir)/$(CLAPACK) . ; chmod -R +rwX $(CLAPACK) )
 	cd $(CLAPACK) && $(MAKE) blaslib CC="$(CC)" LD="$(LD)"
 	cd $(CLAPACK) && $(MAKE) f2clib CC="$(CC)" LD="$(LD)"
-	[ -d "$(ADDITIONAL)" ] || cp -a $(srcdir)/$(ADDITIONAL) .
+	[ -d "$(ADDITIONAL)" ] || ( cp -a $(srcdir)/$(ADDITIONAL) . ; chmod -R +rwX $(ADDITIONAL) )
 	cd $(ADDITIONAL) && $(CC) -c -fPIC *.c && ar cr ../build/minlibdgels.a *.o && ranlib ../build/minlibdgels.a
 
 install:
@@ -25,6 +25,7 @@ clean:
 	rm -rf build *~
 
 distclean: clean
+	[ -f Makefile.in ] || rm -fr $(CLAPACK) $(ADDITIONAL)
 
 # This part is needed by StarPU
 

+ 8 - 0
min-dgels/base/F2CLIBS/libf2c/Makefile

@@ -175,6 +175,14 @@ xwsne.o:	fio.h
 xwsne.o:	lio.h
 xwsne.o:	fmt.h
 
+main.o:		signal1.h
+signal_.o:	signal1.h
+s_paus.o:	signal1.h
+
+err.o:		sysdep1.h
+fio.h:		sysdep1.h
+util.c:		sysdep1.h
+
 arith.h: arithchk.c
 	$(CC) $(CFLAGS) -DNO_FPINIT arithchk.c -lm ||\
 	 $(CC) -DNO_LONG_LONG $(CFLAGS) -DNO_FPINIT arithchk.c -lm

+ 0 - 1
mpi/examples/Makefile.am

@@ -466,7 +466,6 @@ benchs_sendrecv_bench_SOURCES += benchs/abstract_sendrecv_bench.c
 
 benchs_sendrecv_parallel_tasks_bench_SOURCES = benchs/sendrecv_parallel_tasks_bench.c
 benchs_sendrecv_parallel_tasks_bench_SOURCES += benchs/bench_helper.c
-benchs_sendrecv_parallel_tasks_bench_SOURCES += benchs/abstract_sendrecv_bench.c
 
 benchs_burst_SOURCES = benchs/burst.c
 benchs_burst_SOURCES += benchs/burst_helper.c

+ 35 - 11
mpi/examples/benchs/abstract_sendrecv_bench.c

@@ -19,9 +19,12 @@
 
 
 
-void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
+void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier, int bidir)
 {
 	uint64_t iterations = LOOPS_DEFAULT;
+	uint64_t s = 0;
+	uint64_t j = 0;
+	uint64_t k = 0;
 
 	if (mpi_rank >= 2)
 	{
@@ -31,13 +34,13 @@ void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
 			STARPU_PTHREAD_BARRIER_WAIT(thread_barrier);
 		}
 
-		for (uint64_t s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
+		for (s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
 		{
 			iterations = bench_nb_iterations(iterations, s);
 
 			starpu_mpi_barrier(MPI_COMM_WORLD);
 
-			for (uint64_t j = 0; j < iterations; j++)
+			for (j = 0; j < iterations; j++)
 			{
 				starpu_mpi_barrier(MPI_COMM_WORLD);
 			}
@@ -59,6 +62,7 @@ void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
 	float* vector_recv = NULL;
 	double t1, t2, global_tstart, global_tend;
 	double* lats = malloc(sizeof(double) * LOOPS_DEFAULT);
+	starpu_mpi_req send_req, recv_req;
 
 	if (thread_barrier != NULL)
 	{
@@ -66,7 +70,7 @@ void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
 	}
 
 	global_tstart = starpu_timing_now();
-	for (uint64_t s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
+	for (s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
 	{
 		vector_send = malloc(s);
 		vector_recv = malloc(s);
@@ -80,23 +84,43 @@ void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
 
 		starpu_mpi_barrier(MPI_COMM_WORLD);
 
-		for (uint64_t j = 0; j < iterations; j++)
+		for (j = 0; j < iterations; j++)
 		{
 			if (mpi_rank == 0)
 			{
 				t1 = starpu_timing_now();
-				starpu_mpi_send(handle_send, 1, 0, MPI_COMM_WORLD);
-				starpu_mpi_recv(handle_recv, 1, 1, MPI_COMM_WORLD, NULL);
+				if (bidir)
+				{
+					starpu_mpi_isend(handle_send, &send_req, 1, 0, MPI_COMM_WORLD);
+					starpu_mpi_irecv(handle_recv, &recv_req, 1, 1, MPI_COMM_WORLD);
+					starpu_mpi_wait(&send_req, MPI_STATUS_IGNORE);
+					starpu_mpi_wait(&recv_req, MPI_STATUS_IGNORE);
+				}
+				else
+				{
+					starpu_mpi_send(handle_send, 1, 0, MPI_COMM_WORLD);
+					starpu_mpi_recv(handle_recv, 1, 1, MPI_COMM_WORLD, NULL);
+				}
 				t2 = starpu_timing_now();
 
-				const double t = (t2 -t1) / 2;
+				const double t = (t2 - t1) / 2;
 
 				lats[j] = t;
 			}
 			else
 			{
-				starpu_mpi_recv(handle_recv, 0, 0, MPI_COMM_WORLD, NULL);
-				starpu_mpi_send(handle_send, 0, 1, MPI_COMM_WORLD);
+				if (bidir)
+				{
+					starpu_mpi_irecv(handle_recv, &recv_req, 0, 0, MPI_COMM_WORLD);
+					starpu_mpi_isend(handle_send, &send_req, 0, 1, MPI_COMM_WORLD);
+					starpu_mpi_wait(&recv_req, MPI_STATUS_IGNORE);
+					starpu_mpi_wait(&send_req, MPI_STATUS_IGNORE);
+				}
+				else
+				{
+					starpu_mpi_recv(handle_recv, 0, 0, MPI_COMM_WORLD, NULL);
+					starpu_mpi_send(handle_send, 0, 1, MPI_COMM_WORLD);
+				}
 			}
 
 			starpu_mpi_barrier(MPI_COMM_WORLD);
@@ -113,7 +137,7 @@ void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
 			const double d9_lat = lats[9 * (iterations - 1) / 10];
 			double avg_lat = 0.0;
 
-			for(uint64_t k = 0; k < iterations; k++)
+			for(k = 0; k < iterations; k++)
 			{
 				avg_lat += lats[k];
 			}

+ 1 - 1
mpi/examples/benchs/abstract_sendrecv_bench.h

@@ -17,4 +17,4 @@
 #include <starpu.h>
 
 
-void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier);
+void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier, int bidir);

+ 17 - 12
mpi/examples/benchs/burst_helper.c

@@ -45,7 +45,8 @@ void burst_init_data(int rank)
 		recv_reqs = malloc(burst_nb_requests * sizeof(starpu_mpi_req));
 		send_reqs = malloc(burst_nb_requests * sizeof(starpu_mpi_req));
 
-		for (int i = 0; i < burst_nb_requests; i++)
+		int i = 0;
+		for (i = 0; i < burst_nb_requests; i++)
 		{
 			send_buffers[i] = malloc(NX_ARRAY * sizeof(float));
 			memset(send_buffers[i], 0, NX_ARRAY * sizeof(float));
@@ -62,7 +63,8 @@ void burst_free_data(int rank)
 {
 	if (rank == 0 || rank == 1)
 	{
-		for (int i = 0; i < burst_nb_requests; i++)
+		int i = 0;
+		for (i = 0; i < burst_nb_requests; i++)
 		{
 			starpu_data_unregister(send_handles[i]);
 			free(send_buffers[i]);
@@ -84,12 +86,13 @@ void burst_free_data(int rank)
 void burst_bidir(int rank)
 {
 	int other_rank = (rank == 0) ? 1 : 0;
+	int i = 0;
 
 	FPRINTF(stderr, "Simultaneous....start (rank %d)\n", rank);
 
 	if (rank == 0 || rank == 1)
 	{
-		for (int i = 0; i < burst_nb_requests; i++)
+		for (i = 0; i < burst_nb_requests; i++)
 		{
 			recv_reqs[i] = NULL;
 			starpu_mpi_irecv(recv_handles[i], &recv_reqs[i], other_rank, i, MPI_COMM_WORLD);
@@ -100,13 +103,13 @@ void burst_bidir(int rank)
 
 	if (rank == 0 || rank == 1)
 	{
-		for (int i = 0; i < burst_nb_requests; i++)
+		for (i = 0; i < burst_nb_requests; i++)
 		{
 			send_reqs[i] = NULL;
 			starpu_mpi_isend_prio(send_handles[i], &send_reqs[i], other_rank, i, i, MPI_COMM_WORLD);
 		}
 
-		for (int i = 0; i < burst_nb_requests; i++)
+		for (i = 0; i < burst_nb_requests; i++)
 		{
 			if (recv_reqs[i]) starpu_mpi_wait(&recv_reqs[i], MPI_STATUS_IGNORE);
 			if (send_reqs[i]) starpu_mpi_wait(&send_reqs[i], MPI_STATUS_IGNORE);
@@ -120,10 +123,11 @@ void burst_bidir(int rank)
 void burst_unidir(int sender, int receiver, int rank)
 {
 	FPRINTF(stderr, "%d -> %d... start (rank %d)\n", sender, receiver, rank);
+	int i = 0;
 
 	if (rank == receiver)
 	{
-		for (int i = 0; i < burst_nb_requests; i++)
+		for (i = 0; i < burst_nb_requests; i++)
 		{
 			recv_reqs[i] = NULL;
 			starpu_mpi_irecv(recv_handles[i], &recv_reqs[i], sender, i, MPI_COMM_WORLD);
@@ -134,7 +138,7 @@ void burst_unidir(int sender, int receiver, int rank)
 
 	if (rank == sender)
 	{
-		for (int i = 0; i < burst_nb_requests; i++)
+		for (i = 0; i < burst_nb_requests; i++)
 		{
 			send_reqs[i] = NULL;
 			starpu_mpi_isend_prio(send_handles[i], &send_reqs[i], receiver, i, i, MPI_COMM_WORLD);
@@ -143,7 +147,7 @@ void burst_unidir(int sender, int receiver, int rank)
 
 	if (rank == sender || rank == receiver)
 	{
-		for (int i = 0; i < burst_nb_requests; i++)
+		for (i = 0; i < burst_nb_requests; i++)
 		{
 			if (rank != sender && recv_reqs[i]) starpu_mpi_wait(&recv_reqs[i], MPI_STATUS_IGNORE);
 			if (rank == sender && send_reqs[i]) starpu_mpi_wait(&send_reqs[i], MPI_STATUS_IGNORE);
@@ -160,12 +164,13 @@ void burst_bidir_half_postponed(int rank)
 {
 	int other_rank = (rank == 0) ? 1 : 0;
 	int received = 0;
+	int i = 0;
 
 	FPRINTF(stderr, "Half/half burst...start (rank %d)\n", rank);
 
 	if (rank == 0 || rank == 1)
 	{
-		for (int i = 0; i < burst_nb_requests; i++)
+		for (i = 0; i < burst_nb_requests; i++)
 		{
 			recv_reqs[i] = NULL;
 			starpu_mpi_irecv(recv_handles[i], &recv_reqs[i], other_rank, i, MPI_COMM_WORLD);
@@ -176,7 +181,7 @@ void burst_bidir_half_postponed(int rank)
 
 	if (rank == 0 || rank == 1)
 	{
-		for (int i = 0; i < (burst_nb_requests / 2); i++)
+		for (i = 0; i < (burst_nb_requests / 2); i++)
 		{
 			send_reqs[i] = NULL;
 			starpu_mpi_isend_prio(send_handles[i], &send_reqs[i], other_rank, i, i, MPI_COMM_WORLD);
@@ -184,13 +189,13 @@ void burst_bidir_half_postponed(int rank)
 
 		if (recv_reqs[burst_nb_requests / 4]) starpu_mpi_wait(&recv_reqs[burst_nb_requests / 4], MPI_STATUS_IGNORE);
 
-		for (int i = (burst_nb_requests / 2); i < burst_nb_requests; i++)
+		for (i = (burst_nb_requests / 2); i < burst_nb_requests; i++)
 		{
 			send_reqs[i] = NULL;
 			starpu_mpi_isend_prio(send_handles[i], &send_reqs[i], other_rank, i, i, MPI_COMM_WORLD);
 		}
 
-		for (int i = 0; i < burst_nb_requests; i++)
+		for (i = 0; i < burst_nb_requests; i++)
 		{
 			if (recv_reqs[i]) starpu_mpi_wait(&recv_reqs[i], MPI_STATUS_IGNORE);
 			if (send_reqs[i]) starpu_mpi_wait(&send_reqs[i], MPI_STATUS_IGNORE);

+ 10 - 5
mpi/examples/benchs/gemm_helper.c

@@ -98,8 +98,9 @@ static void cpu_init_matrix_random(void *descr[], void *arg)
 	TYPE *subB = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
 	unsigned nx = STARPU_MATRIX_GET_NX(descr[0]);
 	unsigned ny = STARPU_MATRIX_GET_NY(descr[0]);
+	unsigned i = 0;
 
-	for (unsigned i = 0; i < nx *ny; i++)
+	for (i = 0; i < nx *ny; i++)
 	{
 		subA[i] = (TYPE) (starpu_drand48());
 		subB[i] = (TYPE) (starpu_drand48());
@@ -113,8 +114,9 @@ static void cpu_init_matrix_zero(void *descr[], void *arg)
 	TYPE *subA = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
 	unsigned nx = STARPU_MATRIX_GET_NX(descr[0]);
 	unsigned ny = STARPU_MATRIX_GET_NY(descr[0]);
+	unsigned i = 0;
 
-	for (unsigned i = 0; i < nx *ny; i++)
+	for (i = 0; i < nx *ny; i++)
 	{
 		subA[i] = (TYPE) (0);
 	}
@@ -290,18 +292,21 @@ void gemm_add_polling_dependencies()
 {
 	starpu_tag_t nb_tasks = (starpu_tag_t) nslices * (starpu_tag_t) nslices;
 	unsigned nb_workers = starpu_worker_get_count();
+	starpu_tag_t synchro_tag = 0;
+	starpu_tag_t previous_tag = 0;
+	starpu_tag_t next_tag = 0;
 
-	for (starpu_tag_t synchro_tag = nb_workers+1; synchro_tag <= nb_tasks; synchro_tag += (nb_workers+1))
+	for (synchro_tag = nb_workers+1; synchro_tag <= nb_tasks; synchro_tag += (nb_workers+1))
 	{
 		// this synchro tag depends on tasks of previous column of tasks:
-		for (starpu_tag_t previous_tag = synchro_tag - nb_workers; previous_tag < synchro_tag; previous_tag++)
+		for (previous_tag = synchro_tag - nb_workers; previous_tag < synchro_tag; previous_tag++)
 		{
 			starpu_tag_declare_deps(synchro_tag, 1, previous_tag);
 		}
 
 		// tasks of the next column of tasks depend on this synchro tag:
 		// this actually allows workers to poll for new tasks, while no task is available
-		for (starpu_tag_t next_tag = synchro_tag+1; next_tag < (synchro_tag + nb_workers + 1) && next_tag <= nb_tasks; next_tag++)
+		for (next_tag = synchro_tag+1; next_tag < (synchro_tag + nb_workers + 1) && next_tag <= nb_tasks; next_tag++)
 		{
 			starpu_tag_declare_deps(next_tag, 1, synchro_tag);
 		}

+ 23 - 7
mpi/examples/benchs/sendrecv_bench.c

@@ -16,6 +16,8 @@
 /*
  * Basic send receive benchmark.
  * Inspired a lot from NewMadeleine examples/benchmarks/nm_bench_sendrecv.c
+ *
+ * The option --bidir is available to do full-duplex communications.
  */
 
 #include <starpu_mpi.h>
@@ -23,13 +25,25 @@
 #include "abstract_sendrecv_bench.h"
 
 
+/* Print the command-line options on stderr, then terminate with the given exit status. */
+static inline void man(int status)
+{
+	fprintf(stderr, "Options:\n");
+	fprintf(stderr, "\t-h --help   display this help\n");
+	fprintf(stderr, "\t-p          pause workers during benchmark\n");
+	fprintf(stderr, "\t--bidir     full-duplex communications\n");
+	exit(status);
+}
+
 int main(int argc, char **argv)
 {
 	int ret, rank, worldsize;
 	int pause_workers = 0;
+	int i = 0;
+	int bidir = 0;
 
 
-	for (int i = 1; i < argc; i++)
+	for (i = 1; i < argc; i++)
 	{
 		if (strcmp(argv[i], "-p") == 0)
 		{
@@ -38,15 +52,17 @@ int main(int argc, char **argv)
 		}
 		else if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
 		{
-			fprintf(stderr, "Options:\n");
-			fprintf(stderr, "\t-h --help   display this help\n");
-			fprintf(stderr, "\t-p          pause workers during benchmark\n");
-			exit(EXIT_SUCCESS);
+			man(EXIT_SUCCESS);
+		}
+		else if (strcmp(argv[i], "--bidir") == 0)
+		{
+			bidir = 1;
+			printf("Communications will be full-duplex.\n");
 		}
 		else
 		{
 			fprintf(stderr,"Unrecognized option %s\n", argv[i]);
-			exit(EXIT_FAILURE);
+			man(EXIT_FAILURE);
 		}
 	}
 
@@ -74,7 +90,7 @@ int main(int argc, char **argv)
 		starpu_pause();
 	}
 
-	sendrecv_bench(rank, NULL);
+	sendrecv_bench(rank, NULL, bidir);
 
 	if (pause_workers)
 	{

+ 1 - 1
mpi/examples/benchs/sendrecv_gemm_bench.c

@@ -56,7 +56,7 @@ static void* comm_thread_func(void* arg)
 		fprintf(stderr, "[%s] No core was available for the comm thread. You should increase STARPU_RESERVE_NCPU or decrease STARPU_NCPU\n", hostname);
 	}
 
-	sendrecv_bench(mpi_rank, &thread_barrier);
+	sendrecv_bench(mpi_rank, &thread_barrier, /* half-duplex communications */ 0);
 
 	return NULL;
 }

+ 10 - 7
mpi/examples/benchs/sendrecv_parallel_tasks_bench.c

@@ -34,7 +34,6 @@
 #include <starpu_mpi.h>
 #include "helper.h"
 #include "bench_helper.h"
-#include "abstract_sendrecv_bench.h"
 
 #define NB_WARMUP_PINGPONGS 10
 
@@ -56,6 +55,8 @@ void cpu_task(void* descr[], void* args)
 	double t1, t2;
 	int asked_worker;
 	int current_worker = starpu_worker_get_id();
+	uint64_t j = 0;
+	uint64_t k = 0;
 
 	starpu_codelet_unpack_args(args, &mpi_rank, &asked_worker, &s, &handle_send, &handle_recv);
 
@@ -64,7 +65,7 @@ void cpu_task(void* descr[], void* args)
 	iterations = bench_nb_iterations(iterations, s);
 	double* lats = malloc(sizeof(double) * iterations);
 
-	for (uint64_t j = 0; j < NB_WARMUP_PINGPONGS; j++)
+	for (j = 0; j < NB_WARMUP_PINGPONGS; j++)
 	{
 		if (mpi_rank == 0)
 		{
@@ -78,7 +79,7 @@ void cpu_task(void* descr[], void* args)
 		}
 	}
 
-	for (uint64_t j = 0; j < iterations; j++)
+	for (j = 0; j < iterations; j++)
 	{
 		if (mpi_rank == 0)
 		{
@@ -107,7 +108,7 @@ void cpu_task(void* descr[], void* args)
 		const double d9_lat = lats[9 * (iterations - 1) / 10];
 		double avg_lat = 0.0;
 
-		for(uint64_t k = 0; k < iterations; k++)
+		for(k = 0; k < iterations; k++)
 		{
 			avg_lat += lats[k];
 		}
@@ -167,6 +168,8 @@ int main(int argc, char **argv)
 	unsigned cpu_count = starpu_cpu_worker_get_count();
 	unsigned* mpi_tags = malloc(cpu_count * sizeof(unsigned));
 	unsigned tag = 0;
+	uint64_t s = 0;
+	unsigned i = 0;
 
 	int* workers = malloc(cpu_count * sizeof(int));
 	float** vectors_send = malloc(cpu_count * sizeof(float*));
@@ -174,11 +177,11 @@ int main(int argc, char **argv)
 	starpu_data_handle_t* handles_send = malloc(cpu_count * sizeof(starpu_data_handle_t));
 	starpu_data_handle_t* handles_recv = malloc(cpu_count * sizeof(starpu_data_handle_t));
 
-	for (uint64_t s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
+	for (s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
 	{
 		starpu_pause();
 
-		for (unsigned i = 0; i < cpu_count; i++)
+		for (i = 0; i < cpu_count; i++)
 		{
 			workers[i] = i;
 			vectors_send[i] = malloc(s);
@@ -201,7 +204,7 @@ int main(int argc, char **argv)
 		starpu_resume();
 		starpu_task_wait_for_all();
 
-		for (unsigned i = 0; i < cpu_count; i++)
+		for (i = 0; i < cpu_count; i++)
 		{
 			starpu_data_unregister(handles_send[i]);
 			starpu_data_unregister(handles_recv[i]);

+ 39 - 30
mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c

@@ -198,7 +198,7 @@ static void run_cholesky(starpu_data_handle_t **data_handles, int rank, int node
 /* TODO: generate from compiler polyhedral analysis of classical algorithm */
 static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, int nodes)
 {
-	unsigned k, m, n, i;
+	unsigned k, m, n;
 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
 	unsigned nn = size/nblocks;
 
@@ -209,8 +209,36 @@ static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, i
 	{
 		starpu_iteration_push(n);
 
+		/* First handle the diagonal block */
 		/* Row */
-		for (m = n; m<nblocks; m++)
+		m = n;
+
+		for (k = 0; k < n; k++)
+		{
+			/* Accumulate updates from TRSMs */
+			starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
+					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+					       STARPU_R, data_handles[n][k],
+					       STARPU_R, data_handles[m][k],
+					       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+					       STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
+					       0);
+
+			/* Nobody else will need it */
+			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[m][k]);
+			starpu_data_wont_use(data_handles[m][k]);
+		}
+
+		k = n;
+		/* Factorize */
+		starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
+				       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
+				       STARPU_RW, data_handles[k][k],
+				       STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
+				       0);
+
+		/* Row */
+		for (m = n + 1; m<nblocks; m++)
 		{
 			for (k = 0; k < n; k++)
 			{
@@ -223,34 +251,15 @@ static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, i
 						       STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
 						       0);
 
-				if (m == n)
-				{
-					/* Nobody else will need it */
-					starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[m][k]);
-					starpu_data_wont_use(data_handles[m][k]);
-				}
 			}
 			k = n;
-			if (m > n)
-			{
-				/* non-diagonal block, solve */
-				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
-						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
-						       STARPU_R, data_handles[k][k],
-						       STARPU_RW, data_handles[m][k],
-						       STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
-						       0);
-			}
-			else
-			{
-				/* diagonal block, factorize */
-				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
-						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
-						       STARPU_RW, data_handles[k][k],
-						       STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
-						       0);
-			}
-
+			/* Solve */
+			starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
+					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+					       STARPU_R, data_handles[k][k],
+					       STARPU_RW, data_handles[m][k],
+					       STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
+					       0);
 		}
 
 		/* We won't need it any more */
@@ -268,8 +277,8 @@ static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, i
 /* TODO: generate from compiler polyhedral analysis of classical algorithm */
 static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int rank, int nodes)
 {
-	unsigned a, c;
-	unsigned k, m, n, i;
+	unsigned a;
+	unsigned k, m, n;
 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
 	unsigned nn = size/nblocks;
 

+ 6 - 4
mpi/src/load_balancer/policy/data_movements_interface.h

@@ -16,17 +16,19 @@
 
 #include <starpu.h>
 
+/** @file */
+
 #ifndef __DATA_MOVEMENTS_INTERFACE_H
 #define __DATA_MOVEMENTS_INTERFACE_H
 
-/* interface for data_movements */
+/** interface for data_movements */
 struct data_movements_interface
 {
-	/* Data tags table */
+	/** Data tags table */
 	int *tags;
-	/* Ranks table (where to move the corresponding data) */
+	/** Ranks table (where to move the corresponding data) */
 	int *ranks;
-	/* Size of the tables */
+	/** Size of the tables */
 	int size;
 };
 

+ 4 - 2
mpi/src/load_balancer/policy/load_balancer_policy.h

@@ -19,12 +19,14 @@
 
 #include <starpu_mpi_lb.h>
 
+/** @file */
+
 #ifdef __cplusplus
 extern "C"
 {
 #endif
 
-/* A load balancer consists in a collection of operations on a data
+/** A load balancer consists in a collection of operations on a data
  * representing the load of the application (in terms of computation, memory,
  * whatever). StarPU allows several entry points for the user. The load
  * balancer allows the user to give its load balancing methods to be used on
@@ -36,7 +38,7 @@ struct load_balancer_policy
 	void (*submitted_task_entry_point)();
 	void (*finished_task_entry_point)();
 
-	/* Name of the load balancing policy. The selection of the load balancer is
+	/** Name of the load balancing policy. The selection of the load balancer is
 	 * performed through the use of the STARPU_MPI_LB=name environment
 	 * variable.
 	 */

+ 11 - 9
mpi/src/load_balancer/policy/load_data_interface.h

@@ -16,29 +16,31 @@
 
 #include <starpu.h>
 
+/** @file */
+
 #ifndef __LOAD_DATA_INTERFACE_H
 #define __LOAD_DATA_INTERFACE_H
 
-/* interface for load_data */
+/** interface for load_data */
 struct load_data_interface
 {
-	/* Starting time of the execution */
+	/** Starting time of the execution */
 	double start;
-	/* Elapsed time until the start time and the time when event "launch a load
+	/** Elapsed time between the start time and the time when event "launch a load
 	 * balancing phase" is triggered */
 	double elapsed_time;
-	/* Current submission phase, i.e how many balanced steps have already
+	/** Current submission phase, i.e how many balanced steps have already
 	 * happened so far. */
 	int phase;
-	/* Number of currently submitted tasks */
+	/** Number of currently submitted tasks */
 	int nsubmitted_tasks;
-	/* Number of currently finished tasks */
+	/** Number of currently finished tasks */
 	int nfinished_tasks;
-	/* Task threshold to sleep the submission thread */
+	/** Task threshold to sleep the submission thread */
 	int sleep_task_threshold;
-	/* Task threshold to wake-up the submission thread */
+	/** Task threshold to wake-up the submission thread */
 	int wakeup_task_threshold;
-	/* Ratio of submitted tasks to wait for completion before waking up the
+	/** Ratio of submitted tasks to wait for completion before waking up the
 	 * submission thread */
 	double wakeup_ratio;
 };

+ 2 - 0
mpi/src/mpi/starpu_mpi_comm.h

@@ -25,6 +25,8 @@
 
 #include <mpi/starpu_mpi_mpi_backend.h>
 
+/** @file */
+
 #ifdef __cplusplus
 extern "C"
 {

+ 2 - 0
mpi/src/mpi/starpu_mpi_driver.h

@@ -19,6 +19,8 @@
 
 #include <starpu.h>
 
+/** @file */
+
 #ifdef STARPU_USE_MPI_MPI
 
 #ifdef __cplusplus

+ 2 - 0
mpi/src/mpi/starpu_mpi_early_data.h

@@ -25,6 +25,8 @@
 #include <common/uthash.h>
 #include <starpu_mpi_private.h>
 
+/** @file */
+
 #ifdef STARPU_USE_MPI_MPI
 
 #ifdef __cplusplus

+ 2 - 0
mpi/src/mpi/starpu_mpi_early_request.h

@@ -23,6 +23,8 @@
 #include <common/config.h>
 #include <common/list.h>
 
+/** @file */
+
 #ifdef STARPU_USE_MPI_MPI
 
 #ifdef __cplusplus

+ 2 - 0
mpi/src/mpi/starpu_mpi_mpi.h

@@ -23,6 +23,8 @@
 #include <common/config.h>
 #include <common/list.h>
 
+/** @file */
+
 #ifdef STARPU_USE_MPI_MPI
 
 #ifdef __cplusplus

+ 3 - 1
mpi/src/mpi/starpu_mpi_mpi_backend.h

@@ -20,6 +20,8 @@
 #include <common/config.h>
 #include <common/uthash.h>
 
+/** @file */
+
 #ifdef __cplusplus
 extern "C"
 {
@@ -61,7 +63,7 @@ struct _starpu_mpi_req_backend
 	starpu_pthread_cond_t req_cond;
 	starpu_pthread_mutex_t posted_mutex;
 	starpu_pthread_cond_t posted_cond;
-	/* In the case of a Wait/Test request, we are going to post a request
+	/** In the case of a Wait/Test request, we are going to post a request
 	 * to test the completion of another request */
 	struct _starpu_mpi_req *other_request;
 

+ 2 - 0
mpi/src/mpi/starpu_mpi_sync_data.h

@@ -23,6 +23,8 @@
 #include <common/config.h>
 #include <common/list.h>
 
+/** @file */
+
 #ifdef STARPU_USE_MPI_MPI
 
 #ifdef __cplusplus

+ 2 - 0
mpi/src/mpi/starpu_mpi_tag.h

@@ -21,6 +21,8 @@
 #include <stdlib.h>
 #include <mpi.h>
 
+/** @file */
+
 #ifdef STARPU_USE_MPI_MPI
 
 #ifdef __cplusplus

+ 2 - 0
mpi/src/nmad/starpu_mpi_nmad.h

@@ -23,6 +23,8 @@
 #include <common/config.h>
 #include <common/list.h>
 
+/** @file */
+
 #ifdef STARPU_USE_MPI_NMAD
 
 #ifdef __cplusplus

+ 3 - 1
mpi/src/nmad/starpu_mpi_nmad_backend.h

@@ -19,6 +19,8 @@
 
 #include <common/config.h>
 
+/** @file */
+
 #ifdef __cplusplus
 extern "C"
 {
@@ -38,7 +40,7 @@ struct _starpu_mpi_req_backend
 	piom_cond_t req_cond;
 	nm_sr_request_t size_req;
 
-	// When datatype is unknown:
+	/** When datatype is unknown */
 	struct nm_data_s unknown_datatype_body;
 	struct nm_data_s unknown_datatype_data;
 	struct nm_data_s unknown_datatype_size;

+ 2 - 0
mpi/src/nmad/starpu_mpi_nmad_unknown_datatype.h

@@ -19,6 +19,8 @@
 
 #include <common/config.h>
 
+/** @file */
+
 #ifdef __cplusplus
 extern "C"
 {

+ 2 - 0
mpi/src/starpu_mpi_cache.h

@@ -21,6 +21,8 @@
 #include <stdlib.h>
 #include <mpi.h>
 
+/** @file */
+
 #ifdef __cplusplus
 extern "C"
 {

+ 2 - 0
mpi/src/starpu_mpi_cache_stats.h

@@ -21,6 +21,8 @@
 #include <stdlib.h>
 #include <mpi.h>
 
+/** @file */
+
 #ifdef __cplusplus
 extern "C"
 {

+ 2 - 0
mpi/src/starpu_mpi_datatype.h

@@ -20,6 +20,8 @@
 #include <starpu_mpi.h>
 #include <starpu_mpi_private.h>
 
+/** @file */
+
 #ifdef __cplusplus
 extern "C"
 {

+ 2 - 0
mpi/src/starpu_mpi_fxt.h

@@ -22,6 +22,8 @@
 #include <common/config.h>
 #include <common/fxt.h>
 
+/** @file */
+
 #ifdef __cplusplus
 extern "C"
 {

+ 2 - 0
mpi/src/starpu_mpi_init.h

@@ -20,6 +20,8 @@
 #include <starpu.h>
 #include <starpu_mpi.h>
 
+/** @file */
+
 #ifdef __cplusplus
 extern "C"
 {

+ 8 - 3
mpi/src/starpu_mpi_private.h

@@ -27,6 +27,8 @@
 #include <common/starpu_spinlock.h>
 #include <core/simgrid.h>
 
+/** @file */
+
 #ifdef __cplusplus
 extern "C"
 {
@@ -249,7 +251,8 @@ LIST_TYPE(_starpu_mpi_req,
 
 	int ret;
 
-	enum _starpu_mpi_request_type request_type; /* 0 send, 1 recv */
+	/** 0 send, 1 recv */
+	enum _starpu_mpi_request_type request_type;
 
 	unsigned submitted;
 	unsigned completed;
@@ -318,8 +321,10 @@ struct _starpu_mpi_argc_argv
 	int *argc;
 	char ***argv;
 	MPI_Comm comm;
-	int fargc;	// Fortran argc
-	char **fargv;	// Fortran argv
+	/** Fortran argc */
+	int fargc;
+	/** Fortran argv */
+	char **fargv;
 	int rank;
 	int world_size;
 };

+ 2 - 0
mpi/src/starpu_mpi_select_node.h

@@ -19,6 +19,8 @@
 
 #include <mpi.h>
 
+/** @file */
+
 #ifdef __cplusplus
 extern "C"
 {

+ 2 - 0
mpi/src/starpu_mpi_stats.h

@@ -21,6 +21,8 @@
 #include <stdlib.h>
 #include <mpi.h>
 
+/** @file */
+
 #ifdef __cplusplus
 extern "C"
 {

+ 2 - 0
mpi/src/starpu_mpi_task_insert.h

@@ -17,6 +17,8 @@
 #ifndef __STARPU_MPI_TASK_INSERT_H__
 #define __STARPU_MPI_TASK_INSERT_H__
 
+/** @file */
+
 #ifdef __cplusplus
 extern "C"
 {

+ 5 - 3
mpi/tests/pingpong.c

@@ -43,13 +43,14 @@ int main(int argc, char **argv)
 {
 	int ret, rank, size;
 	int mpi_init;
+	int i;
 
 	int niter = DEFAULT_NITER;
 	int data_size = DEFAULT_DATA_SIZE;
 	int sleep_time = DEFAULT_SLEEP_TIME;
 	int method = DEFAULT_METHOD;
 
-	for (int i = 1; i < argc; i++)
+	for (i = 1; i < argc; i++)
 	{
 		if (strcmp(argv[i], "-n") == 0)
 		{
@@ -134,6 +135,7 @@ int main(int argc, char **argv)
 	int loop;
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
 	int sender;
+	int r = 0;
 
 	if (method == 0) // ping pongs
 	{
@@ -161,7 +163,7 @@ int main(int argc, char **argv)
 			sender = loop % size;
 			if (sender == rank)
 			{
-				for (int r = 0; r < size; r++)
+				for (r = 0; r < size; r++)
 				{
 					if (r != rank)
 					{
@@ -175,7 +177,7 @@ int main(int argc, char **argv)
 				MPI_Status status;
 				starpu_mpi_recv(tab_handle, sender, (rank * niter) + loop, MPI_COMM_WORLD, &status);
 
-				for (int r = 0; r < (size-1); r++)
+				for (r = 0; r < (size-1); r++)
 					starpu_sleep(sleep_time / 1000);
 			}
 		}

+ 1 - 1
src/common/fxt.h

@@ -290,7 +290,7 @@ extern int _starpu_fxt_willstart;
 extern starpu_pthread_mutex_t _starpu_fxt_started_mutex;
 extern starpu_pthread_cond_t _starpu_fxt_started_cond;
 
-/* Wait until FXT is started (or not). Returns if FXT was started */
+/** Wait until FXT is started (or not). Returns if FXT was started */
 static inline int _starpu_fxt_wait_initialisation()
 {
 	STARPU_PTHREAD_MUTEX_LOCK(&_starpu_fxt_started_mutex);

+ 29 - 14
src/common/graph.h

@@ -28,8 +28,10 @@ MULTILIST_CREATE_TYPE(_starpu_graph_node, dropped)
 
 struct _starpu_graph_node
 {
-	starpu_pthread_mutex_t mutex;	/* protects access to the job */
-	struct _starpu_job *job;	/* pointer to the job, if it is still alive, NULL otherwise */
+	/** protects access to the job */
+	starpu_pthread_mutex_t mutex;
+	/** pointer to the job, if it is still alive, NULL otherwise */
+	struct _starpu_job *job;
 
 	/**
 	 * Fields for graph analysis for scheduling heuristics
@@ -44,22 +46,35 @@ struct _starpu_graph_node
 	struct _starpu_graph_node_multilist_dropped dropped;
 
 	/** set of incoming dependencies */
-	struct _starpu_graph_node **incoming;	/* May contain NULLs for terminated jobs */
-	unsigned *incoming_slot;	/* Index within corresponding outgoing array */
-	unsigned n_incoming;		/* Number of slots used */
-	unsigned alloc_incoming;	/* Size of incoming */
+	/** May contain NULLs for terminated jobs */
+	struct _starpu_graph_node **incoming;
+	/** Index within corresponding outgoing array */
+	unsigned *incoming_slot;
+	/** Number of slots used */
+	unsigned n_incoming;
+	/** Size of incoming */
+	unsigned alloc_incoming;
 	/** set of outgoing dependencies */
 	struct _starpu_graph_node **outgoing;
-	unsigned *outgoing_slot;	/* Index within corresponding incoming array */
-	unsigned n_outgoing;		/* Number of slots used */
-	unsigned alloc_outgoing;	/* Size of outgoing */
 
-	unsigned depth;			/* Rank from bottom, in number of jobs */
-					/* Only available if _starpu_graph_compute_depths was called */
-	unsigned descendants;		/* Number of children, grand-children, etc. */
-					/* Only available if _starpu_graph_compute_descendants was called */
+	/** Index within corresponding incoming array */
+	unsigned *outgoing_slot;
+	/** Number of slots used */
+	unsigned n_outgoing;
+	/** Size of outgoing */
+	unsigned alloc_outgoing;
 
-	int graph_n;			/* Variable available for graph flow */
+	/** Rank from bottom, in number of jobs
+	 * Only available if _starpu_graph_compute_depths was called
+	 */
+	unsigned depth;
+	/** Number of children, grand-children, etc.
+	 * Only available if _starpu_graph_compute_descendants was called
+	 */
+	unsigned descendants;
+
+	/** Variable available for graph flow */
+	int graph_n;
 };
 
 MULTILIST_CREATE_INLINES(struct _starpu_graph_node, _starpu_graph_node, all)

+ 4 - 6
src/common/knobs.h

@@ -96,7 +96,7 @@ __STARPU_PERF_COUNTER_UPDATE_64BIT(min,<=,double,double);
 #undef __STARPU_PERF_COUNTER_UPDATE_32BIT
 #undef __STARPU_PERF_COUNTER_UPDATE_64BIT
 
-/* Floating point atomic accumulate */
+/** Floating point atomic accumulate */
 static inline void _starpu_perf_counter_update_acc_float(float *ptr, float acc_value)
 {
 	STARPU_ASSERT(sizeof(float) == sizeof(uint32_t));
@@ -339,14 +339,12 @@ static inline int _starpu_perf_knob_id_build(const enum starpu_perf_knob_scope s
 	return (index << _STARPU_PERF_KNOBS_ID_SCOPE_BITS) | scope;
 }
 
-
 void _starpu_perf_knob_init(void);
 void _starpu_perf_knob_exit(void);
 
-struct starpu_perf_knob_group *_starpu_perf_knob_group_register(
-	enum starpu_perf_knob_scope scope,
-	void (*set_func)(const struct starpu_perf_knob * const knob, void *context, const struct starpu_perf_knob_value * const value),
-	void (*get_func)(const struct starpu_perf_knob * const knob, void *context,       struct starpu_perf_knob_value * const value));
+struct starpu_perf_knob_group *_starpu_perf_knob_group_register(enum starpu_perf_knob_scope scope,
+								void (*set_func)(const struct starpu_perf_knob * const knob, void *context, const struct starpu_perf_knob_value * const value),
+								void (*get_func)(const struct starpu_perf_knob * const knob, void *context, struct starpu_perf_knob_value * const value));
 void _starpu_perf_knob_group_unregister(struct starpu_perf_knob_group *group);
 
 int _starpu_perf_knob_register(struct starpu_perf_knob_group *group, const char *name, enum starpu_perf_knob_type type, const char *help);

+ 2 - 0
src/common/list.h

@@ -18,6 +18,8 @@
 #ifndef __LIST_H__
 #define __LIST_H__
 
+/** @file */
+
 #include <starpu_util.h>
 
 /** @remarks list how-to

+ 4 - 5
src/common/timing.h

@@ -19,11 +19,6 @@
 
 /** @file */
 
-/*
- * _starpu_timing_init must be called prior to using any of these timing
- * functions.
- */
-
 #include <stdint.h>
 #include <common/config.h>
 #ifdef HAVE_UNISTD_H
@@ -32,6 +27,10 @@
 #include <starpu.h>
 #include <starpu_util.h>
 
+/**
+ * _starpu_timing_init must be called prior to using any of these timing
+ * functions.
+ */
 void _starpu_timing_init(void);
 void _starpu_clock_gettime(struct timespec *ts);
 

+ 2 - 0
src/common/uthash.h

@@ -24,6 +24,8 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifndef UTHASH_H
 #define UTHASH_H 
 
+/** @file */
+
 #include <string.h>   /* memcmp,strlen */
 #include <stddef.h>   /* ptrdiff_t */
 

+ 1 - 1
src/common/utils.h

@@ -152,7 +152,7 @@ char *_starpu_mkdtemp(char *tmpl);
 int _starpu_mkpath(const char *s, mode_t mode);
 void _starpu_mkpath_and_check(const char *s, mode_t mode);
 char *_starpu_mktemp(const char *directory, int flags, int *fd);
-/* This version creates a hierarchy of n temporary directories, useful when
+/** This version creates a hierarchy of n temporary directories, useful when
  * creating a lot of temporary files to be stored in the same place */
 char *_starpu_mktemp_many(const char *directory, int depth, int flags, int *fd);
 void _starpu_rmtemp_many(char *path, int depth);

+ 5 - 5
src/core/debug.h

@@ -290,20 +290,20 @@
 
 #endif
 
-/* Create a file that will contain StarPU's log */
+/** Create a file that will contain StarPU's log */
 void _starpu_open_debug_logfile(void);
 
-/* Close StarPU's log file */
+/** Close StarPU's log file */
 void _starpu_close_debug_logfile(void);
 
-/* Write into StarPU's log file */
+/** Write into StarPU's log file */
 void _starpu_print_to_logfile(const char *format, ...) STARPU_ATTRIBUTE_FORMAT(printf, 1, 2);
 
-/* Tell gdb whether FXT is compiled in or not */
+/** Tell gdb whether FXT is compiled in or not */
 extern int _starpu_use_fxt;
 
 #if defined(STARPU_USE_AYUDAME1)
-/* Get an Ayudame id for CL */
+/** Get an Ayudame id for CL */
 int64_t _starpu_ayudame_get_func_id(struct starpu_codelet *cl);
 #endif
 

+ 14 - 9
src/core/dependencies/cg.h

@@ -60,7 +60,8 @@ struct _starpu_cg_list
 	/** List of successors */
 	unsigned nsuccs; /* how many successors ? */
 #ifdef STARPU_DYNAMIC_DEPS_SIZE
-	unsigned succ_list_size; /* How many allocated items in succ */
+	/** How many allocated items in succ */
+	unsigned succ_list_size;
 	struct _starpu_cg **succ;
 #else
 	struct _starpu_cg *succ[STARPU_NMAXDEPS];
@@ -77,27 +78,31 @@ enum _starpu_cg_type
 /** Completion Group */
 struct _starpu_cg
 {
-	unsigned ntags; /* number of tags depended on */
-	unsigned remaining; /* number of remaining tags */
+	/** number of tags depended on */
+	unsigned ntags;
+	/** number of remaining tags */
+	unsigned remaining;
 
 #ifdef STARPU_DEBUG
 	unsigned ndeps;
-	void **deps; /* array of predecessors, size ndeps */
-	char *done;  /* which ones have notified, size ndeps */
+	/** array of predecessors, size ndeps */
+	void **deps;
+	/** which ones have notified, size ndeps */
+	char *done;
 #endif
 
 	enum _starpu_cg_type cg_type;
 
 	union
 	{
-		/* STARPU_CG_TAG */
+		/** STARPU_CG_TAG */
 		struct _starpu_tag *tag;
 
-		/* STARPU_CG_TASK */
+		/** STARPU_CG_TASK */
 		struct _starpu_job *job;
 
-		/* STARPU_CG_APPS */
-		/* in case this completion group is related to an application,
+		/** STARPU_CG_APPS
+		 * in case this completion group is related to an application,
 		 * we have to explicitely wake the waiting thread instead of
 		 * reschedule the corresponding task */
 		struct

+ 14 - 12
src/core/dependencies/tags.h

@@ -28,19 +28,18 @@
 
 enum _starpu_tag_state
 {
-	/* this tag is not declared by any task */
+	/** this tag is not declared by any task */
 	STARPU_INVALID_STATE,
-	/* _starpu_tag_declare was called to associate the tag to a task */
+	/** _starpu_tag_declare was called to associate the tag to a task */
 	STARPU_ASSOCIATED,
-	/* some task dependencies are not fulfilled yet */
+	/** some task dependencies are not fulfilled yet */
 	STARPU_BLOCKED,
-	/* the task can be (or has been) submitted to the scheduler (all deps
- 	 * fulfilled) */
+	/** the task can be (or has been) submitted to the scheduler (all deps fulfilled) */
 	STARPU_READY,
 // useless ...
-//	/* the task has been submitted to the scheduler */
+//	/** the task has been submitted to the scheduler */
 //	STARPU_SCHEDULED,
-	/* the task has been performed */
+	/** the task has been performed */
 	STARPU_DONE
 };
 
@@ -48,15 +47,18 @@ struct _starpu_job;
 
 struct _starpu_tag
 {
-	/* Lock for this structure. Locking order is in dependency order: a tag
-	 * must not be locked before locking a tag it depends on */
+	/**
+	 * Lock for this structure. Locking order is in dependency order: a tag
+	 * must not be locked before locking a tag it depends on */
 	struct _starpu_spinlock lock;
-	starpu_tag_t id; /* an identifier for the task */
+	/** an identifier for the task */
+	starpu_tag_t id;
 	enum _starpu_tag_state state;
 
 	struct _starpu_cg_list tag_successors;
 
-	struct _starpu_job *job; /* which job is associated to the tag if any ? */
+	/** which job is associated to the tag if any ? */
+	struct _starpu_job *job;
 
 	unsigned is_assigned;
 	unsigned is_submitted;
@@ -69,7 +71,7 @@ void _starpu_notify_job_start_tag_dependencies(struct _starpu_tag *tag, _starpu_
 
 void _starpu_tag_declare(starpu_tag_t id, struct _starpu_job *job);
 
-/* lock should be taken, and this releases it */
+/** lock should be taken, and this releases it */
 void _starpu_tag_set_ready(struct _starpu_tag *tag);
 
 unsigned _starpu_submit_job_enforce_task_deps(struct _starpu_job *j);

+ 3 - 0
src/core/perfmodel/multiple_regression.c

@@ -236,6 +236,9 @@ int dgels_multiple_reg_coeff(double *mpar, double *my, unsigned long nn, unsigne
 	if( info != 0 )
 	{
 		_STARPU_DISP("Warning: Problems when executing dgels_ function. It seems like the diagonal element %ld is zero.\n Multiple linear regression model will not be written into perfmodel file.\n", info);
+		free(X);
+		free(Y);
+		free(work);
 		return 1;
 	}
 

+ 4 - 2
src/core/perfmodel/perfmodel.h

@@ -50,8 +50,10 @@ struct _starpu_perfmodel_state
 	starpu_pthread_rwlock_t model_rwlock;
 	int *nimpls;
 	int *nimpls_set;
-	int ncombs;  /* The number of combinations currently used by the model */
-	int ncombs_set; /* The number of combinations allocated in the array nimpls and ncombs */
+	/** The number of combinations currently used by the model */
+	int ncombs;
+	/** The number of combinations allocated in the array nimpls and ncombs */
+	int ncombs_set;
 	int *combs;
 };
 

+ 16 - 16
src/core/perfmodel/perfmodel_bus.c

@@ -888,7 +888,7 @@ static void load_bus_affinity_file_content(void)
 	_STARPU_DEBUG("loading affinities from %s\n", path);
 
 	f = fopen(path, "r");
-	STARPU_ASSERT(f);
+	STARPU_ASSERT_MSG(f, "Error when reading from file '%s'", path);
 
 	locked = _starpu_frdlock(f) == 0;
 
@@ -903,7 +903,7 @@ static void load_bus_affinity_file_content(void)
 
 		_starpu_drop_comments(f);
 		ret = fscanf(f, "%u\t", &dummy);
-		STARPU_ASSERT(ret == 1);
+		STARPU_ASSERT_MSG(ret == 1, "Error when reading from file '%s'", path);
 
 		STARPU_ASSERT(dummy == gpu);
 
@@ -911,11 +911,11 @@ static void load_bus_affinity_file_content(void)
 		for (numa = 0; numa < nnumas; numa++)
 		{
 			ret = fscanf(f, "%u\t", &cuda_affinity_matrix[gpu][numa]);
-			STARPU_ASSERT(ret == 1);
+			STARPU_ASSERT_MSG(ret == 1, "Error when reading from file '%s'", path);
 		}
 
 		ret = fscanf(f, "\n");
-		STARPU_ASSERT(ret == 0);
+		STARPU_ASSERT_MSG(ret == 0, "Error when reading from file '%s'", path);
 	}
 #endif /* !STARPU_USE_CUDA */
 #ifdef STARPU_USE_OPENCL
@@ -927,7 +927,7 @@ static void load_bus_affinity_file_content(void)
 
 		_starpu_drop_comments(f);
 		ret = fscanf(f, "%u\t", &dummy);
-		STARPU_ASSERT(ret == 1);
+		STARPU_ASSERT_MSG(ret == 1, "Error when reading from file '%s'", path);
 
 		STARPU_ASSERT(dummy == gpu);
 
@@ -935,11 +935,11 @@ static void load_bus_affinity_file_content(void)
 		for (numa = 0; numa < nnumas; numa++)
 		{
 			ret = fscanf(f, "%u\t", &opencl_affinity_matrix[gpu][numa]);
-			STARPU_ASSERT(ret == 1);
+			STARPU_ASSERT_MSG(ret == 1, "Error when reading from file '%s'", path);
 		}
 
 		ret = fscanf(f, "\n");
-		STARPU_ASSERT(ret == 0);
+		STARPU_ASSERT_MSG(ret == 0, "Error when reading from file '%s'", path);
 	}
 #endif /* !STARPU_USE_OPENCL */
 	if (locked)
@@ -1059,12 +1059,12 @@ static int check_bus_affinity_file(void)
 	_STARPU_DEBUG("loading affinities from %s\n", path);
 
 	f = fopen(path, "r");
-	STARPU_ASSERT(f);
+	STARPU_ASSERT_MSG(f, "Error when reading from file '%s'", path);
 
 	locked = _starpu_frdlock(f) == 0;
 
 	ret = fscanf(f, "# GPU\t");
-	STARPU_ASSERT(ret == 0);
+	STARPU_ASSERT_MSG(ret == 0, "Error when reading from file '%s'", path);
 
 	ret = fscanf(f, "NUMA%u\t", &dummy);
 
@@ -1677,7 +1677,7 @@ static void write_bus_bandwidth_file_content(void)
 	_STARPU_DEBUG("writing bandwidth to %s\n", path);
 
 	f = fopen(path, "w+");
-	STARPU_ASSERT(f);
+	STARPU_ASSERT_MSG(f, "Error when opening file (writing) '%s'", path);
 
 	locked = _starpu_fwrlock(f) == 0;
 	_starpu_fftruncate(f, 0);
@@ -2045,24 +2045,24 @@ static void check_bus_config_file(void)
 
 		// Loading configuration from file
 		f = fopen(path, "r");
-		STARPU_ASSERT(f);
+		STARPU_ASSERT_MSG(f, "Error when reading from file '%s'", path);
 		locked = _starpu_frdlock(f) == 0;
 		_starpu_drop_comments(f);
 
 		ret = fscanf(f, "%u\t", &read_cpus);
-		STARPU_ASSERT(ret == 1);
+		STARPU_ASSERT_MSG(ret == 1, "Error when reading from file '%s'", path);
 		_starpu_drop_comments(f);
 
 		ret = fscanf(f, "%u\t", &read_numa);
-		STARPU_ASSERT(ret == 1);
+		STARPU_ASSERT_MSG(ret == 1, "Error when reading from file '%s'", path);
 		_starpu_drop_comments(f);
 
 		ret = fscanf(f, "%u\t", &read_cuda);
-		STARPU_ASSERT(ret == 1);
+		STARPU_ASSERT_MSG(ret == 1, "Error when reading from file '%s'", path);
 		_starpu_drop_comments(f);
 
 		ret = fscanf(f, "%u\t", &read_opencl);
-		STARPU_ASSERT(ret == 1);
+		STARPU_ASSERT_MSG(ret == 1, "Error when reading from file '%s'", path);
 		_starpu_drop_comments(f);
 
 		ret = fscanf(f, "%u\t", &read_mic);
@@ -2117,7 +2117,7 @@ static void write_bus_config_file_content(void)
 	_STARPU_DEBUG("writing config to %s\n", path);
 
 	f = fopen(path, "w+");
-	STARPU_ASSERT(f);
+	STARPU_ASSERT_MSG(f, "Error when opening file (writing) '%s'", path);
 	locked = _starpu_fwrlock(f) == 0;
 	_starpu_fftruncate(f, 0);
 

+ 1 - 1
src/core/simgrid.h

@@ -73,7 +73,7 @@ union _starpu_async_channel_event;
 int _starpu_simgrid_wait_transfer_event(union _starpu_async_channel_event *event);
 int _starpu_simgrid_test_transfer_event(union _starpu_async_channel_event *event);
 void _starpu_simgrid_sync_gpus(void);
-/* Return the number of hosts prefixed by PREFIX */
+/** Return the number of hosts prefixed by PREFIX */
 int _starpu_simgrid_get_nbhosts(const char *prefix);
 unsigned long long _starpu_simgrid_get_memsize(const char *prefix, unsigned devid);
 starpu_sg_host_t _starpu_simgrid_get_host_by_name(const char *name);

+ 6 - 3
src/core/topology.h

@@ -31,9 +31,12 @@ struct _starpu_machine_config;
 /** This is allocated for each hwloc object */
 struct _starpu_hwloc_userdata
 {
-	struct _starpu_worker_list *worker_list; /** List of workers running on this obj */
-	unsigned ngpus; /** Number of GPUs sharing this PCI link */
-	struct _starpu_worker *pu_worker; /** Worker running this PU */
+	 /** List of workers running on this obj */
+	struct _starpu_worker_list *worker_list;
+	 /** Number of GPUs sharing this PCI link */
+	unsigned ngpus;
+	/** Worker running this PU */
+	struct _starpu_worker *pu_worker;
 };
 #endif
 #endif

+ 6 - 6
src/core/workers.h

@@ -204,7 +204,7 @@ LIST_TYPE(_starpu_worker,
 	int enable_knob;
 	int bindid_requested;
 
-	/* Keep this last, to make sure to separate worker data in separate
+	  /** Keep this last, to make sure to separate worker data in separate
 	  cache lines. */
 	char padding[STARPU_CACHELINE_SIZE];
 );
@@ -228,7 +228,7 @@ struct _starpu_combined_worker
 	hwloc_bitmap_t hwloc_cpu_set;
 #endif
 
-	/* Keep this last, to make sure to separate worker data in separate
+	/** Keep this last, to make sure to separate worker data in separate
 	  cache lines. */
 	char padding[STARPU_CACHELINE_SIZE];
 };
@@ -397,7 +397,7 @@ struct _starpu_machine_config
 	/** Memory node for MPI, if only one */
 	int mpi_nodeid;
 
-	/* Separate out previous variables from per-worker data. */
+	/** Separate out previous variables from per-worker data. */
 	char padding1[STARPU_CACHELINE_SIZE];
 
 	/** Basic workers : each of this worker is running its own driver and
@@ -410,7 +410,7 @@ struct _starpu_machine_config
 
 	starpu_pthread_mutex_t submitted_mutex;
 
-	/* Separate out previous mutex from the rest of the data. */
+	/** Separate out previous mutex from the rest of the data. */
 	char padding2[STARPU_CACHELINE_SIZE];
 
 	/** Translation table from bindid to worker IDs */
@@ -1201,8 +1201,8 @@ void _starpu_worker_refuse_task(struct _starpu_worker *worker, struct starpu_tas
 void _starpu_set_catch_signals(int do_catch_signal);
 int _starpu_get_catch_signals(void);
 
-/* Performance Monitoring */
-static inline int _starpu_perf_counter_paused(void) 
+/** Performance Monitoring */
+static inline int _starpu_perf_counter_paused(void)
 {
 	STARPU_RMB();
 	return STARPU_UNLIKELY(_starpu_config.perf_counter_pause_depth > 0);

+ 3 - 1
src/datawizard/coherency.h

@@ -145,8 +145,10 @@ struct _starpu_data_state
 	struct _starpu_data_state *root_handle; /** root of the tree */
 	struct _starpu_data_state *father_handle; /** father of the node, NULL if the current node is the root */
 	starpu_data_handle_t *active_children; /** The currently active set of read-write children */
+	unsigned active_nchildren;
 	starpu_data_handle_t **active_readonly_children; /** The currently active set of read-only children */
-	unsigned nactive_readonly_children; /** Size of active_readonly_children array */
+	unsigned *active_readonly_nchildren; /** Size of active_readonly_children[i] array */
+	unsigned nactive_readonly_children; /** Size of active_readonly_children and active_readonly_nchildren arrays. Actual use is given by 'partitioned' */
 	/** Our siblings in the father partitioning */
 	unsigned nsiblings; /** How many siblings */
 	starpu_data_handle_t *siblings;

+ 8 - 4
src/datawizard/copy_driver.h

@@ -49,10 +49,14 @@ struct _starpu_data_replicate;
 
 enum _starpu_is_prefetch
 {
-	STARPU_FETCH = 0,		/* A task really needs it now! */
-	STARPU_TASK_PREFETCH = 1,	/* A task will need it soon */
-	STARPU_PREFETCH = 2,		/* It is a good idea to have it asap */
-	STARPU_IDLEFETCH = 3,		/* Get this here when you have time to */
+ 	/** A task really needs it now! */
+ 	STARPU_FETCH = 0,
+	/** A task will need it soon */
+	STARPU_TASK_PREFETCH = 1,
+	/** It is a good idea to have it asap */
+	STARPU_PREFETCH = 2,
+	/** Get this here when you have time to */
+	STARPU_IDLEFETCH = 3,
 	STARPU_NFETCH
 };
 

+ 2 - 1
src/datawizard/data_request.h

@@ -35,7 +35,8 @@
 #define MAX_PENDING_REQUESTS_PER_NODE 20
 #define MAX_PENDING_PREFETCH_REQUESTS_PER_NODE 10
 #define MAX_PENDING_IDLE_REQUESTS_PER_NODE 1
-#define MAX_PUSH_TIME 1000 /* Maximum time in us that we can afford pushing requests before going back to the driver loop, e.g. for checking GPU task termination */
+/** Maximum time in us that we can afford pushing requests before going back to the driver loop, e.g. for checking GPU task termination */
+#define MAX_PUSH_TIME 1000
 
 struct _starpu_data_replicate;
 

+ 9 - 0
src/datawizard/filters.c

@@ -651,6 +651,7 @@ void _starpu_data_partition_submit(starpu_data_handle_t initial_handle, unsigned
 	STARPU_ASSERT_MSG(initial_handle->part_readonly == 0, "One can't submit a partition planning while a readonly partitioning is active");
 	STARPU_ASSERT_MSG(nparts > 0, "One can't partition into 0 parts");
 	initial_handle->partitioned++;
+	initial_handle->active_nchildren = children[0]->nsiblings;
 	initial_handle->active_children = children[0]->siblings;
 	_starpu_spin_unlock(&initial_handle->header_lock);
 
@@ -715,9 +716,11 @@ void starpu_data_partition_readonly_submit(starpu_data_handle_t initial_handle,
 	if (initial_handle->nactive_readonly_children < initial_handle->partitioned)
 	{
 		_STARPU_REALLOC(initial_handle->active_readonly_children, initial_handle->partitioned * sizeof(initial_handle->active_readonly_children[0]));
+		_STARPU_REALLOC(initial_handle->active_readonly_nchildren, initial_handle->partitioned * sizeof(initial_handle->active_readonly_nchildren[0]));
 		initial_handle->nactive_readonly_children = initial_handle->partitioned;
 	}
 	initial_handle->active_readonly_children[initial_handle->partitioned-1] = children[0]->siblings;
+	initial_handle->active_readonly_nchildren[initial_handle->partitioned-1] = children[0]->nsiblings;
 	_starpu_spin_unlock(&initial_handle->header_lock);
 
 	for (i = 0; i < nparts; i++)
@@ -748,8 +751,10 @@ void starpu_data_partition_readwrite_upgrade_submit(starpu_data_handle_t initial
 	STARPU_ASSERT_MSG(initial_handle->part_readonly == 1, "One can only upgrade a readonly partition planning");
 	STARPU_ASSERT_MSG(nparts > 0, "One can't partition into 0 parts");
 	initial_handle->part_readonly = 0;
+	initial_handle->active_nchildren = initial_handle->active_readonly_nchildren[0];
 	initial_handle->active_children = initial_handle->active_readonly_children[0];
 	initial_handle->active_readonly_children[0] = NULL;
+	initial_handle->active_readonly_nchildren[0] = 0;
 	_starpu_spin_unlock(&initial_handle->header_lock);
 
 	unsigned i;
@@ -782,18 +787,22 @@ void _starpu_data_unpartition_submit(starpu_data_handle_t initial_handle, unsign
 			if (initial_handle->active_readonly_children[i] == children[0]->siblings)
 			{
 				initial_handle->active_readonly_children[i] = initial_handle->active_readonly_children[initial_handle->partitioned-1];
+				initial_handle->active_readonly_nchildren[i] = initial_handle->active_readonly_nchildren[initial_handle->partitioned-1];
 				initial_handle->active_readonly_children[initial_handle->partitioned-1] = NULL;
+				initial_handle->active_readonly_nchildren[initial_handle->partitioned-1] = 0;
 				break;
 			}
 		}
 	}
 	else
 	{
+		initial_handle->active_nchildren = 0;
 		initial_handle->active_children = NULL;
 	}
 	initial_handle->partitioned--;
 	if (!initial_handle->partitioned)
 		initial_handle->part_readonly = 0;
+	initial_handle->active_nchildren = 0;
 	initial_handle->active_children = NULL;
 	_starpu_spin_unlock(&initial_handle->header_lock);
 

+ 3 - 0
src/datawizard/interfaces/data_interface.c

@@ -415,7 +415,9 @@ int _starpu_data_handle_init(starpu_data_handle_t handle, struct starpu_data_int
 	//handle->root_handle
 	//handle->father_handle
 	//handle->active_children = NULL;
+	//handle->active_nchildren = 0;
 	//handle->active_readonly_children = NULL;
+	//handle->active_readonly_nchildren = NULL;
 	//handle->nactive_readonly_children = 0;
 	//handle->nsiblings
 	//handle->siblings
@@ -1025,6 +1027,7 @@ retry_busy:
 
 	_starpu_data_clear_implicit(handle);
 	free(handle->active_readonly_children);
+	free(handle->active_readonly_nchildren);
 
 	STARPU_PTHREAD_MUTEX_DESTROY(&handle->busy_mutex);
 	STARPU_PTHREAD_COND_DESTROY(&handle->busy_cond);

+ 2 - 2
src/datawizard/memalloc.h

@@ -59,8 +59,8 @@ LIST_TYPE(_starpu_mem_chunk,
 	unsigned home:1;
 	/** Whether the memchunk is in the clean part of the mc_list */
 	unsigned clean:1;
-	/** Was this chunk used since it got allocated?  */
-	/* FIXME: probably useless now with nb_tasks_prefetch */
+	/** Was this chunk used since it got allocated?
+	    FIXME: probably useless now with nb_tasks_prefetch */
 	unsigned diduse:1;
 	/** Was this chunk marked as "won't use"? */
 	unsigned wontuse:1;

+ 3 - 3
src/datawizard/memory_nodes.h

@@ -45,7 +45,7 @@ struct _starpu_memory_node_descr
 	enum starpu_node_kind nodes[STARPU_MAXNODES];
 	struct _starpu_node_ops *node_ops[STARPU_MAXNODES];
 
-	/* Get the device id associated to this node, or -1 if not applicable */
+	/** Get the device id associated to this node, or -1 if not applicable */
 	int devid[STARPU_MAXNODES];
 
 	unsigned nworkers[STARPU_MAXNODES];
@@ -55,7 +55,7 @@ struct _starpu_memory_node_descr
 #endif
 
 	// TODO move this 2 lists outside struct _starpu_memory_node_descr
-	/* Every worker is associated to a condition variable on which the
+	/** Every worker is associated to a condition variable on which the
 	 * worker waits when there is task available. It is possible that
 	 * multiple worker share the same condition variable, so we maintain a
 	 * list of all these condition variables so that we can wake up all
@@ -63,7 +63,7 @@ struct _starpu_memory_node_descr
 	starpu_pthread_rwlock_t conditions_rwlock;
 	struct _starpu_cond_and_worker conditions_attached_to_node[STARPU_MAXNODES][STARPU_NMAXWORKERS];
 	struct _starpu_cond_and_worker conditions_all[STARPU_MAXNODES*STARPU_NMAXWORKERS];
-	/* the number of queues attached to each node */
+	/** the number of queues attached to each node */
 	unsigned total_condition_count;
 	unsigned condition_count[STARPU_MAXNODES];
 };

+ 1 - 1
src/datawizard/memstats.h

@@ -25,7 +25,7 @@
 #ifdef STARPU_MEMORY_STATS
 struct _starpu_memory_stats
 {
-	/* Handle access stats per node */
+	/** Handle access stats per node */
 	unsigned direct_access[STARPU_MAXNODES];
 	unsigned loaded_shared[STARPU_MAXNODES];
 	unsigned loaded_owner[STARPU_MAXNODES];

+ 28 - 0
src/datawizard/user_interactions.c

@@ -689,6 +689,34 @@ void starpu_data_wont_use(starpu_data_handle_t handle)
 	if (!handle->initialized)
 		/* No value atm actually */
 		return;
+
+	if (starpu_data_get_nb_children(handle) != 0)
+	{
+		int i;
+		for(i=0 ; i<starpu_data_get_nb_children(handle) ; i++)
+			starpu_data_wont_use(starpu_data_get_child(handle, i));
+		return;
+	}
+
+	if (handle->partitioned != 0)
+	{
+		unsigned i;
+		for(i=0 ; i<handle->partitioned; i++)
+		{
+			unsigned j;
+			for(j=0 ; j<handle->active_readonly_nchildren[i] ; j++)
+				starpu_data_wont_use(handle->active_readonly_children[i][j]);
+		}
+	}
+
+	if (handle->active_nchildren != 0)
+	{
+		unsigned j;
+		for(j=0 ; j<handle->active_nchildren ; j++)
+			starpu_data_wont_use(handle->active_children[j]);
+		return;
+	}
+
 	_STARPU_TRACE_DATA_WONT_USE(handle);
 	starpu_data_acquire_on_node_cb_sequential_consistency_quick(handle, STARPU_ACQUIRE_NO_NODE_LOCK_ALL, STARPU_R, _starpu_data_wont_use, handle, 1, 1);
 }

+ 1 - 1
src/drivers/mic/driver_mic_source.h

@@ -34,7 +34,7 @@
 
 extern struct _starpu_node_ops _starpu_driver_mic_node_ops;
 
-/* Array of structures containing all the informations useful to send
+/** Array of structures containing all the informations useful to send
  * and receive informations with devices */
 extern struct _starpu_mp_node *_starpu_mic_nodes[STARPU_MAXMICDEVS];
 

+ 2 - 2
src/drivers/mp_common/mp_common.h

@@ -57,12 +57,12 @@ enum _starpu_mp_command
 	STARPU_MP_COMMAND_ANSWER_ALLOCATE,
 	STARPU_MP_COMMAND_ERROR_ALLOCATE,
 	STARPU_MP_COMMAND_FREE,
-        /* Synchronous send */
+        /** Synchronous send */
 	STARPU_MP_COMMAND_RECV_FROM_HOST,
 	STARPU_MP_COMMAND_SEND_TO_HOST,
 	STARPU_MP_COMMAND_RECV_FROM_SINK,
 	STARPU_MP_COMMAND_SEND_TO_SINK,
-        /* Asynchronous send */
+        /** Asynchronous send */
         STARPU_MP_COMMAND_RECV_FROM_HOST_ASYNC,
         STARPU_MP_COMMAND_RECV_FROM_HOST_ASYNC_COMPLETED,
 	STARPU_MP_COMMAND_SEND_TO_HOST_ASYNC,

+ 30 - 25
src/sched_policies/component_work_stealing.c

@@ -30,14 +30,20 @@
 #warning TODO: locality work-stealing
 #endif
 
+struct _starpu_component_work_stealing_data_per_worker
+{
+	struct _starpu_prio_deque fifo;
+	unsigned last_pop_child;
+};
+
 struct _starpu_component_work_stealing_data
 {
 /* keep track of the work performed from the beginning of the algorithm to make
  * better decisions about which queue to child when stealing or deferring work
  */
-	unsigned performed_total, last_pop_child, last_push_child;
+	struct _starpu_component_work_stealing_data_per_worker *per_worker;
+	unsigned performed_total, last_push_child;
 
-	struct _starpu_prio_deque * fifos;
 	starpu_pthread_mutex_t ** mutexes;
 	unsigned size;
 };
@@ -50,16 +56,14 @@ struct _starpu_component_work_stealing_data
 static struct starpu_task *  steal_task_round_robin(struct starpu_sched_component *component, int workerid)
 {
 	struct _starpu_component_work_stealing_data *wsd = component->data;
-	STARPU_HG_DISABLE_CHECKING(wsd->last_pop_child);
-	unsigned i = wsd->last_pop_child;
-	wsd->last_pop_child = (i + 1) % component->nchildren;
-	STARPU_HG_ENABLE_CHECKING(wsd->last_pop_child);
+	unsigned i = wsd->per_worker[workerid].last_pop_child;
+	wsd->per_worker[workerid].last_pop_child = (i + 1) % component->nchildren;
 	/* If the worker's queue have no suitable tasks, let's try
 	 * the next ones */
 	struct starpu_task * task = NULL;
 	while (1)
 	{
-		struct _starpu_prio_deque * fifo = &wsd->fifos[i];
+		struct _starpu_prio_deque * fifo = &wsd->per_worker[i].fifo;
 
 		STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
 		task = _starpu_prio_deque_deque_task_for_worker(fifo, workerid, NULL);
@@ -75,7 +79,7 @@ static struct starpu_task *  steal_task_round_robin(struct starpu_sched_componen
 			break;
 		}
 
-		if (i == wsd->last_pop_child)
+		if (i == wsd->per_worker[workerid].last_pop_child)
 		{
 			/* We got back to the first worker,
 			 * don't go in infinite loop */
@@ -141,17 +145,17 @@ static struct starpu_task * pull_task(struct starpu_sched_component * component,
 	struct _starpu_component_work_stealing_data * wsd = component->data;
 	const double now = starpu_timing_now();
 	STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
-	struct starpu_task * task = _starpu_prio_deque_pop_task(&wsd->fifos[i]);
+	struct starpu_task * task = _starpu_prio_deque_pop_task(&wsd->per_worker[i].fifo);
 	if(task)
 	{
 		if(!isnan(task->predicted))
 		{
-			wsd->fifos[i].exp_len -= task->predicted;
-			wsd->fifos[i].exp_start = now + task->predicted;
+			wsd->per_worker[i].fifo.exp_len -= task->predicted;
+			wsd->per_worker[i].fifo.exp_start = now + task->predicted;
 		}
 	}
 	else
-		wsd->fifos[i].exp_len = 0.0;
+		wsd->per_worker[i].fifo.exp_len = 0.0;
 
 	STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
 	if(task)
@@ -163,7 +167,7 @@ static struct starpu_task * pull_task(struct starpu_sched_component * component,
 	if(task)
 	{
 		STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
-		wsd->fifos[i].nprocessed++;
+		wsd->per_worker[i].fifo.nprocessed++;
 		STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
 
 		return task;
@@ -196,9 +200,9 @@ double _ws_estimated_end(struct starpu_sched_component * component)
 	for(i = 0; i < component->nchildren; i++)
 	{
 		STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
-		sum_len += wsd->fifos[i].exp_len;
-		wsd->fifos[i].exp_start = STARPU_MAX(now, wsd->fifos[i].exp_start);
-		sum_start += wsd->fifos[i].exp_start;
+		sum_len += wsd->per_worker[i].fifo.exp_len;
+		wsd->per_worker[i].fifo.exp_start = STARPU_MAX(now, wsd->per_worker[i].fifo.exp_start);
+		sum_start += wsd->per_worker[i].fifo.exp_start;
 		STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
 
 	}
@@ -216,7 +220,7 @@ double _ws_estimated_load(struct starpu_sched_component * component)
 	for(i = 0; i < component->nchildren; i++)
 	{
 		STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
-		ntasks += wsd->fifos[i].ntasks;
+		ntasks += wsd->per_worker[i].fifo.ntasks;
 		STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
 	}
 	double speedup = 0.0;
@@ -265,7 +269,7 @@ static int push_task(struct starpu_sched_component * component, struct starpu_ta
 
 	STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
 	starpu_sched_task_break(task);
-	ret = _starpu_prio_deque_push_front_task(&wsd->fifos[i], task);
+	ret = _starpu_prio_deque_push_front_task(&wsd->per_worker[i].fifo, task);
 	STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
 
 	wsd->last_push_child = i;
@@ -308,9 +312,9 @@ int starpu_sched_tree_work_stealing_push_task(struct starpu_task *task)
 
 			struct _starpu_component_work_stealing_data * wsd = component->data;
 			STARPU_COMPONENT_MUTEX_LOCK(wsd->mutexes[i]);
-			int ret = _starpu_prio_deque_push_front_task(&wsd->fifos[i] , task);
+			int ret = _starpu_prio_deque_push_front_task(&wsd->per_worker[i].fifo , task);
 			if(ret == 0 && !isnan(task->predicted))
-				wsd->fifos[i].exp_len += task->predicted;
+				wsd->per_worker[i].fifo.exp_len += task->predicted;
 			STARPU_COMPONENT_MUTEX_UNLOCK(wsd->mutexes[i]);
 
 			component->can_pull(component);
@@ -329,12 +333,13 @@ void _ws_add_child(struct starpu_sched_component * component, struct starpu_sche
 	if(wsd->size < component->nchildren)
 	{
 		STARPU_ASSERT(wsd->size == component->nchildren - 1);
-		_STARPU_REALLOC(wsd->fifos, component->nchildren * sizeof(*wsd->fifos));
+		_STARPU_REALLOC(wsd->per_worker, component->nchildren * sizeof(*wsd->per_worker));
 		_STARPU_REALLOC(wsd->mutexes, component->nchildren * sizeof(*wsd->mutexes));
 		wsd->size = component->nchildren;
 	}
 
-	_starpu_prio_deque_init(&wsd->fifos[component->nchildren - 1]);
+	wsd->per_worker[component->nchildren - 1].last_pop_child = 0;
+	_starpu_prio_deque_init(&wsd->per_worker[component->nchildren - 1].fifo);
 
 	starpu_pthread_mutex_t *mutex;
 	_STARPU_MALLOC(mutex, sizeof(*mutex));
@@ -356,8 +361,8 @@ void _ws_remove_child(struct starpu_sched_component * component, struct starpu_s
 			break;
 	}
 	STARPU_ASSERT(i_component != component->nchildren);
-	struct _starpu_prio_deque tmp_fifo = wsd->fifos[i_component];
-	wsd->fifos[i_component] = wsd->fifos[component->nchildren - 1];
+	struct _starpu_prio_deque tmp_fifo = wsd->per_worker[i_component].fifo;
+	wsd->per_worker[i_component].fifo = wsd->per_worker[component->nchildren - 1].fifo;
 
 
 	component->children[i_component] = component->children[component->nchildren - 1];
@@ -372,7 +377,7 @@ void _ws_remove_child(struct starpu_sched_component * component, struct starpu_s
 void _work_stealing_component_deinit_data(struct starpu_sched_component * component)
 {
 	struct _starpu_component_work_stealing_data * wsd = component->data;
-	free(wsd->fifos);
+	free(wsd->per_worker);
 	free(wsd->mutexes);
 	free(wsd);
 }

+ 1 - 1
src/sched_policies/helper_mct.h

@@ -23,7 +23,7 @@ struct _starpu_mct_data
 	double beta;
 	double _gamma;
 	double idle_power;
-	starpu_pthread_mutex_t scheduling_mutex; 
+	starpu_pthread_mutex_t scheduling_mutex;
 };
 
 struct _starpu_mct_data *starpu_mct_init_parameters(struct starpu_sched_component_mct_data *params);

+ 1 - 1
src/sched_policies/prio_deque.h

@@ -27,7 +27,7 @@ struct _starpu_prio_deque
 	struct starpu_task_prio_list list;
 	unsigned ntasks;
 	unsigned nprocessed;
-	// Assumptions: 
+	// Assumptions:
 	// exp_len is the sum of predicted_length + predicted_tansfer of all tasks in list
 	// exp_start is the time at which the first task of list can start
 	// exp_end = exp_start + exp_end

+ 1 - 1
src/sched_policies/sched_component.h

@@ -23,7 +23,7 @@
 #include <starpu_sched_component.h>
 
 
-/* lock and unlock drivers for modifying schedulers */
+/** lock and unlock drivers for modifying schedulers */
 void _starpu_sched_component_lock_all_workers(void);
 void _starpu_sched_component_unlock_all_workers(void);
 

+ 15 - 10
src/sched_policies/work_stealing_policy.c

@@ -82,6 +82,11 @@ struct _starpu_work_stealing_data_per_worker
 	int *proxlist;
 	int busy;	/* Whether this worker is working on a task */
 
+	/* keep track of the work performed from the beginning of the algorithm to make
+	 * better decisions about which queue to select when stealing work
+	 */
+	unsigned last_pop_worker;
+
 #ifdef USE_LOCALITY_TASKS
 	/* This records the same as queue, but hashed by data accessed with locality flag.  */
 	/* FIXME: we record only one task per data, assuming that the access is
@@ -99,9 +104,8 @@ struct _starpu_work_stealing_data
 	int (*select_victim)(struct _starpu_work_stealing_data *, unsigned, int);
 	struct _starpu_work_stealing_data_per_worker *per_worker;
 	/* keep track of the work performed from the beginning of the algorithm to make
-	 * better decisions about which queue to select when stealing or deferring work
+	 * better decisions about which queue to select when deferring work
 	 */
-	unsigned last_pop_worker;
 	unsigned last_push_worker;
 };
 
@@ -124,7 +128,8 @@ static int calibration_value = 0;
  */
 static int select_victim_round_robin(struct _starpu_work_stealing_data *ws, unsigned sched_ctx_id)
 {
-	unsigned worker = ws->last_pop_worker;
+	unsigned workerid = starpu_worker_get_id_check();
+	unsigned worker = ws->per_worker[workerid].last_pop_worker;
 	unsigned nworkers;
 	int *workerids = NULL;
 	nworkers = starpu_sched_ctx_get_workers_list_raw(sched_ctx_id, &workerids);
@@ -140,12 +145,14 @@ static int select_victim_round_robin(struct _starpu_work_stealing_data *ws, unsi
 		if (!ws->per_worker[workerids[worker]].notask)
 		{
 			if (ws->per_worker[workerids[worker]].busy
-						   || starpu_worker_is_blocked_in_parallel(workerids[worker]))
+						   || starpu_worker_is_blocked_in_parallel(workerids[worker])) {
+				ntasks = 1;
 				break;
+			}
 		}
 
 		worker = (worker + 1) % nworkers;
-		if (worker == ws->last_pop_worker)
+		if (worker == ws->per_worker[workerid].last_pop_worker)
 		{
 			/* We got back to the first worker,
 			 * don't go in infinite loop */
@@ -154,7 +161,7 @@ static int select_victim_round_robin(struct _starpu_work_stealing_data *ws, unsi
 		}
 	}
 
-	ws->last_pop_worker = (worker + 1) % nworkers;
+	ws->per_worker[workerid].last_pop_worker = (worker + 1) % nworkers;
 
 	worker = workerids[worker];
 
@@ -353,7 +360,7 @@ static struct starpu_task *ws_pick_task(struct _starpu_work_stealing_data *ws, i
 	else
 		task = _starpu_prio_deque_pop_task_for_worker(&data_source->queue, target, &skipped);
 
-	if (!data_source->queue.ntasks)
+	if (task && !data_source->queue.ntasks)
 	{
 		STARPU_ASSERT(ws->per_worker[source].notask == 0);
 		ws->per_worker[source].notask = 1;
@@ -400,7 +407,7 @@ static struct starpu_task *ws_pick_task(struct _starpu_work_stealing_data *ws, i
 	else
 		task = _starpu_prio_deque_pop_task_for_worker(&ws->per_worker[source].queue, target, &skipped);
 
-	if (!ws->per_worker[source].queue.ntasks)
+	if (task && !ws->per_worker[source].queue.ntasks)
 	{
 		STARPU_ASSERT(ws->per_worker[source].notask == 0);
 		ws->per_worker[source].notask = 1;
@@ -748,9 +755,7 @@ static void initialize_ws_policy(unsigned sched_ctx_id)
 	_STARPU_MALLOC(ws, sizeof(struct _starpu_work_stealing_data));
 	starpu_sched_ctx_set_policy_data(sched_ctx_id, (void*)ws);
 
-	ws->last_pop_worker = 0;
 	ws->last_push_worker = 0;
-	STARPU_HG_DISABLE_CHECKING(ws->last_pop_worker);
 	STARPU_HG_DISABLE_CHECKING(ws->last_push_worker);
 	ws->select_victim = select_victim;
 

+ 5 - 5
src/util/starpu_clusters_create.h

@@ -79,7 +79,7 @@ LIST_TYPE(_starpu_cluster,
 )
 
 
-/* Machine discovery and cluster creation main funcitons */
+/** Machine discovery and cluster creation main functions */
 int _starpu_cluster_machine(hwloc_obj_type_t cluster_level,
 			     struct starpu_cluster_machine* machine);
 int _starpu_cluster_topology(hwloc_obj_type_t cluster_level,
@@ -88,13 +88,13 @@ void _starpu_cluster_group(hwloc_obj_type_t cluster_level,
 			   struct starpu_cluster_machine* machine);
 void _starpu_cluster(struct _starpu_cluster_group* group);
 
-/* Parameter functions */
+/** Parameter functions */
 void _starpu_cluster_init_parameters(struct _starpu_cluster_parameters* globals);
 void _starpu_cluster_copy_parameters(struct _starpu_cluster_parameters* src,
 				     struct _starpu_cluster_parameters* dst);
 int _starpu_cluster_analyze_parameters(struct _starpu_cluster_parameters* params, int npus);
 
-/* Cluster helper functions */
+/** Cluster helper functions */
 void _starpu_cluster_init(struct _starpu_cluster* cluster, struct _starpu_cluster_group* father);
 void _starpu_cluster_create(struct _starpu_cluster* cluster);
 
@@ -102,14 +102,14 @@ int _starpu_cluster_bind(struct _starpu_cluster* cluster);
 int _starpu_cluster_remove(struct _starpu_cluster_list* cluster_list,
 			   struct _starpu_cluster* cluster);
 
-/* Cluster group helper function */
+/** Cluster group helper function */
 void _starpu_cluster_group_init(struct _starpu_cluster_group* group,
 				struct starpu_cluster_machine* father);
 void _starpu_cluster_group_create(struct _starpu_cluster_group* group);
 int _starpu_cluster_group_remove(struct _starpu_cluster_group_list* group_list,
 				 struct _starpu_cluster_group* group);
 
-/* Binding helpers */
+/** Binding helpers */
 void _starpu_cluster_noop(void* buffers[], void* cl_arg)
 {
 	(void) buffers;

+ 2 - 1
starpurm/examples/Makefile.am

@@ -13,6 +13,7 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 #
+include $(top_srcdir)/starpu.mk
 SUBDIRS =
 
 CLEANFILES = *.gcno *.gcda *.linkinfo
@@ -20,7 +21,7 @@ CLEANFILES = *.gcno *.gcda *.linkinfo
 AM_CPPFLAGS	= -I$(top_srcdir)/include -I$(top_srcdir)/src -I$(top_builddir)/src -I$(top_builddir)/include
 AM_CPPFLAGS	+= -I$(top_srcdir)/starpurm/include -I$(top_srcdir)/starpurm/src -I$(top_builddir)/starpurm/src -I$(top_builddir)/starpurm/include
 AM_CFLAGS	= -Wall -g $(HWLOC_CFLAGS) $(DLB_CFLAGS)
-LDADD	= $(top_builddir)/starpurm/src/libstarpurm-@STARPU_EFFECTIVE_VERSION@.la $(HWLOC_LIBS) $(DLB_LIBS)
+LDADD	= $(top_builddir)/starpurm/src/libstarpurm-@STARPU_EFFECTIVE_VERSION@.la $(top_builddir)/src/@LIBSTARPU_LINK@ $(HWLOC_LIBS) $(DLB_LIBS)
 
 examplebindir = $(libdir)/starpu/examples/starpurm
 

+ 29 - 26
starpurm/src/starpurm_private.h

@@ -17,7 +17,10 @@
 #ifndef __STARPURM_PRIVATE_H
 #define __STARPURM_PRIVATE_H
 
-enum e_state {
+/** @file */
+
+enum e_state
+{
 	state_uninitialized = 0,
 	state_init
 };
@@ -33,86 +36,86 @@ enum e_starpurm_unit_type
 
 struct s_starpurm
 {
-	/* Machine topology as detected by hwloc. */
+	/** Machine topology as detected by hwloc. */
 	hwloc_topology_t topology;
 
-	/* Current upper bound on the number of CPU cores selectable for computing with the runtime system. */
+	/** Current upper bound on the number of CPU cores selectable for computing with the runtime system. */
 	unsigned max_ncpus;
 
-	/* Number of currently selected CPU workers */
+	/** Number of currently selected CPU workers */
 	unsigned selected_ncpus;
 
-	/* Number of currently selected workers (CPU+devices) */
+	/** Number of currently selected workers (CPU+devices) */
 	unsigned selected_nworkers;
 
-	/* Initialization state of the RM instance. */
+	/** Initialization state of the RM instance. */
 	int state;
 
-	/* Boolean indicating the state of the dynamic resource sharing layer.
+	/** Boolean indicating the state of the dynamic resource sharing layer.
 	 *
 	 * !0 indicates that dynamic resource sharing is enabled.
 	 * 0 indicates that dynamic resource sharing is disabled.
 	 */
 	int dynamic_resource_sharing;
 
-	/* Id of the StarPU's sched_ctx used by the RM instance. */
+	/** Id of the StarPU's sched_ctx used by the RM instance. */
 	unsigned sched_ctx_id;
 
-	/* Number of unit types supported by this RM instance. */
+	/** Number of unit types supported by this RM instance. */
 	int unit_ntypes;
 
-	/* Number of unitss available for each type. */
+	/** Number of units available for each type. */
 	int *nunits_by_type;
 
-	/* Number of units. */
+	/** Number of units. */
 	int nunits;
 
-	/* Offset of unit numbering for each type. */
+	/** Offset of unit numbering for each type. */
 	int *unit_offsets_by_type;
 
-	/* Array of units. */
+	/** Array of units. */
 	struct s_starpurm_unit *units;
 
-	/* Cpuset of all the StarPU's workers (CPU+devices. */
+	/** Cpuset of all the StarPU's workers (CPU+devices). */
 	hwloc_cpuset_t global_cpuset;
 
-	/* Cpuset of all StarPU CPU workers. */
+	/** Cpuset of all StarPU CPU workers. */
 	hwloc_cpuset_t all_cpu_workers_cpuset;
 
-	/* Cpuset of all StarPU OpenCL workers. */
+	/** Cpuset of all StarPU OpenCL workers. */
 	hwloc_cpuset_t all_opencl_device_workers_cpuset;
 
-	/* Cpuset of all StarPU CUDA workers. */
+	/** Cpuset of all StarPU CUDA workers. */
 	hwloc_cpuset_t all_cuda_device_workers_cpuset;
 
-	/* Cpuset of all StarPU MIC workers. */
+	/** Cpuset of all StarPU MIC workers. */
 	hwloc_cpuset_t all_mic_device_workers_cpuset;
 
-	/* Cpuset of all StarPU device workers. */
+	/** Cpuset of all StarPU device workers. */
 	hwloc_cpuset_t all_device_workers_cpuset;
 
-	/* Cpuset of all selected workers (CPU+devices). */
+	/** Cpuset of all selected workers (CPU+devices). */
 	hwloc_cpuset_t selected_cpuset;
 
-	/* Cpuset mask of initially owned cpuset or full if not used. */
+	/** Cpuset mask of initially owned cpuset or full if not used. */
 	hwloc_cpuset_t initially_owned_cpuset_mask;
 
-	/* maximum value among worker ids */
+	/** maximum value among worker ids */
 	int max_worker_id;
 
-	/* worker id to unit id table */
+	/** worker id to unit id table */
 	int *worker_unit_ids;
 
-	/* Temporary contexts accounting. */
+	/** Temporary contexts accounting. */
 	unsigned int max_temporary_ctxs;
 	unsigned int avail_temporary_ctxs;
 	pthread_mutex_t temporary_ctxs_mutex;
 	pthread_cond_t temporary_ctxs_cond;
 
-	/* Global StarPU pause state */
+	/** Global StarPU pause state */
 	int starpu_in_pause;
 
-	/* Event list. */
+	/** Event list. */
 	pthread_t event_thread;
 	pthread_mutex_t event_list_mutex;
 	pthread_cond_t event_list_cond;

+ 6 - 4
tests/Makefile.am

@@ -74,7 +74,7 @@ EXTRA_DIST =					\
 	model-checking/starpu-mc.sh.in
 
 CLEANFILES = 					\
-	*.gcno *.gcda *.linkinfo core starpu_idle_microsec.log *.mod *.png *.output tasks.rec perfs.rec perfs2.rec fortran90/starpu_mod.f90 bandwidth-*.dat
+	*.gcno *.gcda *.linkinfo core starpu_idle_microsec.log *.mod *.png *.output tasks.rec perfs.rec perfs2.rec fortran90/starpu_mod.f90 bandwidth-*.dat bandwidth.gp bandwidth.eps bandwidth.svg
 
 BUILT_SOURCES =
 SUBDIRS =
@@ -174,7 +174,6 @@ myPROGRAMS +=					\
 	helper/execute_on_all			\
 	microbenchs/display_structures_size	\
 	microbenchs/local_pingpong		\
-	microbenchs/bandwidth			\
 	overlap/overlap				\
 	sched_ctx/sched_ctx_list		\
 	sched_ctx/sched_ctx_policy_data		\
@@ -326,6 +325,7 @@ myPROGRAMS +=				\
 	datawizard/partition_dep   		\
 	datawizard/partition_lazy		\
 	datawizard/partition_init		\
+	datawizard/partition_wontuse		\
 	datawizard/gpu_register   		\
 	datawizard/gpu_ptr_register   		\
 	datawizard/variable_parameters		\
@@ -358,6 +358,7 @@ myPROGRAMS +=				\
 	microbenchs/prefetch_data_on_node 	\
 	microbenchs/redundant_buffer		\
 	microbenchs/matrix_as_vector		\
+	microbenchs/bandwidth			\
 	overlap/gpu_concurrency			\
 	parallel_tasks/explicit_combined_worker	\
 	parallel_tasks/parallel_kernels		\
@@ -414,8 +415,7 @@ examplebin_PROGRAMS = \
 	microbenchs/sync_tasks_overhead		\
 	microbenchs/tasks_overhead		\
 	microbenchs/tasks_size_overhead		\
-	microbenchs/local_pingpong		\
-	microbenchs/bandwidth
+	microbenchs/local_pingpong
 examplebin_SCRIPTS = \
 	microbenchs/tasks_data_overhead.sh \
 	microbenchs/sync_tasks_data_overhead.sh \
@@ -424,6 +424,8 @@ examplebin_SCRIPTS = \
 	microbenchs/tasks_size_overhead.sh
 if !STARPU_SIMGRID
 if !STARPU_USE_MPI_MASTER_SLAVE
+examplebin_PROGRAMS += \
+	microbenchs/bandwidth
 SHELL_TESTS += \
 	microbenchs/tasks_data_overhead.sh \
 	microbenchs/sync_tasks_data_overhead.sh \

+ 45 - 0
tests/datawizard/partition_wontuse.c

@@ -0,0 +1,45 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdio.h>
+#include <starpu.h>
+#include "../helper.h"
+
+int main(int argc, char **argv) /* Test: call starpu_data_wont_use() on a vector handle between its partition and unpartition. */
+{
+	int ret = starpu_initialize(NULL, &argc, &argv);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED; /* -ENODEV is reported as a skipped test, not as a failure */
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	struct starpu_data_filter f = /* filter handed to starpu_data_partition() below, asking for two children */
+	{
+	 	.filter_func = starpu_vector_filter_block,
+		.nchildren = 2
+	};
+
+	int v[10]; /* backing storage for the vector; never initialized nor read directly in this function */
+	starpu_data_handle_t array_handle;
+	starpu_vector_data_register(&array_handle, STARPU_MAIN_RAM, (uintptr_t)&v, 10, sizeof(int)); /* 10 elements of sizeof(int) bytes each */
+
+	starpu_data_partition(array_handle, &f);
+	starpu_data_wont_use(array_handle); /* call under test: issued on the parent handle, not on its children */
+	starpu_data_unpartition(array_handle, STARPU_MAIN_RAM);
+
+	starpu_data_unregister(array_handle);
+	starpu_shutdown();
+
+	return 0; /* reached only if none of the calls above aborted the program */
+}

+ 3 - 0
tests/datawizard/temporary_partition.c

@@ -80,6 +80,9 @@ int main(void)
 	/* Invalidate one random piece we don't care coherency about */
 	starpu_data_invalidate_submit(handles[NPARTS/2]);
 
+	/* Try to wontuse the whole thing */
+	starpu_data_wont_use(handle);
+
 	/* Clean */
 	starpu_data_unpartition_submit(handle, NPARTS, handles, -1);
 	starpu_data_partition_clean(handle, NPARTS, handles);

+ 3 - 0
tests/datawizard/temporary_partition_implicit.c

@@ -80,6 +80,9 @@ int main(void)
 	/* Invalidate one random piece we don't care coherency about */
 	starpu_data_invalidate_submit(handles[NPARTS/2]);
 
+	/* Try to wontuse the whole thing */
+	starpu_data_wont_use(handle);
+
 	/* Clean */
 	starpu_data_unpartition_submit(handle, NPARTS, handles, -1);
 	starpu_data_partition_clean(handle, NPARTS, handles);

+ 1 - 0
tests/microbenchs/bandwidth.c

@@ -337,6 +337,7 @@ int main(int argc, char **argv)
 
 	for (n = 0; n < total_ncpus; n++)
 		free(buffers[n]);
+	free(buffers);
 
 	return EXIT_SUCCESS;
 }

+ 2 - 2
tests/microbenchs/bandwidth_scheds.sh

@@ -33,7 +33,7 @@ set output "bandwidth.svg"
 set pointsize 0.3
 EOF
 else
-	fast="-i 3 -c 4"
+	fast="-n 3 -c 4"
 	cat > bandwidth.gp << EOF
 set term postscript eps enhanced color font ",18"
 set output "bandwidth.eps"
@@ -70,6 +70,6 @@ done
 
 if gnuplot bandwidth.gp ; then
 	if [ -n "$STARPU_BENCH_DIR" ]; then
-		cp bandwidth.png $STARPU_BENCH_DIR/
+		cp bandwidth.svg $STARPU_BENCH_DIR/
 	fi
 fi

+ 1 - 0
tools/Makefile.am

@@ -285,6 +285,7 @@ EXTRA_DIST =				\
 	dev/cppcheck/suppressions.txt	\
 	dev/valgrind/bash.suppr		\
 	dev/valgrind/fxt.suppr		\
+	dev/valgrind/glpk.suppr		\
 	dev/valgrind/hdf5.suppr		\
 	dev/valgrind/hwloc.suppr	\
 	dev/valgrind/libc.suppr		\

+ 23 - 0
tools/dev/valgrind/glpk.suppr

@@ -0,0 +1,23 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+{
+   glpk_glp_init_env_reachable_leak
+   Memcheck:Leak
+   match-leak-kinds: reachable
+   ...
+   fun:glp_init_env
+}