Browse Source

Merged from branch 'trunk'

Luka Stanisic 9 years ago
parent
commit
5d51d7ff57
40 changed files with 993 additions and 459 deletions
  1. 1 0
      ChangeLog
  2. 291 168
      configure.ac
  3. 38 11
      doc/doxygen/chapters/320_scheduling.doxy
  4. 5 1
      doc/doxygen/chapters/380_offline_performance_tools.doxy
  5. 7 0
      doc/doxygen/chapters/501_environment_variables.doxy
  6. 2 1
      doc/doxygen/chapters/510_configure_options.doxy
  7. 3 0
      examples/native_fortran/nf_vector.f90
  8. 9 9
      examples/scheduler/heteroprio_test.c
  9. 1 1
      examples/spmv/matrix_market/mmio.c
  10. 55 0
      include/fstarpu_mod.f90
  11. 3 0
      include/starpu_config.h.in
  12. 1 1
      include/starpu_sched_ctx.h
  13. 10 0
      include/starpu_thread.h
  14. 2 0
      mpi/src/starpu_mpi.c
  15. 5 0
      src/common/fxt.c
  16. 4 0
      src/common/fxt.h
  17. 209 122
      src/common/graph.c
  18. 56 2
      src/common/graph.h
  19. 109 0
      src/common/list.h
  20. 29 29
      src/core/jobs.c
  21. 8 90
      src/core/jobs.h
  22. 2 2
      src/core/perfmodel/perfmodel_history.c
  23. 1 1
      src/core/sched_ctx.c
  24. 1 1
      src/core/sched_ctx.h
  25. 1 0
      src/core/task.c
  26. 3 7
      src/core/topology.c
  27. 1 0
      src/drivers/cpu/driver_cpu.c
  28. 5 0
      src/drivers/cuda/driver_cuda.c
  29. 2 3
      src/drivers/mic/driver_mic_common.h
  30. 6 0
      src/drivers/mic/driver_mic_source.c
  31. 1 0
      src/drivers/opencl/driver_opencl.c
  32. 2 0
      src/drivers/opencl/driver_opencl_utils.c
  33. 7 1
      src/drivers/scc/driver_scc_source.c
  34. 26 2
      src/sched_policies/deque_modeling_policy_data_aware.c
  35. 11 5
      src/sched_policies/graph_test_policy.c
  36. 2 2
      src/sched_policies/heteroprio.c
  37. 2 0
      src/top/starpu_top_connection.c
  38. 64 0
      src/util/fstarpu.c
  39. 5 0
      tests/microbenchs/tasks_size_overhead.c
  40. 3 0
      tools/starpu_tasks_rec_complete.c

+ 1 - 0
ChangeLog

@@ -39,6 +39,7 @@ New features:
   * Add starpu_fxt_trace_user_event_string.
   * Add starpu_tasks_rec_complete tool to add estimation times in tasks.rec
     files.
+  * Add STARPU_FXT_TRACE environment variable.
 
 StarPU 1.2.0 (svn revision 18521)
 ==============================================

+ 291 - 168
configure.ac

@@ -258,6 +258,11 @@ fi
 # yes, that's non portable, but it's still better than sched_setaffinity
 AC_CHECK_FUNCS(pthread_setaffinity_np)
 
+AC_CHECK_FUNC([pthread_setname_np], have_pthread_setname_np=yes, have_pthread_setname_np=no)
+if test x$have_pthread_setname_np = xyes; then
+	AC_DEFINE(STARPU_HAVE_PTHREAD_SETNAME_NP,[1],[pthread_setname_np is available])
+fi
+
 # There is no posix_memalign on Mac OS X, only memalign
 AC_CHECK_FUNCS([posix_memalign], [AC_DEFINE([STARPU_HAVE_POSIX_MEMALIGN], [1], [Define to 1 if you have the `posix_memalign' function.])])
 AC_CHECK_FUNCS([memalign], [AC_DEFINE([STARPU_HAVE_MEMALIGN], [1], [Define to 1 if you have the `memalign' function.])])
@@ -335,6 +340,7 @@ fi
 AC_CHECK_FUNC([sched_yield], [AC_DEFINE([STARPU_HAVE_SCHED_YIELD], [1], [Define to 1 if the function sched_yield is available.])])
 
 AC_CHECK_HEADERS([aio.h])
+AC_CHECK_LIB([rt], [aio_read])
 
 AC_CHECK_FUNCS([mkostemp])
 
@@ -1474,42 +1480,6 @@ AC_MSG_RESULT($enable_rcce)
 
 ###############################################################################
 #                                                                             #
-#                             MP Common settings                              #
-#                                                                             #
-###############################################################################
-
-AM_CONDITIONAL([STARPU_USE_MP], [test "x$enable_mic" = "xyes" -o "x$enable_mpi" = "xyes" -o "x$enable_rcce" = "xyes"])
-
-AC_ARG_ENABLE([export-dynamic], [AS_HELP_STRING([--disable-export-dynamic],
-			  [Prevent the linker from adding all symbols to the dynamic symbol table])], [], [])
-
-if test x$enable_mic = xyes -o x$enable_mpi = xyes -o x$enable_rcce = xyes ; then
-	AC_DEFINE(STARPU_USE_MP, [1], [Message-passing SINKs support
-		  is enabled])
-
-	if test x$enable_export_dynamic != xno ; then
-		STARPU_EXPORT_DYNAMIC="-rdynamic"
-	fi
-fi
-
-AC_SUBST(STARPU_EXPORT_DYNAMIC)
-
-# Computes the maximum number of different kernels a message-passing sink
-# can lookup for and launch.
-AC_MSG_CHECKING(Maximum number of message-passing kernels)
-AC_ARG_ENABLE(maxmpkernels, [AS_HELP_STRING([
-	      -enable-maxmpkernels=<number>],
-	      [maximum number of kernels a message-passing sink can lookup
-	      for and execute])],
-	      maxmpkernels=$enableval, maxmpkernels=10)
-AC_MSG_RESULT($maxmpkernels)
-AC_DEFINE_UNQUOTED(STARPU_MAXMPKERNELS, [$maxmpkernels],
-		[maximum number of message-passing kernels])
-
-###############################################################################
-
-###############################################################################
-#                                                                             #
 #                   Debug and Performance analysis tools                      #
 #                                                                             #
 ###############################################################################
@@ -1774,6 +1744,238 @@ AM_CONDITIONAL([STARPU_USE_AYUDAME2], [test "x$enable_ayudame2" = "xyes"])
 
 ###############################################################################
 #                                                                             #
+#                                    MPI                                      #
+#                                                                             #
+###############################################################################
+
+AC_ARG_ENABLE(mpi, [AS_HELP_STRING([--disable-mpi],
+                              [Disable StarPU MPI library generation])],
+            [enable_mpi=$enableval],
+            [enable_mpi=yes])
+
+AC_ARG_ENABLE(mpi-master-slave, [AS_HELP_STRING([--enable-mpi-master-slave],
+                              [Enable StarPU to run with the master-slave mode])],
+            use_mpi_master_slave=$enableval,
+            use_mpi_master_slave=no)
+
+#Check MPICC
+AC_ARG_WITH(mpicc, [AS_HELP_STRING([--with-mpicc[=<path to mpicc>]],
+           [Path of the mpicc compiler])],
+   [
+       if test x$withval = xyes; then
+           AC_MSG_ERROR(--with-mpicc must be given a pathname)
+       else
+           mpicc_path=$withval
+       fi
+   ],
+   [
+       if test x$enable_simgrid = xyes ; then
+           DEFAULT_MPICC=smpicc
+       else
+           DEFAULT_MPICC=mpicc
+       fi
+       # nothing was specified: default value is used
+       AC_PATH_PROG(mpicc_path, $DEFAULT_MPICC, [no], [$simgrid_dir/bin:$PATH])
+   ])
+
+# We test if the MPICC compiler exists
+if test ! -x $mpicc_path; then
+    #MPICC does not exist or is not executable
+    AC_MSG_RESULT(The mpicc compiler '$mpicc_path' does not have the execute permission)
+    use_mpi=no
+else
+    use_mpi=yes
+    if test x$enable_simgrid = xyes ; then
+        AC_ARG_WITH(smpirun, [AS_HELP_STRING([--with-smpirun[=<path to smpirun>]],
+                    [Path of the smpirun helper])],
+            [
+                if test x$withval = xyes; then
+                    AC_MSG_ERROR(--with-smpirun must be given a pathname)
+                else
+                    smpirun_path=$withval
+                fi
+            ],
+            [
+                # nothing was specified: default value is used
+                AC_PATH_PROG(smpirun_path, smpirun, [no], [$simgrid_dir/bin:$PATH])
+            ])
+
+    fi
+fi
+
+AC_MSG_CHECKING(mpicc path)
+AC_MSG_RESULT($mpicc_path)
+AC_SUBST(MPICC, $mpicc_path)
+
+
+#Check MPICXX/MPIC++
+AC_ARG_WITH(mpicxx, [AS_HELP_STRING([--with-mpicxx[=<path to mpicxx>]],
+           [Path of the mpicxx/mpic++ compiler])],
+   [
+       if test x$withval = xyes; then
+           AC_MSG_ERROR(--with-mpicxx must be given a pathname)
+       else
+           mpicxx_path=$withval
+       fi
+   ],
+   [
+       if test x$enable_simgrid = xyes ; then
+           DEFAULT_MPICXX=smpicxx
+       else
+           DEFAULT_MPICXX=mpicxx
+       fi
+       # nothing was specified: default value is used
+       AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
+       
+       # try with mpic++ if mpicxx was not found
+       if test x$mpicxx_path = xno ; then
+            DEFAULT_MPICXX=mpic++
+            AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
+       fi
+   ])
+
+# We test if the MPICXX/MPIC++ compiler exists
+if test ! -x $mpicxx_path; then
+    #MPICXX/MPIC++ does not exist or is not executable
+    AC_MSG_RESULT(The mpicxx compiler '$mpicxx_path' does not have the execute permission)
+    use_mpicxx=no
+else
+    use_mpicxx=yes
+fi
+
+AC_MSG_CHECKING(mpicxx/mpic++ path)
+AC_MSG_RESULT($mpicxx_path)
+AC_SUBST(MPICXX, $mpicxx_path)
+
+
+if test x$use_mpi = xyes -a x$enable_mpi = xyes; then
+    cc_or_mpicc=$mpicc_path
+        # For some reason, libtool uses gcc instead of mpicc when linking
+        # libstarpumpi.
+        # On Darwin (and maybe other systems ?) the linker will fail (undefined
+        # references to MPI_*). We manually add the required flags to fix this
+        # issue.
+        AC_SUBST(MPICC_LDFLAGS, `$mpicc_path --showme:link`)
+else
+    cc_or_mpicc=$CC
+fi
+AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
+
+# If the user specifically asks for it, or if we are in a developer checkout, we enable mpi check
+AC_ARG_ENABLE(mpi-check, AC_HELP_STRING([--enable-mpi-check], [Enable execution of MPI testcases]))
+running_mpi_check=no
+if test $svndir = 1 -o -d "$srcdir/.git" ; then
+    running_mpi_check=yes
+fi
+if test x$enable_mpi_check = xyes ; then
+    running_mpi_check=yes
+fi
+if test x$enable_mpi_check = xno ; then
+    running_mpi_check=no
+fi
+
+
+# Check if mpiexec is available
+AC_ARG_WITH(mpiexec, [AS_HELP_STRING([--with-mpiexec[=<path to mpiexec>]],
+            [Path of mpiexec])],
+    [
+        if test x$withval = xyes; then
+            AC_MSG_ERROR(--with-mpiexec must be given a pathname)
+        else
+            mpiexec_path=$withval
+        fi
+    ],
+    [
+        # nothing was specified: look in the path
+        AC_PATH_PROG(mpiexec_path, mpiexec, [no], [$(dirname $mpicc_path):$PATH])
+    ])
+
+AC_MSG_CHECKING(whether mpiexec is available)
+AC_MSG_RESULT($mpiexec_path)
+
+# We test if MPIEXEC exists
+if test ! -x $mpiexec_path; then
+    #MPIEXEC does not exist or is not executable
+    AC_MSG_RESULT(The mpiexec script is not valid)
+        running_mpi_check=no
+        mpiexec_path=""
+fi
+
+AM_CONDITIONAL(STARPU_MPI_CHECK, test x$running_mpi_check = xyes)
+if test x$use_mpi = xyes ; then
+    AC_MSG_CHECKING(whether MPI tests should be run)
+    AC_MSG_RESULT($running_mpi_check)
+    AC_SUBST(MPIEXEC,$mpiexec_path)
+fi
+
+#We can only build StarPU MPI Library if User wants it and MPI is available
+if test x$use_mpi = xyes -a x$enable_mpi = xyes ; then
+    build_mpi_lib=yes
+else
+    build_mpi_lib=no
+fi
+
+AC_MSG_CHECKING(whether the StarPU MPI library should be generated)
+AC_MSG_RESULT($build_mpi_lib)
+
+AC_SUBST(USE_MPI, $build_mpi_lib)
+AM_CONDITIONAL(USE_MPI, test x$build_mpi_lib = xyes)
+if test x$build_mpi_lib = xyes; then
+	AC_DEFINE(STARPU_USE_MPI,[1],[whether the StarPU MPI library is available])
+else
+	running_mpi_check=no
+fi
+
+AC_ARG_WITH(mpiexec-args, [AS_HELP_STRING([--with-mpiexec-args[=<arguments to give when running mpiexec>]],
+			[Arguments for mpiexec])],
+	[
+		mpiexec_args=$withval
+	])
+AC_SUBST(MPIEXEC_ARGS,$mpiexec_args)
+
+AC_ARG_ENABLE(mpi-progression-hook, [AS_HELP_STRING([--enable-mpi-progression-hook],
+				   [Enable StarPU MPI activity polling method])],
+				   enable_mpi_progression_hook=$enableval, enable_mpi_progression_hook=no)
+if  test x$enable_mpi_progression_hook = xyes; then
+	AC_DEFINE(STARPU_MPI_ACTIVITY, [1], [enable StarPU MPI activity polling method])
+fi
+
+#We can only build MPI Master Slave if User wants it and MPI is available
+if test x$use_mpi_master_slave = xyes -a x$use_mpi = xyes -a x$use_mpicxx = xyes; then
+    build_mpi_master_slave=yes
+else
+    build_mpi_master_slave=no
+fi
+
+if test x$build_mpi_master_slave = xyes; then
+    AC_DEFINE(STARPU_USE_MPI_MASTER_SLAVE, [1], [MPI Master Slave support is enabled])
+    # Use the MPI C/C++ compilers for both compiling and linking
+    CC=$mpicc_path
+    CCLD=$mpicc_path
+    CXX=$mpicxx_path
+    CXXLD=$mpicxx_path
+fi
+
+AC_MSG_CHECKING(whether the master-slave mode should be enabled)
+AC_MSG_RESULT($build_mpi_master_slave)
+AM_CONDITIONAL([STARPU_USE_MPI_MASTER_SLAVE], [test x$build_mpi_master_slave = xyes])
+
+AC_MSG_CHECKING(maximum number of MPI master-slave devices)
+AC_ARG_ENABLE(maxmpidev, [AS_HELP_STRING([--enable-maxmpidev=<number>],
+			[maximum number of MPI master-slave devices])],
+			nmaxmpidev=$enableval,
+            [
+             if test x$build_mpi_master_slave = xyes; then
+                 nmaxmpidev=4
+             else
+                 nmaxmpidev=0
+             fi
+            ])
+AC_MSG_RESULT($nmaxmpidev)
+AC_DEFINE_UNQUOTED(STARPU_MAXMPIDEVS, [$nmaxmpidev], [maximum number of MPI devices])
+
+
+###############################################################################
+#                                                                             #
 #                  Miscellaneous options for StarPU                           #
 #                                                                             #
 ###############################################################################
@@ -1818,6 +2020,9 @@ if test x$maxnodes = x0 ; then
 			nodes=`expr $nodes + 1`
 		fi
 
+        #nmaxmpidev = 0 if mpi master-slave is disabled
+        nodes=`expr $nodes + $nmaxmpidev`
+
 		# set maxnodes to the next power of 2 greater than nodes
 		maxnodes=1
 		while test "$maxnodes" -lt "$nodes"
@@ -1890,7 +2095,8 @@ if test x$enable_simgrid != xyes; then
 		nmaxsccdev=0
 	fi
 fi
-nmaxworkers=`expr 16 \* \( \( $maxcpus + $nmaxcudadev + $nmaxopencldev + $nmaxmicthreads + $nmaxsccdev + 15 \) / 16 \) `
+#We suppose Master adds nmaxmpidev workers but slaves don't.
+nmaxworkers=`expr 16 \* \( \( $maxcpus + $nmaxcudadev + $nmaxopencldev + $nmaxmicthreads + $nmaxmpidev + $nmaxsccdev + 15 \) / 16 \) `
 AC_MSG_CHECKING(Maximum number of workers)
 AC_MSG_RESULT($nmaxworkers)
 AC_DEFINE_UNQUOTED(STARPU_NMAXWORKERS, [$nmaxworkers], [Maximum number of workers])
@@ -1936,139 +2142,40 @@ AC_DEFINE_UNQUOTED(STARPU_HISTORYMAXERROR, [$calibration_heuristic], [calibratio
 
 ###############################################################################
 #                                                                             #
-#                                    MPI                                      #
+#                             MP Common settings                              #
 #                                                                             #
 ###############################################################################
 
-AC_ARG_WITH(mpicc, [AS_HELP_STRING([--with-mpicc[=<path to mpicc>]],
-			[Path of the mpicc compiler])],
-	[
-		if test x$withval = xyes; then
-			AC_MSG_ERROR(--with-mpicc must be given a pathname)
-		else
-			mpicc_path=$withval
-		fi
-	],
-	[
-		if test x$enable_simgrid = xyes ; then
-			DEFAULT_MPICC=smpicc
-		else
-			DEFAULT_MPICC=mpicc
-		fi
-		# nothing was specified: default value is used
-		AC_PATH_PROG(mpicc_path, $DEFAULT_MPICC, [no], [$simgrid_dir/bin:$PATH])
-	])
+AM_CONDITIONAL([STARPU_USE_MP], [test "x$enable_mic" = "xyes" -o "x$build_mpi_master_slave" = "xyes" -o "x$enable_rcce" = "xyes"])
 
-# We test if the MPICC compiler exists
-if test ! -x $mpicc_path; then
-	#MPICC does not exists or is not executable
-	AC_MSG_RESULT(The mpicc compiler '$mpicc_path' does not have the execute permission)
-	use_mpi=no
-else
-	use_mpi=yes
-	if test x$enable_simgrid = xyes ; then
-		AC_ARG_WITH(smpirun, [AS_HELP_STRING([--with-smpirun[=<path to smpirun>]],
-					[Path of the smpirun helper])],
-			[
-				if test x$withval = xyes; then
-					AC_MSG_ERROR(--with-smpirun must be given a pathname)
-				else
-					smpirun_path=$withval
-				fi
-			],
-			[
-				# nothing was specified: default value is used
-				AC_PATH_PROG(smpirun_path, smpirun, [no], [$simgrid_dir/bin:$PATH])
-			])
-
-	fi
-fi
-
-AC_MSG_CHECKING(mpicc path)
-AC_MSG_RESULT($mpicc_path)
-AC_SUBST(MPICC, $mpicc_path)
-if test x$use_mpi = xyes; then
-	cc_or_mpicc=$mpicc_path
-        # For some reason, libtool uses gcc instead of mpicc when linking
-        # libstarpumpi.
-        # On Darwin (and maybe other systems ?) the linker will fail (undefined
-        # references to MPI_*). We manually add the required flags to fix this
-        # issue.
-        AC_SUBST(MPICC_LDFLAGS, `$mpicc_path --showme:link`)
-else
-	cc_or_mpicc=$CC
-fi
-AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
-
-# If the user specifically asks for it, or if we are in a developer checkout, we enable mpi check
-AC_ARG_ENABLE(mpi-check, AC_HELP_STRING([--enable-mpi-check], [Enable execution of MPI testcases]))
-running_mpi_check=no
-if test $svndir = 1 -o -d "$srcdir/.git" ; then
-    running_mpi_check=yes
-fi
-if test x$enable_mpi_check = xyes ; then
-    running_mpi_check=yes
-fi
-if test x$enable_mpi_check = xno ; then
-    running_mpi_check=no
-fi
-
-# Check if mpiexec is available
-AC_ARG_WITH(mpiexec, [AS_HELP_STRING([--with-mpiexec[=<path to mpiexec>]],
-			[Path of mpiexec])],
-	[
-		if test x$withval = xyes; then
-			AC_MSG_ERROR(--with-mpiexec must be given a pathname)
-		else
-			mpiexec_path=$withval
-		fi
-	],
-	[
-		# nothing was specified: look in the path
-		AC_PATH_PROG(mpiexec_path, mpiexec, [no], [$(dirname $mpicc_path):$PATH])
-	])
+AC_ARG_ENABLE([export-dynamic], [AS_HELP_STRING([--disable-export-dynamic],
+			  [Prevent the linker from adding all symbols to the dynamic symbol table])], [], [])
 
-AC_MSG_CHECKING(whether mpiexec is available)
-AC_MSG_RESULT($mpiexec_path)
+if test x$enable_mic = xyes -o x$build_mpi_master_slave = xyes -o x$enable_rcce = xyes ; then
+	AC_DEFINE(STARPU_USE_MP, [1], [Message-passing SINKs support
+		  is enabled])
 
-# We test if MPIEXEC exists
-if test ! -x $mpiexec_path; then
-	#MPIEXEC does not exists or is not executable
-	AC_MSG_RESULT(The mpiexec script is not valid)
-        running_mpi_check=no
-        mpiexec_path=""
+	if test x$enable_export_dynamic != xno ; then
+		STARPU_EXPORT_DYNAMIC="-rdynamic"
+	fi
 fi
 
-AM_CONDITIONAL(STARPU_MPI_CHECK, test x$running_mpi_check = xyes)
-if test x$use_mpi = xyes; then
-        AC_MSG_CHECKING(whether MPI tests should be run)
-        AC_MSG_RESULT($running_mpi_check)
-	AC_SUBST(MPIEXEC,$mpiexec_path)
-fi
+AC_SUBST(STARPU_EXPORT_DYNAMIC)
 
-AC_MSG_CHECKING(whether the StarPU MPI library should be generated)
-AC_MSG_RESULT($use_mpi)
-AC_SUBST(USE_MPI, $use_mpi)
-AM_CONDITIONAL(USE_MPI, test x$use_mpi = xyes)
-if test x$use_mpi = xyes; then
-	AC_DEFINE(STARPU_USE_MPI,[1],[whether the StarPU MPI library is available])
-else
-	running_mpi_check=no
-fi
+# Computes the maximum number of different kernels a message-passing sink
+# can look up and launch.
+AC_MSG_CHECKING(Maximum number of message-passing kernels)
+AC_ARG_ENABLE(maxmpkernels, [AS_HELP_STRING([
+	      --enable-maxmpkernels=<number>],
+	      [maximum number of kernels a message-passing sink can lookup
+	      for and execute])],
+	      maxmpkernels=$enableval, maxmpkernels=10)
+AC_MSG_RESULT($maxmpkernels)
+AC_DEFINE_UNQUOTED(STARPU_MAXMPKERNELS, [$maxmpkernels],
+		[maximum number of message-passing kernels])
 
-AC_ARG_WITH(mpiexec-args, [AS_HELP_STRING([--with-mpiexec-args[=<arguments to give when running mpiexec>]],
-			[Arguments for mpiexec])],
-	[
-		mpiexec_args=$withval
-	])
-AC_SUBST(MPIEXEC_ARGS,$mpiexec_args)
+###############################################################################
 
-AC_ARG_ENABLE(mpi-progression-hook, [AS_HELP_STRING([--enable-mpi-progression-hook],
-				   [Enable StarPU MPI activity polling method])],
-				   enable_mpi_progression_hook=$enableval, enable_mpi_progression_hook=no)
-if  test x$enable_mpi_progression_hook = xyes; then
-	AC_DEFINE(STARPU_MPI_ACTIVITY, [1], [enable StarPU MPI activity polling method])
-fi
 
 ###############################################################################
 #                                                                             #
@@ -2233,7 +2340,7 @@ if test "x$FC" != "x"; then
 	fi
 	if test "x$enable_build_fortran" = "xyes" ; then
 		AC_DEFINE(STARPU_HAVE_FC, [1], [Define this if a Fortran compiler is available])
-		if test x$use_mpi = xyes; then
+		if test x$build_mpi_lib = xyes -o x$build_mpi_master_slave = xyes ; then
 			AC_ARG_WITH(mpifort, [AS_HELP_STRING([--with-mpifort[=<path to mpifort>]],
 				    [Path of the mpifort compiler])],
 				    [
@@ -2286,6 +2393,20 @@ if test "x$enable_build_fortran" = "xyes" ; then
    fi
 fi
 
+#We have MPI C/C++ compiler
+if test x$build_mpi_master_slave = xyes; then
+    #Check if we can compile fortran cases
+    if test x$use_mpi_fort = xyes ; then
+        F77LD=$mpifort_path    
+        FCLD=$mpifort_path
+        F77=$mpifort_path    
+        FC=$mpifort_path
+    else
+        enable_build_fortran=no
+    fi
+fi
+
+
 AM_CONDITIONAL([STARPU_HAVE_FC], [test "x$FC" != "x" -a "x$enable_build_fortran" = "xyes"])
 AM_CONDITIONAL([STARPU_HAVE_F77], [test "x$F77" != "x" -a "x$enable_build_fortran" = "xyes"])
 AM_CONDITIONAL([STARPU_HAVE_MPIFORT], [test "x$use_mpi_fort" = "xyes"])
@@ -2991,13 +3112,14 @@ AC_MSG_NOTICE([
         (Note these numbers do not represent the number of detected
 	devices, but the maximum number of devices StarPU can manage)
 
-	Maximum number of CPUs:           $maxcpus
-	Maximum number of CUDA devices:   $nmaxcudadev
-	Maximum number of OpenCL devices: $nmaxopencldev
-	Maximum number of SCC devices:    $nmaxsccdev
-	Maximum number of MIC threads:    $nmaxmicthreads
-	Maximum number of memory nodes:   $maxnodes
-	Maximum number of task buffers:   $nmaxbuffers
+	Maximum number of CPUs:                     $maxcpus
+	Maximum number of CUDA devices:             $nmaxcudadev
+	Maximum number of OpenCL devices:           $nmaxopencldev
+	Maximum number of SCC devices:              $nmaxsccdev
+	Maximum number of MIC threads:              $nmaxmicthreads
+	Maximum number of MPI master-slave devices: $nmaxmpidev
+	Maximum number of memory nodes:             $maxnodes
+	Maximum number of task buffers:             $nmaxbuffers
 
 	GPU-GPU transfers: $have_cuda_memcpy_peer
 	Allocation cache:  $enable_allocation_cache
@@ -3012,8 +3134,9 @@ AC_MSG_NOTICE([
         Examples:          $enable_build_examples
 
 	StarPU Extensions:
-	       MPI enabled:                                 $use_mpi
+	       StarPU MPI enabled:                          $build_mpi_lib
 	       MPI test suite:                              $running_mpi_check
+	       Master-Slave MPI enabled:                    $use_mpi_master_slave
 	       FFT Support:                                 $fft_support
 	       GCC plug-in:                                 $build_gcc_plugin
 	       GCC plug-in test suite (requires GNU Guile): $run_gcc_plugin_test_suite

+ 38 - 11
doc/doxygen/chapters/320_scheduling.doxy

@@ -8,9 +8,9 @@
 
 /*! \page Scheduling Scheduling
 
-\section TaskSchedulingPolicy Task Scheduling Policy
+\section TaskSchedulingPolicy Task Scheduling Policies
 
-The basics of the scheduling policy are that
+The basics of the scheduling policy are that:
 
 <ul>
 <li>The scheduler gets to schedule tasks (<c>push</c> operation) when they become
@@ -24,19 +24,19 @@ store them between the time when they become available, and the time when a
 worker gets to grab them.
 
 By default, StarPU uses the simple greedy scheduler <c>eager</c>. This is
-because it provides correct load balance even if the application codelets do not
-have performance models. If your application codelets have performance models
-(\ref PerformanceModelExample), you should change the scheduler thanks
-to the environment variable \ref STARPU_SCHED. For instance <c>export
-STARPU_SCHED=dmda</c> . Use <c>help</c> to get the list of available schedulers.
+because it provides correct load balance even if the application codelets do
+not have performance models. Other non-modelling scheduling policies can be
+selected among the list below, thanks to the environment variable \ref
+STARPU_SCHED. For instance <c>export STARPU_SCHED=dmda</c> . Use <c>help</c> to
+get the list of available schedulers.
+
+
+<b>Non Performance Modelling Policies:</b>
 
 The <b>eager</b> scheduler uses a central task queue, from which all workers draw tasks
 to work on concurrently. This however does not permit to prefetch data since the scheduling
 decision is taken late. If a task has a non-0 priority, it is put at the front of the queue.
 
-The <b>prio</b> scheduler also uses a central task queue, but sorts tasks by
-priority (between -5 and 5).
-
 The <b>random</b> scheduler uses a queue per worker, and distributes tasks randomly according to assumed worker
 overall performance.
 
@@ -50,7 +50,34 @@ a task on the worker which released it by
 default. When a worker becomes idle, it steals a task from neighbour workers. It
 also takes into account priorities.
 
-The <b>dm</b> (deque model) scheduler uses task execution performance models into account to
+The <b>prio</b> scheduler also uses a central task queue, but sorts tasks by
+priority specified by the programmer (between -5 and 5).
+
+\section DMTaskSchedulingPolicy Performance Model-Based Task Scheduling Policies
+
+If (<b>and only if</b>) your application <b>codelets have performance models</b> (\ref
+PerformanceModelExample), you should change the scheduler thanks to the
+environment variable \ref STARPU_SCHED, to select one of the policies below, in
+order to take advantage of StarPU's performance modelling. For instance
+<c>export STARPU_SCHED=dmda</c> . Use <c>help</c> to get the list of available
+schedulers.
+
+<b>Note:</b> Depending on the performance model type chosen, some preliminary
+calibration runs may be needed for the model to converge. If the calibration
+has not been done, or is insufficient yet, or if no performance model is
+specified for a codelet, every task built from this codelet will be scheduled
+using an <b>eager</b> fallback policy.
+
+<b>Troubleshooting:</b> Configuring and recompiling StarPU using the
+<c>--enable-verbose</c> configure flag displays some statistics at the end of
+execution about the percentage of tasks that have been scheduled by a DM*
+family policy using performance model hints. A low or zero percentage may be
+the sign that performance models are not converging or that codelets do not
+have performance models enabled.
+
+<b>Performance Modelling Policies:</b>
+
+The <b>dm</b> (deque model) scheduler takes task execution performance models into account to
 perform a HEFT-similar scheduling strategy: it schedules tasks where their
 termination time will be minimal. The difference with HEFT is that <b>dm</b>
 schedules tasks as soon as they become available, and thus in the order they

+ 5 - 1
doc/doxygen/chapters/380_offline_performance_tools.doxy

@@ -92,7 +92,11 @@ the environment variable \ref STARPU_FXT_PREFIX.
 
 The additional configure option \ref enable-fxt-lock "--enable-fxt-lock" can
 be used to generate trace events which describes the locks behaviour during
-the execution.
+the execution. It is however very heavy and should not be used unless debugging
+StarPU's internal locking.
+
+The environment variable \ref STARPU_FXT_TRACE can be set to 0 to disable the
+generation of the <c>prof_file_XXX_YYY</c> file.
 
 \subsection CreatingAGanttDiagram Creating a Gantt Diagram
 

+ 7 - 0
doc/doxygen/chapters/501_environment_variables.doxy

@@ -637,6 +637,13 @@ This variable specifies in which file the debugging output should be saved to.
 This variable specifies in which directory to save the trace generated if FxT is enabled. It needs to have a trailing '/' character.
 </dd>
 
+<dt>STARPU_FXT_TRACE</dt>
+<dd>
+\anchor STARPU_FXT_TRACE
+\addindex __env__STARPU_FXT_TRACE
+This variable specifies whether to generate (1) or not (0) the FxT trace in /tmp/prof_file_XXX_YYY. The default is 1 (generate it).
+</dd>
+
 <dt>STARPU_LIMIT_CUDA_devid_MEM</dt>
 <dd>
 \anchor STARPU_LIMIT_CUDA_devid_MEM

+ 2 - 1
doc/doxygen/chapters/510_configure_options.doxy

@@ -357,7 +357,8 @@ Use the compiler <c>mpicc</c> at <c>path</c>, for StarPU-MPI.
 <dd>
 \anchor enable-mpi-progression-hook
 \addindex __configure__--enable-mpi-progression-hook
-Enable the activity polling method for StarPU-MPI.
+Enable the activity polling method for StarPU-MPI. This is however experimental;
+do not enable it unless you know what you are doing.
 </dd>
 
 <dt>--with-coi-dir</dt>

+ 3 - 0
examples/native_fortran/nf_vector.f90

@@ -57,6 +57,9 @@ program nf_vector
         ! add a CPU implementation function to the codelet
         call fstarpu_codelet_add_cpu_func(cl_vec, C_FUNLOC(cl_cpu_func_vec))
 
+        ! optionally set 'where' field to CPU only
+        call fstarpu_codelet_set_where(cl_vec, FSTARPU_CPU)
+
         ! add a Read-only mode data buffer to the codelet
         call fstarpu_codelet_add_buffer(cl_vec, FSTARPU_R)
 

+ 9 - 9
examples/scheduler/heteroprio_test.c

@@ -25,7 +25,7 @@
 
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
 
-void initSchedulerCallback()
+void initSchedulerCallback(unsigned sched_ctx)
 {
 	// CPU uses 3 buckets
 #ifdef STARPU_USE_CPU
@@ -36,28 +36,28 @@ void initSchedulerCallback()
 		unsigned idx;
 		for(idx = 0; idx < 3; ++idx)
 		{
-			starpu_heteroprio_set_mapping(0, STARPU_CPU_IDX, idx, idx);
-			starpu_heteroprio_set_faster_arch(0, STARPU_CPU_IDX, idx);
+			starpu_heteroprio_set_mapping(sched_ctx, STARPU_CPU_IDX, idx, idx);
+			starpu_heteroprio_set_faster_arch(sched_ctx, STARPU_CPU_IDX, idx);
 		}
 	}
 #endif
 #ifdef STARPU_USE_OPENCL
 	// OpenCL is enabled and uses 2 buckets
-	starpu_heteroprio_set_nb_prios(0, STARPU_OPENCL_IDX, 2);
+	starpu_heteroprio_set_nb_prios(sched_ctx, STARPU_OPENCL_IDX, 2);
 	// OpenCL will first look to priority 2
 	int prio2 = starpu_cpu_worker_get_count() ? 2 : 1;
-	starpu_heteroprio_set_mapping(0, STARPU_OPENCL_IDX, 0, prio2);
+	starpu_heteroprio_set_mapping(sched_ctx, STARPU_OPENCL_IDX, 0, prio2);
 	// For this bucket OpenCL is the fastest
-	starpu_heteroprio_set_faster_arch(0, STARPU_OPENCL_IDX, prio2);
+	starpu_heteroprio_set_faster_arch(sched_ctx, STARPU_OPENCL_IDX, prio2);
 	// And CPU is 4 times slower
 #ifdef STARPU_USE_CPU
-	starpu_heteroprio_set_arch_slow_factor(0, STARPU_CPU_IDX, 2, 4.0f);
+	starpu_heteroprio_set_arch_slow_factor(sched_ctx, STARPU_CPU_IDX, 2, 4.0f);
 #endif
 
 	int prio1 = starpu_cpu_worker_get_count() ? 1 : 0;
-	starpu_heteroprio_set_mapping(0, STARPU_OPENCL_IDX, 1, prio1);
+	starpu_heteroprio_set_mapping(sched_ctx, STARPU_OPENCL_IDX, 1, prio1);
 	// We let the CPU as the fastest and tell that OpenCL is 1.7 times slower
-	starpu_heteroprio_set_arch_slow_factor(0, STARPU_OPENCL_IDX, prio1, 1.7f);
+	starpu_heteroprio_set_arch_slow_factor(sched_ctx, STARPU_OPENCL_IDX, prio1, 1.7f);
 #endif
 }
 

+ 1 - 1
examples/spmv/matrix_market/mmio.c

@@ -116,7 +116,7 @@ int mm_read_banner(FILE *f, MM_typecode *matcode)
 	if (fgets(line, MM_MAX_LINE_LENGTH, f) == NULL)
 		return MM_PREMATURE_EOF;
 
-	if (sscanf(line, "%MM_MAX_TOKEN_LENGTHs %MM_MAX_TOKEN_LENGTHs %MM_MAX_TOKEN_LENGTHs %MM_MAX_TOKEN_LENGTHs %MM_MAX_TOKEN_LENGTHs", banner, mtx, crd, data_type, storage_scheme) != 5)
+	if (sscanf(line, "%"MM_MAX_TOKEN_LENGTH_S"s %"MM_MAX_TOKEN_LENGTH_S"s %"MM_MAX_TOKEN_LENGTH_S"s %"MM_MAX_TOKEN_LENGTH_S"s %"MM_MAX_TOKEN_LENGTH_S"s", banner, mtx, crd, data_type, storage_scheme) != 5)
 		return MM_PREMATURE_EOF;
 
 	for (p=mtx; *p!='\0'; *p=tolower(*p),p++);  /* convert to lower case */

+ 55 - 0
include/fstarpu_mod.f90

@@ -74,6 +74,17 @@ module fstarpu_mod
         type(c_ptr), bind(C) :: FSTARPU_SCHED_CTX_POLICY_INIT
         type(c_ptr), bind(C) :: FSTARPU_SCHED_CTX_USER_DATA
 
+        type(c_ptr), bind(C) :: FSTARPU_NOWHERE
+        type(c_ptr), bind(C) :: FSTARPU_CPU
+        type(c_ptr), bind(C) :: FSTARPU_CUDA
+        type(c_ptr), bind(C) :: FSTARPU_OPENCL
+        type(c_ptr), bind(C) :: FSTARPU_MIC
+        type(c_ptr), bind(C) :: FSTARPU_SCC
+
+        type(c_ptr), bind(C) :: FSTARPU_CODELET_SIMGRID_EXECUTE
+        type(c_ptr), bind(C) :: FSTARPU_CUDA_ASYNC
+        type(c_ptr), bind(C) :: FSTARPU_OPENCL_ASYNC
+
         ! (some) portable iso_c_binding types
         type(c_ptr), bind(C) :: FSTARPU_SZ_C_DOUBLE
         type(c_ptr), bind(C) :: FSTARPU_SZ_C_FLOAT
@@ -655,12 +666,24 @@ module fstarpu_mod
                         type(c_funptr), value, intent(in) :: f_ptr
                 end subroutine fstarpu_codelet_add_cuda_func
 
+                subroutine fstarpu_codelet_add_cuda_flags (cl, flags) bind(C)
+                        use iso_c_binding, only: c_ptr
+                        type(c_ptr), value, intent(in) :: cl
+                        type(c_ptr), value, intent(in) :: flags ! C function expects an intptr_t
+                end subroutine fstarpu_codelet_add_cuda_flags
+
                 subroutine fstarpu_codelet_add_opencl_func (cl, f_ptr) bind(C)
                         use iso_c_binding, only: c_ptr, c_funptr
                         type(c_ptr), value, intent(in) :: cl
                         type(c_funptr), value, intent(in) :: f_ptr
                 end subroutine fstarpu_codelet_add_opencl_func
 
+                subroutine fstarpu_codelet_add_opencl_flags (cl, flags) bind(C)
+                        use iso_c_binding, only: c_ptr
+                        type(c_ptr), value, intent(in) :: cl
+                        type(c_ptr), value, intent(in) :: flags ! C function expects an intptr_t
+                end subroutine fstarpu_codelet_add_opencl_flags
+
                 subroutine fstarpu_codelet_add_mic_func (cl, f_ptr) bind(C)
                         use iso_c_binding, only: c_ptr, c_funptr
                         type(c_ptr), value, intent(in) :: cl
@@ -690,6 +713,18 @@ module fstarpu_mod
                         integer(c_int), value, intent(in) :: nbuffers
                 end subroutine fstarpu_codelet_set_nbuffers
 
+                subroutine fstarpu_codelet_set_flags (cl, flags) bind(C)
+                        use iso_c_binding, only: c_ptr
+                        type(c_ptr), value, intent(in) :: cl
+                        type(c_ptr), value, intent(in) :: flags ! C function expects an intptr_t
+                end subroutine fstarpu_codelet_set_flags
+
+                subroutine fstarpu_codelet_set_where (cl, where) bind(C)
+                        use iso_c_binding, only: c_ptr
+                        type(c_ptr), value, intent(in) :: cl
+                        type(c_ptr), value, intent(in) :: where ! C function expects an intptr_t
+                end subroutine fstarpu_codelet_set_where
+
                 ! == starpu_data_interface.h ==
 
                 ! uintptr_t starpu_malloc_on_node_flags(unsigned dst_node, size_t size, int flags);
@@ -2282,6 +2317,26 @@ module fstarpu_mod
                         FSTARPU_SCHED_CTX_USER_DATA    = &
                             fstarpu_get_constant(C_CHAR_"FSTARPU_SCHED_CTX_USER_DATA"//C_NULL_CHAR)
 
+                        FSTARPU_NOWHERE = &
+                            fstarpu_get_constant(C_CHAR_"FSTARPU_NOWHERE"//C_NULL_CHAR)
+                        FSTARPU_CPU = &
+                            fstarpu_get_constant(C_CHAR_"FSTARPU_CPU"//C_NULL_CHAR)
+                        FSTARPU_CUDA = &
+                            fstarpu_get_constant(C_CHAR_"FSTARPU_CUDA"//C_NULL_CHAR)
+                        FSTARPU_OPENCL = &
+                            fstarpu_get_constant(C_CHAR_"FSTARPU_OPENCL"//C_NULL_CHAR)
+                        FSTARPU_MIC = &
+                            fstarpu_get_constant(C_CHAR_"FSTARPU_MIC"//C_NULL_CHAR)
+                        FSTARPU_SCC = &
+                            fstarpu_get_constant(C_CHAR_"FSTARPU_SCC"//C_NULL_CHAR)
+
+                        FSTARPU_CODELET_SIMGRID_EXECUTE = &
+                            fstarpu_get_constant(C_CHAR_"FSTARPU_CODELET_SIMGRID_EXECUTE"//C_NULL_CHAR)
+                        FSTARPU_CUDA_ASYNC = &
+                            fstarpu_get_constant(C_CHAR_"FSTARPU_CUDA_ASYNC"//C_NULL_CHAR)
+                        FSTARPU_OPENCL_ASYNC = &
+                            fstarpu_get_constant(C_CHAR_"FSTARPU_OPENCL_ASYNC"//C_NULL_CHAR)
+
                         ! Initialize size constants as 'c_ptr'
                         FSTARPU_SZ_C_DOUBLE        = sz_to_p(c_sizeof(FSTARPU_SZ_C_DOUBLE_dummy))
                         FSTARPU_SZ_C_FLOAT        = sz_to_p(c_sizeof(FSTARPU_SZ_C_FLOAT_dummy))

+ 3 - 0
include/starpu_config.h.in

@@ -134,6 +134,7 @@ typedef ssize_t starpu_ssize_t;
 #undef STARPU_HAVE_HWLOC
 #undef STARPU_HAVE_PTHREAD_SPIN_LOCK
 #undef STARPU_HAVE_PTHREAD_BARRIER
+#undef STARPU_HAVE_PTHREAD_SETNAME_NP
 #undef STARPU_HAVE_STRUCT_TIMESPEC
 
 /* This is only for building examples */
@@ -142,4 +143,6 @@ typedef ssize_t starpu_ssize_t;
 /* Enable Fortran to C MPI interface */
 #undef  HAVE_MPI_COMM_F2C
 
+#undef STARPU_HAVE_DARWIN
+
 #endif

+ 1 - 1
include/starpu_sched_ctx.h

@@ -161,7 +161,7 @@ void starpu_sched_ctx_move_task_to_ctx(struct starpu_task *task, unsigned sched_
 
 int starpu_sched_ctx_get_worker_rank(unsigned sched_ctx_id);
 
-void (*starpu_sched_ctx_get_sched_policy_init(unsigned sched_ctx_id))(void);
+void (*starpu_sched_ctx_get_sched_policy_init(unsigned sched_ctx_id))(unsigned);
 
 unsigned starpu_sched_ctx_has_starpu_scheduler(unsigned sched_ctx_id, unsigned *awake_workers);
 #ifdef STARPU_USE_SC_HYPERVISOR

+ 10 - 0
include/starpu_thread.h

@@ -47,6 +47,7 @@ typedef msg_process_t starpu_pthread_t;
 typedef int starpu_pthread_attr_t;
 
 int starpu_pthread_create_on(char *name, starpu_pthread_t *thread, const starpu_pthread_attr_t *attr, void *(*start_routine) (void *), void *arg, msg_host_t host);
+#define starpu_pthread_setname(name)
 int starpu_pthread_create(starpu_pthread_t *thread, const starpu_pthread_attr_t *attr, void *(*start_routine) (void *), void *arg);
 int starpu_pthread_join(starpu_pthread_t thread, void **retval);
 int starpu_pthread_exit(void *retval) STARPU_ATTRIBUTE_NORETURN;
@@ -61,6 +62,15 @@ typedef pthread_attr_t starpu_pthread_attr_t;
 
 #define starpu_pthread_create pthread_create
 #define starpu_pthread_create_on(name, thread, attr, routine, arg, where) starpu_pthread_create(thread, attr, routine, arg)
+#ifdef STARPU_HAVE_PTHREAD_SETNAME_NP
+#ifdef STARPU_HAVE_DARWIN
+#define starpu_pthread_setname(name) pthread_setname_np(name)
+#else
+#define starpu_pthread_setname(name) pthread_setname_np(pthread_self(), name)
+#endif
+#else
+#define starpu_pthread_setname(name)
+#endif
 #define starpu_pthread_join pthread_join
 #define starpu_pthread_exit pthread_exit
 #define starpu_pthread_attr_init pthread_attr_init

+ 2 - 0
mpi/src/starpu_mpi.c

@@ -1280,6 +1280,8 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	struct _starpu_mpi_argc_argv *argc_argv = (struct _starpu_mpi_argc_argv *) arg;
 	int rank, worldsize;
 
+	starpu_pthread_setname("MPI");
+
 #ifndef STARPU_SIMGRID
 	_starpu_mpi_do_initialize(argc_argv);
 #endif

+ 5 - 0
src/common/fxt.c

@@ -134,6 +134,9 @@ void _starpu_fxt_init_profiling(unsigned trace_buffer_size)
 {
 	unsigned threadid;
 
+	if (!starpu_get_env_number_default("STARPU_FXT_TRACE", 1))
+		return;
+
 	STARPU_PTHREAD_MUTEX_LOCK(&_starpu_fxt_started_mutex);
 	if (!_starpu_fxt_started)
 	{
@@ -189,6 +192,8 @@ static void _starpu_generate_paje_trace(char *input_fxt_filename, char *output_p
 
 void _starpu_stop_fxt_profiling(void)
 {
+	if (!_starpu_fxt_started)
+		return;
 	if (!_starpu_written)
 	{
 #ifdef STARPU_VERBOSE

+ 4 - 0
src/common/fxt.h

@@ -924,8 +924,12 @@ do {										\
 #define _STARPU_TRACE_HANDLE_DATA_REGISTER(handle)		\
 	FUT_DO_PROBE1(_STARPU_FUT_HANDLE_DATA_REGISTER, handle)
 
+#if 0
 #define _STARPU_TRACE_DATA_INVALIDATE(handle, node)		\
 	FUT_DO_PROBE2(_STARPU_FUT_DATA_INVALIDATE, handle, node)
+#else
+#define _STARPU_TRACE_DATA_INVALIDATE(handle, node)	do {} while (0)
+#endif
 
 #else // !STARPU_USE_FXT
 

+ 209 - 122
src/common/graph.c

@@ -16,140 +16,227 @@
 
 /*
  * This stores the task graph structure, to used by the schedulers which need
- * it.  We do not always enable it since it is costly.
+ * it.  We do not always enable it since it is costly.  To avoid interfering
+ * too much with execution, it may be a bit outdated, i.e. still contain jobs
+ * which have completed very recently.
+ *
+ * This is because we drop nodes lazily: when a job terminates, we just add the
+ * node to the dropped list (to avoid having to take the mutex on the whole
+ * graph).  The graph gets updated whenever the graph mutex becomes available.
  */
 
 #include <starpu.h>
 #include <core/jobs.h>
 #include <common/graph.h>
 
-/* Protects the whole task graph */
+/* Protects the whole task graph except the dropped list */
 static starpu_pthread_rwlock_t graph_lock;
 
 /* Whether we should enable recording the task graph */
 int _starpu_graph_record;
 
-/* This list contains all jobs without incoming dependency */
-struct _starpu_job_list top;
-/* This list contains all jobs without outgoing dependency */
-struct _starpu_job_list bottom;
-/* This list contains all jobs */
-struct _starpu_job_list all;
+/* This list contains all nodes without incoming dependency */
+struct _starpu_graph_node_multilist_top top;
+/* This list contains all nodes without outgoing dependency */
+struct _starpu_graph_node_multilist_bottom bottom;
+/* This list contains all nodes */
+struct _starpu_graph_node_multilist_all all;
+
+/* Protects the dropped list, always taken before graph lock */
+static starpu_pthread_mutex_t dropped_lock;
+/* This list contains all dropped nodes, i.e. the job terminated but the corresponding node is still in the graph */
+struct _starpu_graph_node_multilist_dropped dropped;
 
 void _starpu_graph_init(void)
 {
 	STARPU_PTHREAD_RWLOCK_INIT(&graph_lock, NULL);
-	_starpu_job_list_init(&top);
-	_starpu_job_list_init(&bottom);
-	_starpu_job_list_init(&all);
+	_starpu_graph_node_multilist_init_top(&top);
+	_starpu_graph_node_multilist_init_bottom(&bottom);
+	_starpu_graph_node_multilist_init_all(&all);
+	STARPU_PTHREAD_MUTEX_INIT(&dropped_lock, NULL);
+	_starpu_graph_node_multilist_init_dropped(&dropped);
+}
+
+void _starpu_graph_wrlock(void)
+{
+	STARPU_PTHREAD_RWLOCK_WRLOCK(&graph_lock);
+}
+
+/* Forward declaration: defined further down, but needed to process the dropped list */
+void _starpu_graph_drop_node(struct _starpu_graph_node *node);
+/*
+ * Release the graph write lock, then process the nodes which got queued on the
+ * dropped list in the meantime.  The dropped list is first moved to a local
+ * head under dropped_lock, and the graph write lock is only re-taken if that
+ * local list actually contains nodes.
+ */
+void _starpu_graph_wrunlock(void)
+{
+	struct _starpu_graph_node *node, *next;
+	struct _starpu_graph_node_multilist_dropped dropping;
+
+	/* The graph lock is released before dropped_lock is taken */
+	STARPU_PTHREAD_RWLOCK_UNLOCK(&graph_lock);
+	STARPU_PTHREAD_MUTEX_LOCK(&dropped_lock);
+	/* Pick up the list of dropped nodes */
+	_starpu_graph_node_multilist_move_dropped(&dropped, &dropping);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&dropped_lock);
+
+	/* And now process it if it's not empty.  */
+	if (!_starpu_graph_node_multilist_empty_dropped(&dropping))
+	{
+		STARPU_PTHREAD_RWLOCK_WRLOCK(&graph_lock);
+		for (node = _starpu_graph_node_multilist_begin_dropped(&dropping);
+		     node != _starpu_graph_node_multilist_end_dropped(&dropping);
+		     node = next)
+		{
+			/* Fetch the successor first: _starpu_graph_drop_node frees the node */
+			next = _starpu_graph_node_multilist_next_dropped(node);
+			_starpu_graph_drop_node(node);
+		}
+		STARPU_PTHREAD_RWLOCK_UNLOCK(&graph_lock);
+	}
+}
+
+void _starpu_graph_rdlock(void)
+{
+	STARPU_PTHREAD_RWLOCK_RDLOCK(&graph_lock);
+}
+
+void _starpu_graph_rdunlock(void)
+{
+	STARPU_PTHREAD_RWLOCK_UNLOCK(&graph_lock);
 }
 
-static void __starpu_graph_foreach(void (*func)(void *data, struct _starpu_job *job), void *data)
+static void __starpu_graph_foreach(void (*func)(void *data, struct _starpu_graph_node *node), void *data)
 {
-	struct _starpu_job *job;
+	struct _starpu_graph_node *node;
 
-	for (job = _starpu_job_list_begin(&all, all);
-	     job != _starpu_job_list_end(&all, all);
-	     job = _starpu_job_list_next(&all, job, all))
-		func(data, job);
+	for (node = _starpu_graph_node_multilist_begin_all(&all);
+	     node != _starpu_graph_node_multilist_end_all(&all);
+	     node = _starpu_graph_node_multilist_next_all(node))
+		func(data, node);
 }
 
-/* Add a job to the graph */
+/* Add a node to the graph */
 void _starpu_graph_add_job(struct _starpu_job *job)
 {
-	STARPU_PTHREAD_RWLOCK_WRLOCK(&graph_lock);
+	struct _starpu_graph_node *node = calloc(1, sizeof(*node));
+	node->job = job;
+	job->graph_node = node;
+	STARPU_PTHREAD_MUTEX_INIT(&node->mutex, NULL);
+
+	_starpu_graph_wrlock();
 
 	/* It does not have any dependency yet, add to all lists */
-	_starpu_job_list_push_back(&top, job, top);
-	_starpu_job_list_push_back(&bottom, job, bottom);
-	_starpu_job_list_push_back(&all, job, all);
+	_starpu_graph_node_multilist_push_back_top(&top, node);
+	_starpu_graph_node_multilist_push_back_bottom(&bottom, node);
+	_starpu_graph_node_multilist_push_back_all(&all, node);
 
-	STARPU_PTHREAD_RWLOCK_UNLOCK(&graph_lock);
+	_starpu_graph_wrunlock();
 }
 
-/* Add a job to an array of jobs */
-static unsigned add_job(struct _starpu_job *job, struct _starpu_job ***jobs, unsigned *n_jobs, unsigned *alloc_jobs, unsigned **slot)
+/* Add a node to an array of nodes */
+static unsigned add_node(struct _starpu_graph_node *node, struct _starpu_graph_node ***nodes, unsigned *n_nodes, unsigned *alloc_nodes, unsigned **slot)
 {
 	unsigned ret;
-	if (*n_jobs == *alloc_jobs)
+	if (*n_nodes == *alloc_nodes)
 	{
-		if (*alloc_jobs)
-			*alloc_jobs *= 2;
+		if (*alloc_nodes)
+			*alloc_nodes *= 2;
 		else
-			*alloc_jobs = 4;
-		*jobs = realloc(*jobs, *alloc_jobs * sizeof(**jobs));
+			*alloc_nodes = 4;
+		*nodes = realloc(*nodes, *alloc_nodes * sizeof(**nodes));
 		if (slot)
-			*slot = realloc(*slot, *alloc_jobs * sizeof(**slot));
+			*slot = realloc(*slot, *alloc_nodes * sizeof(**slot));
 	}
-	ret = (*n_jobs)++;
-	(*jobs)[ret] = job;
+	ret = (*n_nodes)++;
+	(*nodes)[ret] = node;
 	return ret;
 }
 
-/* Add a dependency between jobs */
+/* Add a dependency between nodes */
 void _starpu_graph_add_job_dep(struct _starpu_job *job, struct _starpu_job *prev_job)
 {
 	unsigned rank_incoming, rank_outgoing;
-	STARPU_PTHREAD_RWLOCK_WRLOCK(&graph_lock);
+	_starpu_graph_wrlock();
+	struct _starpu_graph_node *node = job->graph_node;
+	struct _starpu_graph_node *prev_node = prev_job->graph_node;
 
-	if (_starpu_job_list_queued(prev_job, bottom))
-		/* Previous job is not at bottom any more */
-		_starpu_job_list_erase(bottom, prev_job, bottom);
+	if (_starpu_graph_node_multilist_queued_bottom(prev_node))
+		/* Previous node is not at bottom any more */
+		_starpu_graph_node_multilist_erase_bottom(&bottom, prev_node);
 
-	if (_starpu_job_list_queued(job, top))
-		/* Next job is not at top any more */
-		_starpu_job_list_erase(top, job, top);
+	if (_starpu_graph_node_multilist_queued_top(node))
+		/* Next node is not at top any more */
+		_starpu_graph_node_multilist_erase_top(&top, node);
 
-	rank_incoming = add_job(prev_job, &job->incoming, &job->n_incoming, &job->alloc_incoming, NULL);
-	rank_outgoing = add_job(job, &prev_job->outgoing, &prev_job->n_outgoing, &prev_job->alloc_outgoing, &prev_job->outgoing_slot);
-	prev_job->outgoing_slot[rank_outgoing] = rank_incoming;
+	rank_incoming = add_node(prev_node, &node->incoming, &node->n_incoming, &node->alloc_incoming, NULL);
+	rank_outgoing = add_node(node, &prev_node->outgoing, &prev_node->n_outgoing, &prev_node->alloc_outgoing, &prev_node->outgoing_slot);
+	prev_node->outgoing_slot[rank_outgoing] = rank_incoming;
 
-	STARPU_PTHREAD_RWLOCK_UNLOCK(&graph_lock);
+	_starpu_graph_wrunlock();
 }
 
-/* Drop a job, and thus its dependencies */
-void _starpu_graph_drop_job(struct _starpu_job *job)
+/* Drop a node, and thus its dependencies */
+void _starpu_graph_drop_node(struct _starpu_graph_node *node)
 {
 	unsigned i;
-	STARPU_PTHREAD_RWLOCK_WRLOCK(&graph_lock);
+	STARPU_ASSERT(!node->job);
 
-	if (_starpu_job_list_queued(job, bottom))
-		_starpu_job_list_erase(bottom, job, bottom);
-	if (_starpu_job_list_queued(job, top))
-		_starpu_job_list_erase(top, job, top);
-	if (_starpu_job_list_queued(job, all))
-		_starpu_job_list_erase(all, job, all);
+	if (_starpu_graph_node_multilist_queued_bottom(node))
+		_starpu_graph_node_multilist_erase_bottom(&bottom, node);
+	if (_starpu_graph_node_multilist_queued_top(node))
+		_starpu_graph_node_multilist_erase_top(&top, node);
+	if (_starpu_graph_node_multilist_queued_all(node))
+		_starpu_graph_node_multilist_erase_all(&all, node);
 
-	/* Drop ourself from the incoming part of the outgoing jobs */
-	for (i = 0; i < job->n_outgoing; i++)
+	/* Drop ourself from the incoming part of the outgoing nodes */
+	for (i = 0; i < node->n_outgoing; i++)
 	{
-		struct _starpu_job *next = job->outgoing[i];
-		next->incoming[job->outgoing_slot[i]] = NULL;
+		struct _starpu_graph_node *next = node->outgoing[i];
+		next->incoming[node->outgoing_slot[i]] = NULL;
 	}
-	job->n_outgoing = 0;
-	free(job->outgoing);
-	job->outgoing = NULL;
-	free(job->outgoing_slot);
-	job->outgoing_slot = NULL;
-	job->alloc_outgoing = 0;
-	job->n_incoming = 0;
-	free(job->incoming);
-	job->incoming = NULL;
-	job->alloc_incoming = 0;
-	STARPU_PTHREAD_RWLOCK_UNLOCK(&graph_lock);
+
+	node->n_outgoing = 0;
+	free(node->outgoing);
+	node->outgoing = NULL;
+	free(node->outgoing_slot);
+	node->outgoing_slot = NULL;
+	node->alloc_outgoing = 0;
+	node->n_incoming = 0;
+	free(node->incoming);
+	node->incoming = NULL;
+	node->alloc_incoming = 0;
+	free(node);
 }
 
-static void _starpu_graph_set_n(void *data, struct _starpu_job *job)
+/*
+ * Drop a job: detach it from its graph node and get the node dropped.
+ * The node is dropped immediately if the graph write lock can be taken without
+ * blocking; otherwise it is queued on the dropped list, which
+ * _starpu_graph_wrunlock() processes.
+ * job->graph_node is dereferenced unconditionally, so it must not be NULL.
+ */
+void _starpu_graph_drop_job(struct _starpu_job *job)
+{
+	struct _starpu_graph_node *node = job->graph_node;
+	job->graph_node = NULL;
+
+	STARPU_PTHREAD_MUTEX_LOCK(&node->mutex);
+	/* Will not be able to use the job any more */
+	node->job = NULL;
+	STARPU_PTHREAD_MUTEX_UNLOCK(&node->mutex);
+
+	/* dropped_lock is taken first, then the graph lock is only tried, never waited for */
+	STARPU_PTHREAD_MUTEX_LOCK(&dropped_lock);
+	if (STARPU_PTHREAD_RWLOCK_TRYWRLOCK(&graph_lock) == 0)
+	{
+		/* Graph wrlock is available, drop node immediately */
+		_starpu_graph_drop_node(node);
+		STARPU_PTHREAD_RWLOCK_UNLOCK(&graph_lock);
+	}
+	else
+		/* Queue for removal when lock becomes available */
+		_starpu_graph_node_multilist_push_back_dropped(&dropped, node);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&dropped_lock);
+}
+
+static void _starpu_graph_set_n(void *data, struct _starpu_graph_node *node)
 {
 	int value = (intptr_t) data;
-	job->graph_n = value;
+	node->graph_n = value;
 }
 
 /* Call func for each vertex of the task graph, from bottom to top, in topological order */
-static void _starpu_graph_compute_bottom_up(void (*func)(struct _starpu_job *next_job, struct _starpu_job *prev_job, void *data), void *data)
+static void _starpu_graph_compute_bottom_up(void (*func)(struct _starpu_graph_node *next_node, struct _starpu_graph_node *prev_node, void *data), void *data)
 {
-	struct _starpu_job *job, *job2;
-	struct _starpu_job **current_set = NULL, **next_set = NULL, **swap_set;
+	struct _starpu_graph_node *node, *node2;
+	struct _starpu_graph_node **current_set = NULL, **next_set = NULL, **swap_set;
 	unsigned current_n, next_n, i, j;
 	unsigned current_alloc = 0, next_alloc = 0, swap_alloc;
 
@@ -160,10 +247,10 @@ static void _starpu_graph_compute_bottom_up(void (*func)(struct _starpu_job *nex
 
 	/* Start with the bottom of the graph */
 	current_n = 0;
-	for (job = _starpu_job_list_begin(&bottom, bottom);
-	     job != _starpu_job_list_end(&bottom, bottom);
-	     job = _starpu_job_list_next(&bottom, job, bottom))
-		add_job(job, &current_set, &current_n, &current_alloc, NULL);
+	for (node = _starpu_graph_node_multilist_begin_bottom(&bottom);
+	     node != _starpu_graph_node_multilist_end_bottom(&bottom);
+	     node = _starpu_graph_node_multilist_next_bottom(node))
+		add_node(node, &current_set, &current_n, &current_alloc, NULL);
 
 	/* Now propagate to top as long as we have current nodes */
 	while (current_n)
@@ -174,19 +261,19 @@ static void _starpu_graph_compute_bottom_up(void (*func)(struct _starpu_job *nex
 		/* For each node in the current set */
 		for (i = 0; i < current_n; i++)
 		{
-			job = current_set[i];
-			/* For each parent of this job */
-			for (j = 0; j < job->n_incoming; j++)
+			node = current_set[i];
+			/* For each parent of this node */
+			for (j = 0; j < node->n_incoming; j++)
 			{
-				job2 = job->incoming[j];
-				if (!job2)
+				node2 = node->incoming[j];
+				if (!node2)
 					continue;
-				job2->graph_n++;
-				func(job, job2, data);
+				node2->graph_n++;
+				func(node, node2, data);
 
-				if ((unsigned) job2->graph_n == job2->n_outgoing)
+				if ((unsigned) node2->graph_n == node2->n_outgoing)
 					/* All outgoing edges were processed, can now add to next set */
-					add_job(job2, &next_set, &next_n, &next_alloc, NULL);
+					add_node(node2, &next_set, &next_n, &next_alloc, NULL);
 			}
 		}
 
@@ -203,38 +290,38 @@ static void _starpu_graph_compute_bottom_up(void (*func)(struct _starpu_job *nex
 	free(next_set);
 }
 
-static void compute_depth(struct _starpu_job *next_job, struct _starpu_job *prev_job, void *data STARPU_ATTRIBUTE_UNUSED)
+static void compute_depth(struct _starpu_graph_node *next_node, struct _starpu_graph_node *prev_node, void *data STARPU_ATTRIBUTE_UNUSED)
 {
-	if (prev_job->depth < next_job->depth + 1)
-		prev_job->depth = next_job->depth + 1;
+	if (prev_node->depth < next_node->depth + 1)
+		prev_node->depth = next_node->depth + 1;
 }
 
 void _starpu_graph_compute_depths(void)
 {
-	struct _starpu_job *job;
+	struct _starpu_graph_node *node;
 
-	STARPU_PTHREAD_RWLOCK_WRLOCK(&graph_lock);
+	_starpu_graph_wrlock();
 
 	/* The bottom of the graph has depth 0 */
-	for (job = _starpu_job_list_begin(&bottom, bottom);
-	     job != _starpu_job_list_end(&bottom, bottom);
-	     job = _starpu_job_list_next(&bottom, job, bottom))
-		job->depth = 0;
+	for (node = _starpu_graph_node_multilist_begin_bottom(&bottom);
+	     node != _starpu_graph_node_multilist_end_bottom(&bottom);
+	     node = _starpu_graph_node_multilist_next_bottom(node))
+		node->depth = 0;
 
 	_starpu_graph_compute_bottom_up(compute_depth, NULL);
 
-	STARPU_PTHREAD_RWLOCK_UNLOCK(&graph_lock);
+	_starpu_graph_wrunlock();
 }
 
 void _starpu_graph_compute_descendants(void)
 {
-	struct _starpu_job *job, *job2, *job3;
-	struct _starpu_job **current_set = NULL, **next_set = NULL, **swap_set;
+	struct _starpu_graph_node *node, *node2, *node3;
+	struct _starpu_graph_node **current_set = NULL, **next_set = NULL, **swap_set;
 	unsigned current_n, next_n, i, j;
 	unsigned current_alloc = 0, next_alloc = 0, swap_alloc;
 	unsigned descendants;
 
-	STARPU_PTHREAD_RWLOCK_WRLOCK(&graph_lock);
+	_starpu_graph_wrlock();
 
 	/* Yes, this is O(|V|.(|V|+|E|)) :( */
 
@@ -243,20 +330,20 @@ void _starpu_graph_compute_descendants(void)
 	 * |E| is usually O(|V|), though (bounded number of data dependencies,
 	 * and we use synchronization tasks) */
 
-	for (job = _starpu_job_list_begin(&all, all);
-	     job != _starpu_job_list_end(&all, all);
-	     job = _starpu_job_list_next(&all, job, all))
+	for (node = _starpu_graph_node_multilist_begin_all(&all);
+	     node != _starpu_graph_node_multilist_end_all(&all);
+	     node = _starpu_graph_node_multilist_next_all(node))
 	{
 		/* Mark all nodes as unseen */
-		for (job2 = _starpu_job_list_begin(&all, all);
-		     job2 != _starpu_job_list_end(&all, all);
-		     job2 = _starpu_job_list_next(&all, job2, all))
-			job2->graph_n = 0;
+		for (node2 = _starpu_graph_node_multilist_begin_all(&all);
+		     node2 != _starpu_graph_node_multilist_end_all(&all);
+		     node2 = _starpu_graph_node_multilist_next_all(node2))
+			node2->graph_n = 0;
 
 		/* Start with the node we want to compute the number of descendants of */
 		current_n = 0;
-		add_job(job, &current_set, &current_n, &current_alloc, NULL);
-		job->graph_n = 1;
+		add_node(node, &current_set, &current_n, &current_alloc, NULL);
+		node->graph_n = 1;
 
 		descendants = 0;
 		/* While we have descendants, count their descendants */
@@ -267,20 +354,20 @@ void _starpu_graph_compute_descendants(void)
 			/* For each node in the current set */
 			for (i = 0; i < current_n; i++)
 			{
-				job2 = current_set[i];
-				/* For each child of this job2 */
-				for (j = 0; j < job2->n_outgoing; j++)
+				node2 = current_set[i];
+				/* For each child of this node2 */
+				for (j = 0; j < node2->n_outgoing; j++)
 				{
-					job3 = job2->outgoing[j];
-					if (!job3)
+					node3 = node2->outgoing[j];
+					if (!node3)
 						continue;
-					if (job3->graph_n)
+					if (node3->graph_n)
 						/* Already seen */
 						continue;
 					/* Add this node */
-					job3->graph_n = 1;
+					node3->graph_n = 1;
 					descendants++;
-					add_job(job3, &next_set, &next_n, &next_alloc, NULL);
+					add_node(node3, &next_set, &next_n, &next_alloc, NULL);
 				}
 			}
 			/* Swap next set with current set */
@@ -292,18 +379,18 @@ void _starpu_graph_compute_descendants(void)
 			current_alloc = swap_alloc;
 			current_n = next_n;
 		}
-		job->descendants = descendants;
+		node->descendants = descendants;
 	}
 
-	STARPU_PTHREAD_RWLOCK_UNLOCK(&graph_lock);
+	_starpu_graph_wrunlock();
 
 	free(current_set);
 	free(next_set);
 }
 
-void _starpu_graph_foreach(void (*func)(void *data, struct _starpu_job *job), void *data)
+void _starpu_graph_foreach(void (*func)(void *data, struct _starpu_graph_node *node), void *data)
 {
-	STARPU_PTHREAD_RWLOCK_WRLOCK(&graph_lock);
+	_starpu_graph_wrlock();
 	__starpu_graph_foreach(func, data);
-	STARPU_PTHREAD_RWLOCK_UNLOCK(&graph_lock);
+	_starpu_graph_wrunlock();
 }

+ 56 - 2
src/common/graph.h

@@ -14,8 +14,60 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-void _starpu_graph_init(void);
+#ifndef __GRAPH_H__
+#define __GRAPH_H__
+
+#include <common/list.h>
+MULTILIST_CREATE_TYPE(_starpu_graph_node, all)
+MULTILIST_CREATE_TYPE(_starpu_graph_node, top)
+MULTILIST_CREATE_TYPE(_starpu_graph_node, bottom)
+MULTILIST_CREATE_TYPE(_starpu_graph_node, dropped)
+
+struct _starpu_graph_node {
+	starpu_pthread_mutex_t mutex;	/* protects access to the job */
+	struct _starpu_job *job;	/* pointer to the job, if it is still alive, NULL otherwise */
+
+	/*
+	 * Fields for graph analysis for scheduling heuristics
+	 */
+	/* Member of list of all jobs without incoming dependency */
+	struct _starpu_graph_node_multilist_top top;
+	/* Member of list of all jobs without outgoing dependency */
+	struct _starpu_graph_node_multilist_bottom bottom;
+	/* Member of list of all jobs */
+	struct _starpu_graph_node_multilist_all all;
+	/* Member of list of dropped jobs */
+	struct _starpu_graph_node_multilist_dropped dropped;
+
+	/* set of incoming dependencies */
+	struct _starpu_graph_node **incoming;	/* May contain NULLs for terminated jobs */
+	unsigned n_incoming;		/* Number of slots used */
+	unsigned alloc_incoming;	/* Size of incoming */
+	/* set of outgoing dependencies */
+	struct _starpu_graph_node **outgoing;
+	unsigned *outgoing_slot;	/* Index within corresponding incoming array */
+	unsigned n_outgoing;		/* Number of slots used */
+	unsigned alloc_outgoing;	/* Size of outgoing */
+
+	unsigned depth;			/* Rank from bottom, in number of jobs */
+					/* Only available if _starpu_graph_compute_depths was called */
+	unsigned descendants;		/* Number of children, grand-children, etc. */
+					/* Only available if _starpu_graph_compute_descendants was called */
+
+	int graph_n;			/* Variable available for graph flow */
+};
+
+MULTILIST_CREATE_INLINES(struct _starpu_graph_node, _starpu_graph_node, all)
+MULTILIST_CREATE_INLINES(struct _starpu_graph_node, _starpu_graph_node, top)
+MULTILIST_CREATE_INLINES(struct _starpu_graph_node, _starpu_graph_node, bottom)
+MULTILIST_CREATE_INLINES(struct _starpu_graph_node, _starpu_graph_node, dropped)
+
 extern int _starpu_graph_record;
+void _starpu_graph_init(void);
+void _starpu_graph_wrlock(void);
+void _starpu_graph_rdlock(void);
+void _starpu_graph_wrunlock(void);
+void _starpu_graph_rdunlock(void);
 
 /* Add a job to the graph, called before any _starpu_graph_add_job_dep call */
 void _starpu_graph_add_job(struct _starpu_job *job);
@@ -35,4 +87,6 @@ void _starpu_graph_compute_descendants(void);
 
 /* This calls \e func for each node of the task graph, passing also \e data as it */
 /* Apply func on each job of the graph */
-void _starpu_graph_foreach(void (*func)(void *data, struct _starpu_job *job), void *data);
+void _starpu_graph_foreach(void (*func)(void *data, struct _starpu_graph_node *node), void *data);
+
+#endif /* __GRAPH_H__ */

+ 109 - 0
src/common/list.h

@@ -15,6 +15,9 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#ifndef __LIST_H__
+#define __LIST_H__
+
 /** @file
  * @brief Listes doublement chainées automatiques
  */
@@ -193,3 +196,109 @@
     { if ((i->_next == NULL) && i != l->_tail) return 0; \
       if (i->_next == i) return 0; \
       i=i->_next;} return 1; }
+
+
+#ifdef STARPU_DEBUG
+#define STARPU_ASSERT_MULTILIST(expr) STARPU_ASSERT(expr)
+#else
+#define STARPU_ASSERT_MULTILIST(expr) ((void) 0)
+#endif
+
+/*
+ * This is an implementation of list allowing to be member of several lists.
+ * - One should first call MULTILIST_CREATE_TYPE for the ENAME and for each
+ *   MEMBER type
+ * - Then the main element type should include fields of type
+ *   ENAME_multilist_MEMBER
+ * - Then one should call MULTILIST_CREATE_INLINES to create the inlines which
+ *   manipulate lists for this MEMBER type.
+ */
+
+/* Create the ENAME_multilist_MEMBER, to be used both as head and as member of main element type */
+#define MULTILIST_CREATE_TYPE(ENAME, MEMBER) \
+struct ENAME##_multilist_##MEMBER { \
+	struct ENAME##_multilist_##MEMBER *next; \
+	struct ENAME##_multilist_##MEMBER *prev; \
+};
+
+/* Create the inlines */
+#define MULTILIST_CREATE_INLINES(TYPE, ENAME, MEMBER) \
+/* Cast from list element to real type.  */ \
+static inline TYPE *ENAME##_of_multilist_##MEMBER(struct ENAME##_multilist_##MEMBER *elt) { \
+	return ((TYPE *) ((uintptr_t) (elt) - ((uintptr_t) (&((TYPE *) 0)->MEMBER)))); \
+} \
+\
+/* Initialize a list head.  */ \
+static inline void ENAME##_multilist_init_##MEMBER(struct ENAME##_multilist_##MEMBER *head) { \
+	head->next = head; \
+	head->prev = head; \
+} \
+\
+/* Push element to head of a list.  */ \
+static inline void ENAME##_multilist_push_front_##MEMBER(struct ENAME##_multilist_##MEMBER *head, TYPE *e) { \
+	STARPU_ASSERT_MULTILIST(e->MEMBER.prev == NULL); \
+	STARPU_ASSERT_MULTILIST(e->MEMBER.next == NULL); \
+	e->MEMBER.next = head->next; \
+	e->MEMBER.prev = head; \
+	head->next->prev = &e->MEMBER; \
+	head->next = &e->MEMBER; \
+} \
+\
+/* Push element to tail of a list.  */ \
+static inline void ENAME##_multilist_push_back_##MEMBER(struct ENAME##_multilist_##MEMBER *head, TYPE *e) { \
+	STARPU_ASSERT_MULTILIST(e->MEMBER.prev == NULL); \
+	STARPU_ASSERT_MULTILIST(e->MEMBER.next == NULL); \
+	e->MEMBER.prev = head->prev; \
+	e->MEMBER.next = head; \
+	head->prev->next = &e->MEMBER; \
+	head->prev = &e->MEMBER; \
+} \
+\
+/* Erase element from a list.  */ \
+static inline void ENAME##_multilist_erase_##MEMBER(struct ENAME##_multilist_##MEMBER *head STARPU_ATTRIBUTE_UNUSED, TYPE *e) { \
+	STARPU_ASSERT_MULTILIST(e->MEMBER.next->prev == &e->MEMBER); \
+	e->MEMBER.next->prev = e->MEMBER.prev; \
+	STARPU_ASSERT_MULTILIST(e->MEMBER.prev->next == &e->MEMBER); \
+	e->MEMBER.prev->next = e->MEMBER.next; \
+	e->MEMBER.next = NULL; \
+	e->MEMBER.prev = NULL; \
+} \
+\
+/* Test whether the element was queued on the list.  */ \
+static inline int ENAME##_multilist_queued_##MEMBER(TYPE *e) { \
+	return ((e)->MEMBER.next != NULL); \
+} \
+\
+/* Test whether the list is empty, i.e. the head only links back to itself (returns non-zero when empty).  */ \
+static inline int ENAME##_multilist_empty_##MEMBER(struct ENAME##_multilist_##MEMBER *head) { \
+	return head->next == head; \
+} \
+\
+/* Return the first element of the list.  */ \
+static inline TYPE *ENAME##_multilist_begin_##MEMBER(struct ENAME##_multilist_##MEMBER *head) { \
+	return ENAME##_of_multilist_##MEMBER(head->next); \
+} \
+/* Return the value to be tested at the end of the list.  */ \
+static inline TYPE *ENAME##_multilist_end_##MEMBER(struct ENAME##_multilist_##MEMBER *head) { \
+	return ENAME##_of_multilist_##MEMBER(head); \
+} \
+/* Return the next element of the list.  */ \
+static inline TYPE *ENAME##_multilist_next_##MEMBER(TYPE *e) { \
+	return ENAME##_of_multilist_##MEMBER(e->MEMBER.next); \
+} \
+\
+ /* Move a list from its head to another head.  */ \
+static inline void ENAME##_multilist_move_##MEMBER(struct ENAME##_multilist_##MEMBER *head, struct ENAME##_multilist_##MEMBER *newhead) { \
+	if (ENAME##_multilist_empty_##MEMBER(head)) \
+		ENAME##_multilist_init_##MEMBER(newhead); \
+	else { \
+		newhead->next = head->next; \
+		newhead->next->prev = newhead; \
+		newhead->prev = head->prev; \
+		newhead->prev->next = newhead; \
+		head->next = head; \
+		head->prev = head; \
+	} \
+}
+
+#endif /* __LIST_H__ */

+ 29 - 29
src/core/jobs.c

@@ -38,7 +38,7 @@ static int njobs, maxnjobs;
 
 #ifdef STARPU_DEBUG
 /* List of all jobs, for debugging */
-static struct _starpu_job_list all_jobs_list;
+static struct _starpu_job_multilist_all_submitted all_jobs_list;
 static starpu_pthread_mutex_t all_jobs_list_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
 #endif
 
@@ -46,7 +46,7 @@ void _starpu_job_init(void)
 {
 	max_memory_use = starpu_get_env_number_default("STARPU_MAX_MEMORY_USE", 0);
 #ifdef STARPU_DEBUG
-	_starpu_job_list_init(&all_jobs_list);
+	_starpu_job_multilist_init_all_submitted(&all_jobs_list);
 #endif
 }
 
@@ -148,7 +148,7 @@ void _starpu_job_destroy(struct _starpu_job *j)
 		j->dyn_dep_slots = NULL;
 	}
 
-	if (_starpu_graph_record)
+	if (_starpu_graph_record && j->graph_node)
 		_starpu_graph_drop_job(j);
 
 	if (max_memory_use)
@@ -255,7 +255,7 @@ void _starpu_handle_job_submission(struct _starpu_job *j)
 
 #ifdef STARPU_DEBUG
 	STARPU_PTHREAD_MUTEX_LOCK(&all_jobs_list_mutex);
-	_starpu_job_list_push_back(&all_jobs_list, j, all_submitted);
+	_starpu_job_multilist_push_back_all_submitted(&all_jobs_list, j);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&all_jobs_list_mutex);
 #endif
 }
@@ -275,7 +275,7 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 
 #ifdef STARPU_DEBUG
 	STARPU_PTHREAD_MUTEX_LOCK(&all_jobs_list_mutex);
-	_starpu_job_list_erase(&all_jobs_list, j, all_submitted);
+	_starpu_job_multilist_erase_all_submitted(&all_jobs_list, j);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&all_jobs_list_mutex);
 #endif
 
@@ -336,6 +336,30 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 		_starpu_release_task_enforce_sequential_consistency(j);
 	}
 
+	/* If the job was executed on a combined worker there is no need for the
+	 * scheduler to process it : the task structure doesn't contain any valuable
+	 * data as it's not linked to an actual worker */
+	/* control task should not execute post_exec_hook */
+	if(j->task_size == 1 && task->cl != NULL && task->cl->where != STARPU_NOWHERE && !j->internal
+#ifdef STARPU_OPENMP
+	/* If this is a continuation, we do not execute the post_exec_hook. The
+	 * post_exec_hook will be run only when the continued task fully
+	 * completes.
+	 *
+	 * Note: If needed, a specific hook could be added to handle stopped
+	 * tasks */
+	&& !continuation
+#endif
+			)
+	{
+		_starpu_sched_post_exec_hook(task);
+#ifdef STARPU_USE_SC_HYPERVISOR
+		int workerid = starpu_worker_get_id();
+		_starpu_sched_ctx_post_exec_task_cb(workerid, task, data_size, j->footprint);
+#endif //STARPU_USE_SC_HYPERVISOR
+
+	}
+
 	/* Remove ourself from the graph before notifying dependencies */
 	if (_starpu_graph_record)
 		_starpu_graph_drop_job(j);
@@ -404,30 +428,6 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 		}
 	}
 
-	/* If the job was executed on a combined worker there is no need for the
-	 * scheduler to process it : the task structure doesn't contain any valuable
-	 * data as it's not linked to an actual worker */
-	/* control task should not execute post_exec_hook */
-	if(j->task_size == 1 && task->cl != NULL && task->cl->where != STARPU_NOWHERE && !j->internal
-#ifdef STARPU_OPENMP
-	/* If this is a continuation, we do not execute the post_exec_hook. The
-	 * post_exec_hook will be run only when the continued task fully
-	 * completes.
-	 *
-	 * Note: If needed, a specific hook could be added to handle stopped
-	 * tasks */
-	&& !continuation
-#endif
-			)
-	{
-		_starpu_sched_post_exec_hook(task);
-#ifdef STARPU_USE_SC_HYPERVISOR
-		int workerid = starpu_worker_get_id();
-		_starpu_sched_ctx_post_exec_task_cb(workerid, task, data_size, j->footprint);
-#endif //STARPU_USE_SC_HYPERVISOR
-
-	}
-
 	/* Note: For now, we keep the TASK_DONE trace event for continuation,
 	 * however we could add a specific event for stopped tasks if needed.
 	 */

+ 8 - 90
src/core/jobs.h

@@ -40,6 +40,7 @@
 #include <core/errorcheck.h>
 #include <common/barrier.h>
 #include <common/utils.h>
+#include <common/list.h>
 
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
@@ -63,71 +64,9 @@ struct _starpu_data_descr
 	int node;
 };
 
-struct _starpu_job_list {
-	struct _starpu_job_list *next;
-	struct _starpu_job_list *prev;
-};
-
 #ifdef STARPU_DEBUG
-#define STARPU_ASSERT_JOB_LIST(expr) STARPU_ASSERT(expr)
-#else
-#define STARPU_ASSERT_JOB_LIST(expr) ((void) 0)
+MULTILIST_CREATE_TYPE(_starpu_job, all_submitted)
 #endif
-
-#define _starpu_job_of(elt, member) \
-	((struct _starpu_job *) ((uintptr_t) (elt) - ((uintptr_t) (&((struct _starpu_job *) 0)->member))))
-
-#define _starpu_job_list_init(head) do { \
-	struct _starpu_job_list *_head = (head); \
-	_head->next = _head; \
-	_head->prev = _head; \
-} while (0)
-
-#define _starpu_job_list_push_front(head, j, member) do { \
-	struct _starpu_job *_j = (j); \
-	struct _starpu_job_list *_head = (head); \
-	STARPU_ASSERT_JOB_LIST(_j->member.prev == NULL); \
-	STARPU_ASSERT_JOB_LIST(_j->member.next == NULL); \
-	_j->member.next = _head->next; \
-	_j->member.prev = _head; \
-	_head->next->prev = &_j->member; \
-	_head->next = &_j->member; \
-} while (0)
-
-#define _starpu_job_list_push_back(head, j, member) do { \
-	struct _starpu_job *_j = (j); \
-	struct _starpu_job_list *_head = (head); \
-	STARPU_ASSERT_JOB_LIST(_j->member.prev == NULL); \
-	STARPU_ASSERT_JOB_LIST(_j->member.next == NULL); \
-	_j->member.prev = _head->prev; \
-	_j->member.next = _head; \
-	_head->prev->next = &_j->member; \
-	_head->prev = &_j->member; \
-} while (0)
-
-#define _starpu_job_list_erase(head, j, member) do { \
-	struct _starpu_job *_j = (j); \
-	STARPU_ASSERT_JOB_LIST(_j->member.next->prev == &_j->member); \
-	_j->member.next->prev = _j->member.prev; \
-	STARPU_ASSERT_JOB_LIST(_j->member.prev->next == &_j->member); \
-	_j->member.prev->next = _j->member.next; \
-	_j->member.next = NULL; \
-	_j->member.prev = NULL; \
-} while (0)
-
-#define _starpu_job_list_queued(j, member) \
-	((j)->member.next != NULL)
-
-#define _starpu_job_list_empty(head) \
-	((head)->next != head)
-
-#define _starpu_job_list_begin(head, member) \
-	_starpu_job_of((head)->next, member)
-#define _starpu_job_list_next(head, j, member) \
-	_starpu_job_of((j)->member.next, member)
-#define _starpu_job_list_end(head, member) \
-	_starpu_job_of(head, member)
-
 /* A job is the internal representation of a task. */
 struct _starpu_job {
 
@@ -252,39 +191,18 @@ struct _starpu_job {
 	starpu_pthread_barrier_t after_work_barrier;
 	unsigned after_work_busy_barrier;
 
-	/*
-	 * Fields for graph analysis for scheduling heuristics
-	 */
-	/* Member of list of all jobs without incoming dependency */
-	struct _starpu_job_list top;
-	/* Member of list of all jobs without outgoing dependency */
-	struct _starpu_job_list bottom;
-	/* Member of list of all jobs */
-	struct _starpu_job_list all;
-
-	/* set of incoming dependencies */
-	struct _starpu_job **incoming;	/* May contain NULLs for terminated jobs */
-	unsigned n_incoming;		/* Number of slots used */
-	unsigned alloc_incoming;	/* Size of incoming */
-	/* set of outgoing dependencies */
-	struct _starpu_job **outgoing;
-	unsigned *outgoing_slot;	/* Index within corresponding incoming array */
-	unsigned n_outgoing;		/* Number of slots used */
-	unsigned alloc_outgoing;	/* Size of outgoing */
-
-	unsigned depth;			/* Rank from bottom, in number of jobs */
-					/* Only available if _starpu_graph_compute_depths was called */
-	unsigned descendants;		/* Number of children, grand-children, etc. */
-					/* Only available if _starpu_graph_compute_descendants was called */
-
-	int graph_n;			/* Variable available for graph flow */
+	struct _starpu_graph_node *graph_node;
 
 #ifdef STARPU_DEBUG
 	/* Linked-list of all jobs, for debugging */
-	struct _starpu_job_list all_submitted;
+	struct _starpu_job_multilist_all_submitted all_submitted;
 #endif
 };
 
+#ifdef STARPU_DEBUG
+MULTILIST_CREATE_INLINES(struct _starpu_job, _starpu_job, all_submitted)
+#endif
+
 void _starpu_job_init(void);
 void _starpu_job_fini(void);
 

+ 2 - 2
src/core/perfmodel/perfmodel_history.c

@@ -1305,7 +1305,7 @@ docal:
 			char archname[32];
 
 			starpu_perfmodel_get_arch_name(arch, archname, sizeof(archname), nimpl);
-			_STARPU_DISP("Warning: model %s is not calibrated enough for %s (only %u measurements), forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol, archname, entry && entry->history_entry ? entry->history_entry->nsample : 0);
+			_STARPU_DISP("Warning: model %s is not calibrated enough for %s size %lu (only %u measurements), forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol, archname, (unsigned long) size, entry && entry->history_entry ? entry->history_entry->nsample : 0);
 			_starpu_set_calibrate_flag(1);
 			model->benchmarking = 1;
 		}
@@ -1408,7 +1408,7 @@ docal:
 		char archname[32];
 
 		starpu_perfmodel_get_arch_name(arch, archname, sizeof(archname), nimpl);
-		_STARPU_DISP("Warning: model %s is not calibrated enough for %s (only %u measurements), forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol, archname, entry ? entry->nsample : 0);
+		_STARPU_DISP("Warning: model %s is not calibrated enough for %s size %ld (only %u measurements), forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol, archname, j->task?_starpu_job_get_data_size(model, arch, nimpl, j):-1, entry ? entry->nsample : 0);
 		_starpu_set_calibrate_flag(1);
 		model->benchmarking = 1;
 	}

+ 1 - 1
src/core/sched_ctx.c

@@ -2577,7 +2577,7 @@ int starpu_sched_ctx_get_worker_rank(unsigned sched_ctx_id)
 	return -1;
 }
 
-void (*starpu_sched_ctx_get_sched_policy_init(unsigned sched_ctx_id))(void)
+void (*starpu_sched_ctx_get_sched_policy_init(unsigned sched_ctx_id))(unsigned)
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	return sched_ctx->init_sched;

+ 1 - 1
src/core/sched_ctx.h

@@ -173,7 +173,7 @@ struct _starpu_sched_ctx
 	unsigned awake_workers;
 
 	/* function called when initializing the scheduler */
-	void (*init_sched)();
+	void (*init_sched)(unsigned);
 };
 
 struct _starpu_machine_config;

+ 1 - 0
src/core/task.c

@@ -1191,6 +1191,7 @@ static void *watchdog_func(void *arg)
 	timeout = ((float) atoll(timeout_env)) / 1000000;
 #endif
 	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+	starpu_pthread_setname("watchdog");
 
 	STARPU_PTHREAD_MUTEX_LOCK(&config->submitted_mutex);
 	while (_starpu_machine_is_running())

+ 3 - 7
src/core/topology.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009-2016  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016 CNRS
- * Copyright (C) 2011  INRIA
+ * Copyright (C) 2011, 2016  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -854,15 +854,11 @@ _starpu_init_mp_config (struct _starpu_machine_config *config,
 	if (reqmicdevices == -1 && user_conf)
 		reqmicdevices = user_conf->nmic;
 	if (reqmicdevices == -1)
-		reqmicdevices = nhwmicdevices;
-
-	if (reqmicdevices == -1)
-	{
 		/* Nothing was specified, so let's use the number of
 		 * detected mic devices. ! */
 		reqmicdevices = nhwmicdevices;
-	}
-	else
+
+	if (reqmicdevices != -1)
 	{
 		if ((unsigned) reqmicdevices > nhwmicdevices)
 		{

+ 1 - 0
src/drivers/cpu/driver_cpu.c

@@ -217,6 +217,7 @@ int _starpu_cpu_driver_init(struct _starpu_worker *cpu_worker)
 
 	snprintf(cpu_worker->name, sizeof(cpu_worker->name), "CPU %d", devid);
 	snprintf(cpu_worker->short_name, sizeof(cpu_worker->short_name), "CPU %d", devid);
+	starpu_pthread_setname(cpu_worker->short_name);
 
 	_STARPU_TRACE_WORKER_INIT_END(cpu_worker->workerid);
 

+ 5 - 0
src/drivers/cuda/driver_cuda.c

@@ -658,6 +658,11 @@ int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 
 		_STARPU_TRACE_WORKER_INIT_END(workerid);
 	}
+	{
+		char thread_name[16];
+		snprintf(thread_name, sizeof(thread_name), "CUDA %u", worker0->devid);
+		starpu_pthread_setname(thread_name);
+	}
 
 	/* tell the main thread that this one is ready */
 	STARPU_PTHREAD_MUTEX_LOCK(&worker0->mutex);

+ 2 - 3
src/drivers/mic/driver_mic_common.h

@@ -18,13 +18,12 @@
 #ifndef __DRIVER_MIC_COMMON_H__
 #define __DRIVER_MIC_COMMON_H__
 
-
-#include <source/COIProcess_source.h>
 #include <common/config.h>
 
-
 #ifdef STARPU_USE_MIC
 
+#include <source/COIProcess_source.h>
+
 #define STARPU_TO_MIC_ID(id) ((id) + 1)
 
 /* TODO: rather allocate ports on the host and pass them as parameters to the device process */

+ 6 - 0
src/drivers/mic/driver_mic_source.c

@@ -535,6 +535,12 @@ void *_starpu_mic_src_worker(void *arg)
 	{
 		struct _starpu_worker *worker = &config->workers[baseworkerid+i];
 		snprintf(worker->name, sizeof(worker->name), "MIC %d core %u", devid, i);
+		snprintf(worker->short_name, sizeof(worker->short_name), "MIC %d.%u", devid, i);
+	}
+	{
+		char thread_name[16];
+		snprintf(thread_name, sizeof(thread_name), "MIC %d", devid);
+		starpu_pthread_setname(thread_name);
 	}
 
 	for (i = 0; i < worker_set->nworkers; i++)

+ 1 - 0
src/drivers/opencl/driver_opencl.c

@@ -640,6 +640,7 @@ int _starpu_opencl_driver_init(struct _starpu_worker *worker)
 #endif
 	snprintf(worker->name, sizeof(worker->name), "OpenCL %u (%s %.1f GiB)", devid, devname, size);
 	snprintf(worker->short_name, sizeof(worker->short_name), "OpenCL %u", devid);
+	starpu_pthread_setname(worker->short_name);
 
 	worker->pipeline_length = starpu_get_env_number_default("STARPU_OPENCL_PIPELINE", 2);
 	if (worker->pipeline_length > STARPU_MAX_PIPELINE)

+ 2 - 0
src/drivers/opencl/driver_opencl_utils.c

@@ -169,7 +169,9 @@ char *_starpu_opencl_load_program_source(const char *filename)
 	}
 	source[x] = '\0';
 
+#ifdef STARPU_EXTRA_VERBOSE
 	_STARPU_DEBUG("OpenCL kernel <%s>\n", source);
+#endif
 
 	fclose(fh);
 

+ 7 - 1
src/drivers/scc/driver_scc_source.c

@@ -298,7 +298,13 @@ void *_starpu_scc_src_worker(void *arg)
 	for (i = 0; i < config->topology.nmiccores[devid]; i++)
 	{
 		struct _starpu_worker *worker = &config->workers[baseworkerid+i];
-		snprintf(worker->name, sizeof(worker->name), "MIC %d core %u", devid, i);
+		snprintf(worker->name, sizeof(worker->name), "SCC %d core %u", devid, i);
+		snprintf(worker->short_name, sizeof(worker->short_name), "SCC %d core %u", devid, i);
+	}
+	{
+		char thread_name[16];
+		snprintf(thread_name, sizeof(thread_name), "SCC %d", devid);
+		starpu_pthread_setname(thread_name);
 	}
 
 	_STARPU_TRACE_WORKER_INIT_END(workerid);

+ 26 - 2
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -51,6 +51,7 @@ struct _starpu_dmda_data
 
 	long int total_task_cnt;
 	long int ready_task_cnt;
+	long int eager_task_cnt; /* number of tasks scheduled without model */
 	int num_priorities;
 };
 
@@ -574,6 +575,9 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 		best = ntasks_best;
 		model_best = 0.0;
 		transfer_model_best = 0.0;
+#ifdef STARPU_VERBOSE
+		dt->eager_task_cnt++;
+#endif
 	}
 
 	//_STARPU_DEBUG("Scheduler dm: kernel (%u)\n", best_impl);
@@ -759,6 +763,13 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 	*forced_worker = unknown?ntasks_best:-1;
 	*forced_impl = unknown?nimpl_best:-1;
 
+#ifdef STARPU_VERBOSE
+	if (unknown)
+	{
+		dt->eager_task_cnt++;
+	}
+#endif
+
 	*best_exp_endp = best_exp_end;
 	*max_exp_endp = max_exp_end;
 }
@@ -1041,8 +1052,21 @@ static void initialize_dmda_sorted_policy(unsigned sched_ctx_id)
 static void deinitialize_dmda_policy(unsigned sched_ctx_id)
 {
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
-
-	_STARPU_DEBUG("total_task_cnt %ld ready_task_cnt %ld -> %f\n", dt->total_task_cnt, dt->ready_task_cnt, (100.0f*dt->ready_task_cnt)/dt->total_task_cnt);
+#if STARPU_VERBOSE
+	{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	long int modelled_task_cnt = dt->total_task_cnt - dt->eager_task_cnt;
+	_STARPU_DEBUG("%s sched policy (sched_ctx %u): total_task_cnt %ld ready_task_cnt %ld (%.1f%%), modelled_task_cnt = %ld (%.1f%%)%s\n",
+		sched_ctx->sched_policy?sched_ctx->sched_policy->policy_name:"<none>",
+		sched_ctx_id,
+		dt->total_task_cnt,
+		dt->ready_task_cnt,
+		(100.0f*dt->ready_task_cnt)/dt->total_task_cnt,
+		modelled_task_cnt,
+		(100.0f*modelled_task_cnt)/dt->total_task_cnt,
+		modelled_task_cnt==0?" *** Check if performance models are enabled and converging on a per-codelet basis, or use an non-modeling scheduling policy. ***":"");
+	}
+#endif
 
 	free(dt->queue_array);
 	free(dt);

+ 11 - 5
src/sched_policies/graph_test_policy.c

@@ -145,13 +145,19 @@ static struct _starpu_prio_deque *select_prio(unsigned sched_ctx_id, struct _sta
 
 }
 
-static void set_priority(void *_data, struct _starpu_job *job)
+static void set_priority(void *_data, struct _starpu_graph_node *node) /* Set the priority of the task of the job attached to a graph node */
 {
 	struct _starpu_graph_test_policy_data *data = _data;
-	if (data->descendants)
-		job->task->priority = job->descendants;
-	else
-		job->task->priority = job->depth;
+	STARPU_PTHREAD_MUTEX_LOCK(&node->mutex); /* node->job is read and used with node->mutex held */
+	struct _starpu_job *job = node->job;
+	if (job) /* a node without a job is left untouched */
+	{
+		if (data->descendants)
+			job->task->priority = node->descendants; /* policy data asks for the descendants count */
+		else
+			job->task->priority = node->depth; /* otherwise use the node depth */
+	}
+	STARPU_PTHREAD_MUTEX_UNLOCK(&node->mutex);
 }
 
 static void do_schedule_graph_test_policy(unsigned sched_ctx_id)

+ 2 - 2
src/sched_policies/heteroprio.c

@@ -213,10 +213,10 @@ static void initialize_heteroprio_policy(unsigned sched_ctx_id)
 	for(idx_prio = 0; idx_prio < STARPU_HETEROPRIO_MAX_PRIO; ++idx_prio)
 		_heteroprio_bucket_init(&hp->buckets[idx_prio]);
 
-	void (*init_sched)(void) = starpu_sched_ctx_get_sched_policy_init(sched_ctx_id);
+	void (*init_sched)(unsigned) = starpu_sched_ctx_get_sched_policy_init(sched_ctx_id);
 
 	if(init_sched)
-		init_sched();
+		init_sched(sched_ctx_id);
 	else
 		default_init_sched(sched_ctx_id);
 

+ 2 - 0
src/top/starpu_top_connection.c

@@ -60,6 +60,7 @@ void * message_from_ui(void * p)
 {
 	(void) p;
 	char str[STARPU_TOP_BUFFER_SIZE];
+	starpu_pthread_setname("starpu_top_message_from_ui");
 	while(1)
 	{
 		char * check=fgets (str, STARPU_TOP_BUFFER_SIZE, starpu_top_socket_fd_read);
@@ -85,6 +86,7 @@ static
 void * message_to_ui(void * p)
 {
 	(void) p;
+	starpu_pthread_setname("starpu_top_message_to_ui");
 	while(1)
 	{
 		char* message = _starpu_top_message_remove(_starpu_top_mt);

+ 64 - 0
src/util/fstarpu.c

@@ -77,6 +77,18 @@ static const intptr_t fstarpu_sched_ctx_awake_workers	= STARPU_SCHED_CTX_AWAKE_W
 static const intptr_t fstarpu_sched_ctx_policy_init	= STARPU_SCHED_CTX_POLICY_INIT;
 static const intptr_t fstarpu_sched_ctx_user_data	= STARPU_SCHED_CTX_USER_DATA;
 
+static const intptr_t fstarpu_starpu_nowhere	= STARPU_NOWHERE;
+static const intptr_t fstarpu_starpu_cpu	= STARPU_CPU;
+static const intptr_t fstarpu_starpu_cuda	= STARPU_CUDA;
+static const intptr_t fstarpu_starpu_opencl	= STARPU_OPENCL;
+static const intptr_t fstarpu_starpu_mic	= STARPU_MIC;
+static const intptr_t fstarpu_starpu_scc	= STARPU_SCC;
+
+static const intptr_t fstarpu_starpu_codelet_simgrid_execute	= STARPU_CODELET_SIMGRID_EXECUTE;
+static const intptr_t fstarpu_starpu_cuda_async	= STARPU_CUDA_ASYNC;
+static const intptr_t fstarpu_starpu_opencl_async	= STARPU_OPENCL_ASYNC;
+
+
 intptr_t fstarpu_get_constant(char *s)
 {
 	if	(!strcmp(s, "FSTARPU_R"))	{ return fstarpu_r; }
@@ -133,6 +145,17 @@ intptr_t fstarpu_get_constant(char *s)
 	else if (!strcmp(s, "FSTARPU_SCHED_CTX_POLICY_INIT"))	{ return fstarpu_sched_ctx_policy_init; }
 	else if (!strcmp(s, "FSTARPU_SCHED_CTX_USER_DATA"))	{ return fstarpu_sched_ctx_user_data; }
 
+	else if (!strcmp(s, "FSTARPU_NOWHERE"))	{ return fstarpu_starpu_nowhere; }
+	else if (!strcmp(s, "FSTARPU_CPU"))	{ return fstarpu_starpu_cpu; }
+	else if (!strcmp(s, "FSTARPU_CUDA"))	{ return fstarpu_starpu_cuda; }
+	else if (!strcmp(s, "FSTARPU_OPENCL"))	{ return fstarpu_starpu_opencl; }
+	else if (!strcmp(s, "FSTARPU_MIC"))	{ return fstarpu_starpu_mic; }
+	else if (!strcmp(s, "FSTARPU_SCC"))	{ return fstarpu_starpu_scc; }
+
+	else if (!strcmp(s, "FSTARPU_CODELET_SIMGRID_EXECUTE"))	{ return fstarpu_starpu_codelet_simgrid_execute; }
+	else if (!strcmp(s, "FSTARPU_CUDA_ASYNC"))	{ return fstarpu_starpu_cuda_async; }
+	else if (!strcmp(s, "FSTARPU_OPENCL_ASYNC"))	{ return fstarpu_starpu_opencl_async; }
+
 	else { _FSTARPU_ERROR("unknown constant"); }
 }
 
@@ -259,6 +282,21 @@ void fstarpu_codelet_add_cuda_func(struct starpu_codelet *cl, void *f_ptr)
 	_FSTARPU_ERROR("fstarpu: too many cuda functions in Fortran codelet");
 }
 
+/* Store flags in the first unused slot of cl->cuda_flags.
+ * A slot counts as unused while it holds 0, and the last array entry is never
+ * written. When no slot is left, _FSTARPU_ERROR is invoked. */
+void fstarpu_codelet_add_cuda_flags(struct starpu_codelet *cl, intptr_t flags)
+{
+	const size_t max_cuda_flags = sizeof(cl->cuda_flags)/sizeof(cl->cuda_flags[0])-1;
+	unsigned i;
+	for (i = 0; i < max_cuda_flags; i++)
+	{
+		/* Slots hold integer flag values, not pointers: compare with 0, not NULL */
+		if (cl->cuda_flags[i] == 0)
+		{
+			cl->cuda_flags[i] = (char)flags;
+			return;
+		}
+	}
+	_FSTARPU_ERROR("fstarpu: too many cuda flags in Fortran codelet");
+}
+
 void fstarpu_codelet_add_opencl_func(struct starpu_codelet *cl, void *f_ptr)
 {
 	const size_t max_opencl_funcs = sizeof(cl->opencl_funcs)/sizeof(cl->opencl_funcs[0])-1;
@@ -274,6 +312,21 @@ void fstarpu_codelet_add_opencl_func(struct starpu_codelet *cl, void *f_ptr)
 	_FSTARPU_ERROR("fstarpu: too many opencl functions in Fortran codelet");
 }
 
+/* Store flags in the first unused slot of cl->opencl_flags.
+ * A slot counts as unused while it holds 0, and the last array entry is never
+ * written. When no slot is left, _FSTARPU_ERROR is invoked. */
+void fstarpu_codelet_add_opencl_flags(struct starpu_codelet *cl, intptr_t flags)
+{
+	const size_t max_opencl_flags = sizeof(cl->opencl_flags)/sizeof(cl->opencl_flags[0])-1;
+	unsigned i;
+	for (i = 0; i < max_opencl_flags; i++)
+	{
+		/* Slots hold integer flag values, not pointers: compare with 0, not NULL */
+		if (cl->opencl_flags[i] == 0)
+		{
+			cl->opencl_flags[i] = (char)flags;
+			return;
+		}
+	}
+	_FSTARPU_ERROR("fstarpu: too many opencl flags in Fortran codelet");
+}
+
 void fstarpu_codelet_add_mic_func(struct starpu_codelet *cl, void *f_ptr)
 {
 	const size_t max_mic_funcs = sizeof(cl->mic_funcs)/sizeof(cl->mic_funcs[0])-1;
@@ -341,6 +394,17 @@ void fstarpu_codelet_set_nbuffers(struct starpu_codelet *cl, int nbuffers)
 	}
 }
 
+void fstarpu_codelet_set_flags(struct starpu_codelet *cl, intptr_t flags)
+{
+	cl->flags = (int)flags; /* overwrite the codelet flags with the intptr_t argument narrowed to int */
+}
+
+void fstarpu_codelet_set_where(struct starpu_codelet *cl, intptr_t where)
+{
+	STARPU_ASSERT(where >= 0); /* negative values are rejected before the conversion to uint32_t */
+	cl->where = (uint32_t)where; /* overwrite the codelet's where field */
+}
+
 void * fstarpu_variable_get_ptr(void *buffers[], int i)
 {
 	return (void *)STARPU_VECTOR_GET_PTR(buffers[i]);

+ 5 - 0
tests/microbenchs/tasks_size_overhead.c

@@ -120,7 +120,12 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+#ifdef STARPU_SIMGRID
+	/* This will get serialized, avoid spending too much time on it. */
+	totcpus = 2;
+#else
 	totcpus = starpu_worker_get_count_by_type(STARPU_CPU_WORKER);
+#endif
 
 	starpu_shutdown();
 

+ 3 - 0
tools/starpu_tasks_rec_complete.c

@@ -62,6 +62,9 @@ int main(int argc, char *argv[]) {
 		}
 	}
 
+#ifdef STARPU_HAVE_SETENV
+	setenv("STARPU_FXT_TRACE", "0", 1);
+#endif
 	if (starpu_init(NULL) != 0)
 	{
 		fprintf(stderr, "StarPU initialization failure\n");