Przeglądaj źródła

merge from trunk

Corentin Salingue 9 lat temu
rodzic
commit
41101d149b
83 zmienionych plików z 1488 dodań i 358 usunięć
  1. 69 53
      configure.ac
  2. 31 0
      doc/doxygen/chapters/330_scheduling_contexts.doxy
  3. 10 0
      doc/doxygen/chapters/501_environment_variables.doxy
  4. 4 4
      doc/doxygen/chapters/api/data_management.doxy
  5. 12 0
      doc/doxygen/chapters/api/scheduling_contexts.doxy
  6. 18 4
      examples/Makefile.am
  7. 4 1
      examples/pipeline/pipeline.c
  8. 75 0
      examples/sched_ctx/axpy_partition_gpu.cu
  9. 137 0
      examples/sched_ctx/axpy_partition_gpu.h
  10. 7 0
      examples/sched_ctx/dummy_sched_with_ctx.c
  11. 253 0
      examples/sched_ctx/gpu_partition.c
  12. 4 0
      examples/scheduler/dummy_sched.c
  13. 21 7
      gcc-plugin/tests/register.c
  14. 3 0
      include/starpu_config.h.in
  15. 1 1
      include/starpu_data.h
  16. 2 2
      include/starpu_sched_component.h
  17. 8 1
      include/starpu_sched_ctx.h
  18. 3 2
      include/starpu_scheduler.h
  19. 2 0
      include/starpu_task.h
  20. 4 0
      include/starpu_thread.h
  21. 7 1
      include/starpu_worker.h
  22. 1 1
      min-dgels/Makefile.in
  23. 3 2
      min-dgels/base/make.inc
  24. 2 2
      mpi/examples/comm/comm.c
  25. 2 0
      mpi/examples/matrix_decomposition/mpi_cholesky.c
  26. 5 3
      mpi/examples/matrix_decomposition/mpi_decomposition_matrix.c
  27. 5 2
      mpi/examples/mpi_lu/plu_example.c
  28. 6 3
      mpi/examples/mpi_lu/plu_implicit_example.c
  29. 6 3
      mpi/examples/mpi_lu/plu_outofcore_example.c
  30. 1 1
      mpi/src/starpu_mpi.c
  31. 5 1
      mpi/tests/mpi_reduction.c
  32. 3 3
      socl/src/cl_createcontextfromtype.c
  33. 2 2
      socl/src/cl_enqueuendrangekernel.c
  34. 6 3
      socl/src/cl_getdeviceids.c
  35. 54 24
      socl/src/init.c
  36. 2 2
      socl/src/init.h
  37. 21 10
      src/common/graph.c
  38. 1 1
      src/common/list.h
  39. 9 3
      src/common/thread.c
  40. 13 0
      src/common/uthash.h
  41. 11 26
      src/common/utils.c
  42. 1 0
      src/core/dependencies/tags.c
  43. 2 2
      src/core/perfmodel/multiple_regression.c
  44. 4 0
      src/core/perfmodel/perfmodel.c
  45. 124 17
      src/core/sched_ctx.c
  46. 14 2
      src/core/sched_ctx.h
  47. 59 8
      src/core/sched_policy.c
  48. 32 17
      src/core/simgrid.c
  49. 21 11
      src/core/task.c
  50. 35 10
      src/core/topology.c
  51. 110 16
      src/core/workers.c
  52. 9 0
      src/core/workers.h
  53. 1 1
      src/datawizard/coherency.c
  54. 7 0
      src/datawizard/filters.c
  55. 30 14
      src/datawizard/interfaces/data_interface.c
  56. 2 1
      src/datawizard/malloc.c
  57. 3 3
      src/datawizard/user_interactions.c
  58. 72 23
      src/drivers/cuda/driver_cuda.c
  59. 1 0
      src/drivers/cuda/driver_cuda.h
  60. 2 3
      src/drivers/gordon/driver_gordon.c
  61. 2 4
      src/sched_policies/component_worker.c
  62. 87 48
      src/sched_policies/deque_modeling_policy_data_aware.c
  63. 2 1
      src/sched_policies/eager_central_policy.c
  64. 2 1
      src/sched_policies/eager_central_priority_policy.c
  65. 2 0
      src/sched_policies/fifo_queues.c
  66. 2 0
      src/sched_policies/fifo_queues.h
  67. 2 1
      src/sched_policies/heteroprio.c
  68. 2 2
      src/sched_policies/parallel_heft.c
  69. 3 1
      src/sched_policies/work_stealing_policy.c
  70. 1 1
      src/util/fstarpu.c
  71. 1 1
      src/util/openmp_runtime_support.h
  72. 1 1
      src/util/openmp_runtime_support_environment.c
  73. 1 0
      tests/datawizard/dsm_stress.c
  74. 1 0
      tests/datawizard/sync_with_data_with_mem_non_blocking_implicit.c
  75. 1 0
      tests/main/driver_api/init_run_deinit.c
  76. 1 0
      tests/main/regenerate.c
  77. 1 0
      tests/main/regenerate_pipeline.c
  78. 1 0
      tests/main/subgraph_repeat.c
  79. 1 0
      tests/main/subgraph_repeat_regenerate.c
  80. 1 0
      tests/main/subgraph_repeat_regenerate_tag.c
  81. 1 0
      tests/main/subgraph_repeat_tag.c
  82. 8 1
      tests/microbenchs/tasks_size_overhead.c
  83. 2 0
      tests/sched_policies/simple_cpu_gpu_sched.c

+ 69 - 53
configure.ac

@@ -1118,7 +1118,8 @@ if test x$enable_simgrid = xyes ; then
 		]
 		]
 	)
 	)
 	AC_CHECK_HEADERS([simgrid/msg.h], [AC_DEFINE([STARPU_HAVE_SIMGRID_MSG_H], [1], [Define to 1 if you have msg.h in simgrid/.])])
 	AC_CHECK_HEADERS([simgrid/msg.h], [AC_DEFINE([STARPU_HAVE_SIMGRID_MSG_H], [1], [Define to 1 if you have msg.h in simgrid/.])])
-   	AC_CHECK_FUNCS([MSG_process_join MSG_process_attach MSG_get_as_by_name MSG_environment_get_routing_root MSG_host_get_speed xbt_mutex_try_acquire smpi_process_set_user_data])
+	AC_CHECK_HEADERS([xbt/synchro.h], [AC_DEFINE([STARPU_HAVE_XBT_SYNCHRO_H], [1], [Define to 1 if you have synchro.h in xbt/.])])
+   	AC_CHECK_FUNCS([MSG_process_join MSG_process_attach MSG_get_as_by_name MSG_environment_get_routing_root MSG_host_get_speed xbt_mutex_try_acquire smpi_process_set_user_data sg_link_name])
 	AC_CHECK_FUNCS([xbt_barrier_init], [AC_DEFINE([STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT], [1], [Define to 1 if you have the `xbt_barrier_init' function.])])
 	AC_CHECK_FUNCS([xbt_barrier_init], [AC_DEFINE([STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT], [1], [Define to 1 if you have the `xbt_barrier_init' function.])])
 	AC_CHECK_DECLS([smpi_process_set_user_data], [], [], [[#include <smpi/smpi.h>]])
 	AC_CHECK_DECLS([smpi_process_set_user_data], [], [], [[#include <smpi/smpi.h>]])
 	AC_CHECK_FUNCS([SIMIX_process_get_code], [AC_DEFINE([STARPU_SIMGRID_HAVE_SIMIX_PROCESS_GET_CODE], [1], [Define to 1 if you have the `SIMIX_process_get_code' function.])])
 	AC_CHECK_FUNCS([SIMIX_process_get_code], [AC_DEFINE([STARPU_SIMGRID_HAVE_SIMIX_PROCESS_GET_CODE], [1], [Define to 1 if you have the `SIMIX_process_get_code' function.])])
@@ -1171,46 +1172,6 @@ fi
 
 
 ###############################################################################
 ###############################################################################
 #                                                                             #
 #                                                                             #
-#			 Multiple linear regression			      #
-#                                                                             #
-###############################################################################
-AC_ARG_ENABLE(mlr, [AS_HELP_STRING([--disable-mlr],
-			[Disable multiple linear regression models])],
-			enable_mlr=$enableval, enable_mlr=yes)
-
-AC_MSG_CHECKING(whether multiple linear regression models are disabled)
-if test x$enable_mlr = xyes -a "$starpu_windows" != "yes" ; then
-   	AC_MSG_RESULT(no)
-	install_min_dgels=no
-   	STARPU_SEARCH_LIBS(LAPACK,[dgels_],[lapack],use_system_lapack=yes,,)
-	if test x$use_system_lapack = xyes; then
-	        AC_DEFINE(STARPU_MLR_MODEL, [1], [use reflapack library])
-		LDFLAGS="-llapack $LDFLAGS"
-	else
-		AC_MSG_CHECKING(whether min-dgels is linked)
-		if test x"$DGELS_LIBS" != x; then
-		   	AC_MSG_RESULT(yes)
-        		AC_DEFINE(STARPU_MLR_MODEL, [1], [use user defined library])
-			AC_ARG_VAR([DGELS_LIBS], [linker flags for lapack dgels])
-		else
-			AC_MSG_RESULT(no)
-			AC_MSG_CHECKING(min-dgels source)
-			cp -r $srcdir/min-dgels $PWD/
-			AC_MSG_RESULT(yes)
-			DGELS_LIBS="-Wl,--start-group $STARPU_BUILD_DIR/min-dgels/build/*.a -Wl,--end-group"
-			AC_DEFINE(STARPU_MLR_MODEL, [1], [use user defined library])
-			AC_ARG_VAR([DGELS_LIBS], [linker flags for lapack dgels])
-			install_min_dgels=yes
-		fi
-	fi
-else
- 	AC_MSG_RESULT(yes)
-	install_min_dgels=no
-fi
-AM_CONDITIONAL(STARPU_USE_MIN_DGELS, test x$install_min_dgels = xyes)
-
-###############################################################################
-#                                                                             #
 #                                 MIC settings                                #
 #                                 MIC settings                                #
 #                                                                             #
 #                                                                             #
 ###############################################################################
 ###############################################################################
@@ -2036,8 +1997,12 @@ AC_ARG_ENABLE(maxnodes, [AS_HELP_STRING([--enable-maxnodes=<nnodes>],
 
 
 if test x$maxnodes = x0 ; then
 if test x$maxnodes = x0 ; then
 	if test x$enable_simgrid = xyes ; then
 	if test x$enable_simgrid = xyes ; then
-		# We still need the room for the virtual CUDA/OpenCL devices
-		maxnodes=16
+		# We need the room for the virtual CUDA/OpenCL devices
+		nodes=`expr 4 + $nmaxcudadev + $nmaxopencldev + $nmaxmicdev + 1 + $nmaxmpidev`
+		if test $nodes -gt 32
+		then
+			nodes=32
+		fi
 	else
 	else
 		# We have one memory node shared by all CPU workers, one node per GPU
 		# We have one memory node shared by all CPU workers, one node per GPU
 		# and per MIC device
 		# and per MIC device
@@ -2061,16 +2026,16 @@ if test x$maxnodes = x0 ; then
 			nodes=`expr $nodes + 1`
 			nodes=`expr $nodes + 1`
 		fi
 		fi
 
 
-        #nmaxmpidev = 0 if mpi master-slave is disabled
-        nodes=`expr $nodes + $nmaxmpidev`
-
-		# set maxnodes to the next power of 2 greater than nodes
-		maxnodes=1
-		while test "$maxnodes" -lt "$nodes"
-		do
-			maxnodes=`expr $maxnodes \* 2`
-		done
+		#nmaxmpidev = 0 if mpi master-slave is disabled
+		nodes=`expr $nodes + $nmaxmpidev`
  	fi
  	fi
+
+	# set maxnodes to the next power of 2 greater than nodes
+	maxnodes=1
+	while test "$maxnodes" -lt "$nodes"
+	do
+		maxnodes=`expr $maxnodes \* 2`
+	done
 fi
 fi
 if test $maxnodes -gt 32 ; then
 if test $maxnodes -gt 32 ; then
 	AC_MSG_ERROR([selected number of nodes ($maxnodes) can not be greater than 32])
 	AC_MSG_ERROR([selected number of nodes ($maxnodes) can not be greater than 32])
@@ -2334,6 +2299,11 @@ AC_DEFUN([IS_SUPPORTED_CFLAG],
 IS_SUPPORTED_CFLAG(-Wall)
 IS_SUPPORTED_CFLAG(-Wall)
 IS_SUPPORTED_CFLAG(-Werror=implicit)
 IS_SUPPORTED_CFLAG(-Werror=implicit)
 IS_SUPPORTED_CFLAG(-Werror=implicit-function-declaration)
 IS_SUPPORTED_CFLAG(-Werror=implicit-function-declaration)
+if test x$enable_perf_debug = xyes; then
+	IS_SUPPORTED_CFLAG(-no-pie)
+	IS_SUPPORTED_CFLAG(-no-PIE)
+	IS_SUPPORTED_CFLAG(-fno-pie)
+fi
 
 
 if test "x$STARPU_DEVEL" != x; then
 if test "x$STARPU_DEVEL" != x; then
 	AC_DEFINE(STARPU_DEVEL, [1], [enable developer warnings])
 	AC_DEFINE(STARPU_DEVEL, [1], [enable developer warnings])
@@ -2796,6 +2766,52 @@ AC_MSG_CHECKING(which BLAS lib should be used)
 AC_MSG_RESULT($blas_lib)
 AC_MSG_RESULT($blas_lib)
 AC_SUBST(BLAS_LIB,$blas_lib)
 AC_SUBST(BLAS_LIB,$blas_lib)
 
 
+###############################################################################
+#                                                                             #
+#			 Multiple linear regression			      #
+#                                                                             #
+###############################################################################
+AC_ARG_ENABLE(mlr, [AS_HELP_STRING([--disable-mlr],
+			[Disable multiple linear regression models])],
+			enable_mlr=$enableval, enable_mlr=yes)
+
+AC_MSG_CHECKING(whether multiple linear regression models are disabled)
+if test x$enable_mlr = xyes -a "$starpu_windows" != "yes" ; then
+   	AC_MSG_RESULT(no)
+	install_min_dgels=no
+	support_mlr=yes
+   	STARPU_SEARCH_LIBS(LAPACK,[dgels_],[lapack],use_system_lapack=yes,,)
+	if test x$use_system_lapack = xyes; then
+	   	AC_DEFINE(STARPU_MLR_MODEL, [1], [use reflapack library])
+		LDFLAGS="-llapack $LDFLAGS"
+	else
+		if test x$blas_lib = xmkl; then
+		   	AC_DEFINE(STARPU_MLR_MODEL, [1], [use mkl library])			
+		else
+			AC_MSG_CHECKING(whether min-dgels is linked)
+			if test x"$DGELS_LIBS" != x; then
+		   	   	AC_MSG_RESULT(yes)
+        		   	AC_DEFINE(STARPU_MLR_MODEL, [1], [use user defined library])
+			   	AC_ARG_VAR([DGELS_LIBS], [linker flags for lapack dgels])
+			else
+				AC_MSG_RESULT(no)
+				AC_MSG_CHECKING(min-dgels source)
+				cp -r $srcdir/min-dgels $PWD/
+				AC_MSG_RESULT(yes)
+				DGELS_LIBS="-Wl,--start-group $STARPU_BUILD_DIR/min-dgels/build/*.a -Wl,--end-group"
+				AC_DEFINE(STARPU_MLR_MODEL, [1], [use user defined library])
+				AC_ARG_VAR([DGELS_LIBS], [linker flags for lapack dgels])
+				install_min_dgels=yes
+			fi
+		fi
+	fi
+else
+ 	AC_MSG_RESULT(yes)
+	install_min_dgels=no
+	support_mlr=no
+fi
+AM_CONDITIONAL(STARPU_USE_MIN_DGELS, test x$install_min_dgels = xyes)
+
 ##########################################
 ##########################################
 # FFT                                    #
 # FFT                                    #
 ##########################################
 ##########################################
@@ -3203,7 +3219,7 @@ AC_MSG_NOTICE([
                ayudame enabled:                               $ayu_msg
                ayudame enabled:                               $ayu_msg
 	       Native fortran support:                        $enable_build_fortran
 	       Native fortran support:                        $enable_build_fortran
 	       Native MPI fortran support:                    $use_mpi_fort
 	       Native MPI fortran support:                    $use_mpi_fort
-	       Support for multiple linear regression models: $install_min_dgels
+	       Support for multiple linear regression models: $support_mlr
 ])
 ])
 
 
 if test "$build_socl" = "yes" -a "$run_socl_check" = "no" ; then
 if test "$build_socl" = "yes" -a "$run_socl_check" = "no" ; then

+ 31 - 0
doc/doxygen/chapters/330_scheduling_contexts.doxy

@@ -3,6 +3,7 @@
 //  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
 //  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
  * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * Copyright (C) 2011, 2012 INRIA
+ * Copyright (C) 2016 Uppsala University
  * See the file version.doxy for copying conditions.
  * See the file version.doxy for copying conditions.
  */
  */
 
 
@@ -96,6 +97,36 @@ int id_ctx = starpu_sched_ctx_create(workerids, 3, "my_ctx", STARPU_SCHED_CTX_PO
 /* .... */
 /* .... */
 \endcode
 \endcode
 
 
+\section CreatingAContext Creating A Context To Partition a GPU
+
+The contexts can also be used to group set of SMs of an NVIDIA GPU in order to isolate
+the parallel kernels and allow them to coexecution on a specified partiton of the GPU.
+
+Each context will be mapped to a stream and the user can indicate the number of SMs.
+The context can be added to a larger context already grouping CPU cores. 
+This larger context can use a scheduling policy that assigns tasks to both CPUs and contexts (partitions of the GPU)
+based on performance models adjusted to the number of SMs.
+
+The GPU implementation of the task has to be modified accordingly and receive as a parameter the number of SMs.
+
+\code{.c}
+/* get the available streams (suppose we have nstreams = 2 by specifying them with STARPU_NWORKER_PER_CUDA=2  */
+int nstreams = starpu_worker_get_stream_workerids(gpu_devid, stream_workerids, STARPU_CUDA_WORKER);
+
+int sched_ctx[nstreams];
+sched_ctx[0] = starpu_sched_ctx_create(&stream_workerids[0], 1, "subctx",  STARPU_SCHED_CTX_CUDA_NSMS, 6, 0);
+sched_ctx[1] = starpu_sched_ctx_create(&stream_workerids[1], 1, "subctx",  STARPU_SCHED_CTX_CUDA_NSMS, 7, 0);
+
+int ncpus = 4;
+int workers[ncpus+nstreams];
+workers[ncpus+0] = stream_workerids[0];
+workers[ncpus+1] = stream_workerids[1];
+
+big_sched_ctx = starpu_sched_ctx_create(workers, ncpus+nstreams, "ctx1", STARPU_SCHED_CTX_SUB_CTXS, sched_ctxs, nstreams, STARPU_SCHED_CTX_POLICY_NAME, "dmdas", 0); 
+
+starpu_task_submit_to_ctx(task, big_sched_ctx);
+
+\endcode
 
 
 \section ModifyingAContext Modifying A Context
 \section ModifyingAContext Modifying A Context
 
 

+ 10 - 0
doc/doxygen/chapters/501_environment_variables.doxy

@@ -3,6 +3,7 @@
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
  * Copyright (C) 2011, 2012, 2016 INRIA
  * Copyright (C) 2011, 2012, 2016 INRIA
+ * Copyright (C) 2016 Uppsala University
  * See the file version.doxy for copying conditions.
  * See the file version.doxy for copying conditions.
  */
  */
 
 
@@ -51,6 +52,15 @@ Specify the number of workers per CUDA device, and thus the number of kernels
 which will be concurrently running on the devices. The default value is 1.
 which will be concurrently running on the devices. The default value is 1.
 </dd>
 </dd>
 
 
+<dt>STARPU_NWORKER_PER_CUDA</dt>
+<dd>
+\anchor STARPU_CUDA_THREAD_PER_WORKER
+\addindex __env__STARPU_CUDA_THREAD_PER_WORKER
+Specify if the cuda driver should provide a thread per stream or a single thread 
+dealing with all the streams. 0 if one thread per stream, 1 otherwise. The default 
+value is 1.
+</dd>
+
 <dt>STARPU_CUDA_PIPELINE</dt>
 <dt>STARPU_CUDA_PIPELINE</dt>
 <dd>
 <dd>
 \anchor STARPU_CUDA_PIPELINE
 \anchor STARPU_CUDA_PIPELINE

+ 4 - 4
doc/doxygen/chapters/api/data_management.doxy

@@ -307,7 +307,7 @@ completion, this function returns 0.
 This macro can be used to acquire data, but not require it to be available on a given node, only enforce R/W dependencies.
 This macro can be used to acquire data, but not require it to be available on a given node, only enforce R/W dependencies.
 This can for instance be used to wait for tasks which produce the data, but without requesting a fetch to the main memory.
 This can for instance be used to wait for tasks which produce the data, but without requesting a fetch to the main memory.
 
 
-\def STARPU_ACQUIRE_ALL_NODES
+\def STARPU_ACQUIRE_NO_NODE_LOCK_ALL
 \ingroup API_Data_Management
 \ingroup API_Data_Management
 This is the same as STARPU_ACQUIRE_NO_NODE, but will lock the data on all nodes, preventing them from being evicted for instance.
 This is the same as STARPU_ACQUIRE_NO_NODE, but will lock the data on all nodes, preventing them from being evicted for instance.
 This is mostly useful inside starpu only.
 This is mostly useful inside starpu only.
@@ -317,7 +317,7 @@ This is mostly useful inside starpu only.
 This is the same as starpu_data_acquire(), except that the data
 This is the same as starpu_data_acquire(), except that the data
 will be available on the given memory node instead of main
 will be available on the given memory node instead of main
 memory.
 memory.
-STARPU_ACQUIRE_NO_NODE and STARPU_ACQUIRE_ALL_NODES can be used instead of an
+STARPU_ACQUIRE_NO_NODE and STARPU_ACQUIRE_NO_NODE_LOCK_ALL can be used instead of an
 explicit node number.
 explicit node number.
 
 
 \fn int starpu_data_acquire_on_node_cb(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg)
 \fn int starpu_data_acquire_on_node_cb(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg)
@@ -325,7 +325,7 @@ explicit node number.
 This is the same as starpu_data_acquire_cb(), except that the
 This is the same as starpu_data_acquire_cb(), except that the
 data will be available on the given memory node instead of main
 data will be available on the given memory node instead of main
 memory.
 memory.
-STARPU_ACQUIRE_NO_NODE and STARPU_ACQUIRE_ALL_NODES can be used instead of an
+STARPU_ACQUIRE_NO_NODE and STARPU_ACQUIRE_NO_NODE_LOCK_ALL can be used instead of an
 explicit node number.
 explicit node number.
 
 
 \fn int starpu_data_acquire_on_node_cb_sequential_consistency(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg, int sequential_consistency)
 \fn int starpu_data_acquire_on_node_cb_sequential_consistency(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg, int sequential_consistency)
@@ -333,7 +333,7 @@ explicit node number.
 This is the same as starpu_data_acquire_cb_sequential_consistency(), except that the
 This is the same as starpu_data_acquire_cb_sequential_consistency(), except that the
 data will be available on the given memory node instead of main
 data will be available on the given memory node instead of main
 memory.
 memory.
-STARPU_ACQUIRE_NO_NODE and STARPU_ACQUIRE_ALL_NODES can be used instead of an
+STARPU_ACQUIRE_NO_NODE and STARPU_ACQUIRE_NO_NODE_LOCK_ALL can be used instead of an
 explicit node number.
 explicit node number.
 
 
 \def STARPU_DATA_ACQUIRE_CB(handle, mode, code)
 \def STARPU_DATA_ACQUIRE_CB(handle, mode, code)

+ 12 - 0
doc/doxygen/chapters/api/scheduling_contexts.doxy

@@ -3,6 +3,7 @@
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * Copyright (C) 2011, 2012 INRIA
+ * Copyright (C) 2016 Uppsala University
  * See the file version.doxy for copying conditions.
  * See the file version.doxy for copying conditions.
  */
  */
 
 
@@ -106,6 +107,17 @@ function pointer allowing to initialize the scheduling policy.
 This macro is used when calling starpu_sched_ctx_create() to specify a
 This macro is used when calling starpu_sched_ctx_create() to specify a
 pointer to some user data related to the context being created.
 pointer to some user data related to the context being created.
 
 
+\def STARPU_SCHED_CTX_SUB_CTXS
+\ingroup API_Scheduling_Contexts
+This macro is used when calling starpu_sched_ctx_create() to specify 
+a list of sub contextes of the current context.
+
+\def STARPU_SCHED_CTX_CUDA_NSMS
+\ingroup API_Scheduling_Contexts
+This macro is used when calling starpu_sched_ctx_create() in order
+to create a context on the NVIDIA GPU to specify the number of SMs
+the context should have
+
 \fn unsigned starpu_sched_ctx_create_inside_interval(const char *policy_name, const char *sched_ctx_name, int min_ncpus, int max_ncpus, int min_ngpus, int max_ngpus, unsigned allow_overlap)
 \fn unsigned starpu_sched_ctx_create_inside_interval(const char *policy_name, const char *sched_ctx_name, int min_ncpus, int max_ncpus, int min_ngpus, int max_ngpus, unsigned allow_overlap)
 \ingroup API_Scheduling_Contexts
 \ingroup API_Scheduling_Contexts
 Create a context indicating an approximate interval of resources
 Create a context indicating an approximate interval of resources

+ 18 - 4
examples/Makefile.am

@@ -5,6 +5,7 @@
 # Copyright (C) 2011  Télécom-SudParis
 # Copyright (C) 2011  Télécom-SudParis
 # Copyright (C) 2011-2012  INRIA
 # Copyright (C) 2011-2012  INRIA
 # Copyright (C) 2015-2016  Inria
 # Copyright (C) 2015-2016  Inria
+# Copyright (C) 2016  Uppsala University
 #
 #
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
 # it under the terms of the GNU Lesser General Public License as published by
@@ -73,7 +74,10 @@ EXTRA_DIST = 					\
 	reductions/dot_product_opencl_kernels.cl	\
 	reductions/dot_product_opencl_kernels.cl	\
 	scheduler/schedulers.sh				\
 	scheduler/schedulers.sh				\
 	scheduler/schedulers_context.sh			\
 	scheduler/schedulers_context.sh			\
-	fortran/Makefile
+	fortran/Makefile				\
+	sched_ctx/axpy_partition_gpu.h				\
+	sched_ctx/axpy_partition_gpu.cu		
+
 
 
 CLEANFILES = *.gcno *.gcda *.linkinfo *.mod starpu_idle_microsec.log
 CLEANFILES = *.gcno *.gcda *.linkinfo *.mod starpu_idle_microsec.log
 
 
@@ -138,7 +142,8 @@ noinst_HEADERS = 				\
 	pi/SobolQRNG/sobol_gpu.h		\
 	pi/SobolQRNG/sobol_gpu.h		\
 	pi/SobolQRNG/sobol_primitives.h         \
 	pi/SobolQRNG/sobol_primitives.h         \
 	reductions/dot_product.h                \
 	reductions/dot_product.h                \
-	basic_examples/vector_scal_cpu_template.h
+	basic_examples/vector_scal_cpu_template.h \
+	sched_ctx/axpy_partition_gpu.h				
 
 
 #####################################
 #####################################
 # What to install and what to check #
 # What to install and what to check #
@@ -229,7 +234,8 @@ STARPU_EXAMPLES +=				\
 	sched_ctx/dummy_sched_with_ctx		\
 	sched_ctx/dummy_sched_with_ctx		\
 	worker_collections/worker_tree_example  \
 	worker_collections/worker_tree_example  \
 	reductions/dot_product			\
 	reductions/dot_product			\
-	reductions/minmax_reduction
+	reductions/minmax_reduction		\
+	sched_ctx/gpu_partition
 
 
 endif
 endif
 
 
@@ -337,6 +343,14 @@ endif
 
 
 endif !STARPU_SIMGRID
 endif !STARPU_SIMGRID
 
 
+sched_ctx_gpu_partition_SOURCES =		\
+	sched_ctx/gpu_partition.c
+
+if STARPU_USE_CUDA
+sched_ctx_gpu_partition_SOURCES +=		\
+	sched_ctx/axpy_partition_gpu.cu
+endif
+
 ##################
 ##################
 # Basic examples #
 # Basic examples #
 ##################
 ##################
@@ -851,7 +865,7 @@ endif
 
 
 cpp_add_vectors_SOURCES	=	\
 cpp_add_vectors_SOURCES	=	\
 	cpp/add_vectors.cpp
 	cpp/add_vectors.cpp
-	
+
 if STARPU_HAVE_CXX11
 if STARPU_HAVE_CXX11
 cpp_add_vectors_cpp11_SOURCES	=	\
 cpp_add_vectors_cpp11_SOURCES	=	\
 	cpp/add_vectors_cpp11.cpp
 	cpp/add_vectors_cpp11.cpp

+ 4 - 1
examples/pipeline/pipeline.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2012, 2013, 2014  CNRS
  * Copyright (C) 2012, 2013, 2014  CNRS
- * Copyright (C) 2012, 2014  Université de Bordeaux
+ * Copyright (C) 2012, 2014, 2016  Université de Bordeaux
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -202,7 +202,10 @@ int main(void)
 		float y = 2*l;
 		float y = 2*l;
 		/* First wait for the C previous concurrent stages */
 		/* First wait for the C previous concurrent stages */
 		if (l >= C)
 		if (l >= C)
+		{
+			starpu_do_schedule();
 			sem_wait(&sems[l%C]);
 			sem_wait(&sems[l%C]);
+		}
 
 
 		/* Now submit the next stage */
 		/* Now submit the next stage */
 		ret = starpu_task_insert(&pipeline_codelet_x,
 		ret = starpu_task_insert(&pipeline_codelet_x,

+ 75 - 0
examples/sched_ctx/axpy_partition_gpu.cu

@@ -0,0 +1,75 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2016  Uppsala University
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This creates two dumb vectors, splits them into chunks, and for each pair of
+ * chunk, run axpy on them.
+ */
+
+#include <starpu.h>
+#include "axpy_partition_gpu.h"
+#include <stdio.h>
+
+//This code demonstrates how to transform a kernel to execute on a given set of GPU SMs.
+
+
+// Original kernel
+__global__ void saxpy(int n, float a, float *x, float *y)
+{
+	int i = blockIdx.x*blockDim.x + threadIdx.x;
+	if (i<n)  y[i] = a*x[i] + y[i];
+}
+
+
+
+
+// Transformed kernel
+__global__ void saxpy_partitioned(__P_KARGS, int n, float a, float *x, float *y)
+{
+  __P_BEGIN;
+  __P_LOOPX;
+        int i = blockid.x*blockDim.x + threadIdx.x; // note that blockIdx is replaced.
+	if (i<n)  y[i] = a*x[i] + y[i];
+  __P_LOOPEND;
+}
+      
+
+extern "C" void cuda_axpy(void *descr[], void *_args)
+{
+	 float a = *((float *)_args);
+
+        unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
+
+        float *x = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
+        float *y = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	int SM_mapping_start = -1;
+	int SM_mapping_end = -1; 
+  	int SM_allocation = -1;
+  
+	cudaStream_t stream = starpu_cuda_get_local_stream();
+	int workerid = starpu_worker_get_id();
+    	starpu_sched_ctx_get_sms_interval(workerid, &SM_mapping_start, &SM_mapping_end);
+	SM_allocation = SM_mapping_end - SM_mapping_start;
+	int dimensions = 512;	
+	//partitioning setup
+//	int SM_mapping_start = 0;
+//  	int SM_allocation = 13;
+  
+	__P_HOSTSETUP(saxpy_partitioned,dim3(dimensions,1,1),dimensions,0,SM_mapping_start,SM_allocation,stream);
+
+  	saxpy_partitioned<<<width,dimensions,0,stream>>>(__P_HKARGS,n,a,x,y);
+}

+ 137 - 0
examples/sched_ctx/axpy_partition_gpu.h

@@ -0,0 +1,137 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2016  Uppsala University
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This creates two dumb vectors, splits them into chunks, and for each pair of
+ * chunk, run axpy on them.
+ */
+
+#pragma once
+
+
+__device__ static uint get_smid(void) {
+#if defined(__CUDACC__)
+  uint ret;
+  asm("mov.u32 %0, %smid;" : "=r"(ret) );
+  return ret;
+#else
+  return 0;
+#endif
+}
+
+
+#define __P_HKARGS    dimGrid,     active_blocks     ,occupancy,               block_assignment_d,   mapping_start
+#define __P_KARGS dim3 blocks, int active_blocks, int occupancy, unsigned int* block_assignment, int mapping_start
+
+#define __P_DARGS blocks,blockid
+
+#define __P_BEGIN							\
+__shared__ unsigned int block_start;					\
+int smid = get_smid();							\
+if(threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)		\
+  {									\
+    block_start = atomicDec(&block_assignment[smid],0xDEADBEEF);	\
+  }									\
+__syncthreads();							\
+									\
+if(block_start > active_blocks)						\
+  {									\
+    return;								\
+  }									
+
+#define __P_LOOPXY							\
+  dim3 blockid;								\
+  blockid.z = 0;							\
+									\
+  int gridDim_sum = blocks.x*blocks.y;					\
+  int startBlock = block_start + (smid - mapping_start) * occupancy;	\
+									\
+  for(int blockid_sum = startBlock; blockid_sum < gridDim_sum; blockid_sum +=active_blocks) \
+    {									\
+  blockid.x = blockid_sum % blocks.x;					\
+  blockid.y = blockid_sum / blocks.x;
+
+#define __P_LOOPEND }
+// Needed if shared memory is used
+#define __P_LOOPEND_SAFE __syncthreads(); }
+
+#define __P_LOOPX							\
+  dim3 blockid;								\
+  blockid.z = 0;							\
+  blockid.y = 0;							\
+  int gridDim_sum = blocks.x;						\
+  int startBlock = (smid-mapping_start) + block_start*(active_blocks/occupancy); \
+									\
+  for(int blockid_sum = startBlock; blockid_sum < gridDim_sum; blockid_sum +=active_blocks) \
+    {									\
+  blockid.x = blockid_sum;
+
+
+  //  int startBlock = block_start + (smid - mapping_start) * occupancy; \
+
+
+//////////// HOST side functions
+
+
+template <typename F>
+static void buildPartitionedBlockMapping(F cudaFun, int threads, int shmem, int mapping_start, int allocation,
+				  int &width, int &active_blocks, unsigned int *block_assignment_d,cudaStream_t current_stream =
+#ifdef cudaStreamPerThread
+				  cudaStreamPerThread
+#else
+				  NULL
+#endif
+				  )
+{
+  int occupancy;
+  int nb_SM = 13; //TODO: replace with call
+  int mapping_end = mapping_start + allocation - 1; // exclusive
+  unsigned int block_assignment[15];
+  
+#if CUDART_VERSION >= 6050
+  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy,cudaFun,threads,shmem);
+#else
+  occupancy = 4;
+#endif
+  width = occupancy * nb_SM; // Physical wrapper grid size. Fits GPU exactly
+  active_blocks = occupancy*allocation; // The total number of blocks doing work
+
+  for(int i = 0; i < mapping_start; i++)
+    block_assignment[i] = (unsigned) -1;
+
+  for(int i = mapping_start; i <= mapping_end; i++)
+    {
+      block_assignment[i] = occupancy - 1;
+    }
+
+  for(int i = mapping_end+1; i < nb_SM; i++)
+    block_assignment[i] = (unsigned) -1;
+
+  cudaMemcpyAsync((void*)block_assignment_d,block_assignment,sizeof(block_assignment),cudaMemcpyHostToDevice, current_stream);
+  //cudaMemcpy((void*)block_assignment_d,block_assignment,sizeof(block_assignment),cudaMemcpyHostToDevice);
+}
+
+
+
+#define __P_HOSTSETUP(KERNEL,GRIDDIM,BLOCKSIZE,SHMEMSIZE,MAPPING_START,MAPPING_END,STREAM)	\
+  unsigned int* block_assignment_d; cudaMalloc((void**) &block_assignment_d,15*sizeof(unsigned int)); \
+  int width = 0;							\
+  int active_blocks = 0;						\
+  buildPartitionedBlockMapping(KERNEL,BLOCKSIZE,SHMEMSIZE,(MAPPING_START),(MAPPING_END)-(MAPPING_START), \
+			       width, active_blocks, block_assignment_d,STREAM); \
+  int occupancy = active_blocks/((MAPPING_END)-(MAPPING_START));		\
+  dim3 dimGrid = (GRIDDIM);\
+  int mapping_start = (MAPPING_START);

+ 7 - 0
examples/sched_ctx/dummy_sched_with_ctx.c

@@ -15,6 +15,13 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
  */
 
 
+/*
+ * This is an example of an application-defined scheduler run inside a
+ * scheduling context.
+ * This is a mere eager scheduler with a centralized list of tasks to schedule:
+ * when a task becomes ready (push) it is put on the list. When a device
+ * becomes ready (pop), a task is taken from the list.
+ */
 #include <starpu.h>
 #include <starpu.h>
 #include <starpu_scheduler.h>
 #include <starpu_scheduler.h>
 #include <config.h>
 #include <config.h>

+ 253 - 0
examples/sched_ctx/gpu_partition.c

@@ -0,0 +1,253 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2016  Uppsala University
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This creates two dumb vectors & run axpy on them.
+ */
+
+#include <starpu.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <math.h>
+
+#include <common/blas.h>
+
+#ifdef STARPU_USE_CUDA
+#include <cublas.h>
+#endif
+
+
+#define N	512*512
+#define NITER   100
+
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+#define EPSILON 1e-6
+
+float *_vec_x[NITER], *_vec_y[NITER];
+float _alpha = 3.41;
+
+/* descriptors for StarPU */
+starpu_data_handle_t _handle_y[NITER], _handle_x[NITER];
+
+void axpy_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
+{
+	float alpha = *((float *)arg);
+
+	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
+
+	float *block_x = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
+	float *block_y = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	unsigned i;
+	for( i = 0; i < n; i++)
+		block_y[i] = alpha * block_x[i] + block_y[i];
+}
+
+#ifdef STARPU_USE_CUDA
+extern void cuda_axpy(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args);
+#endif
+
+static struct starpu_perfmodel axpy_model =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "axpy"
+};
+
+static struct starpu_codelet axpy_cl =
+{
+	/* .cpu_funcs = {axpy_cpu}, */
+	/* .cpu_funcs_name = {"axpy_cpu"}, */
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {cuda_axpy},
+#elif defined(STARPU_SIMGRID)
+	.cuda_funcs = {(void*)1},
+#endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_RW},
+	.name = "axpy",
+	.model = &axpy_model
+};
+
+static int
+check(int niter)
+{
+	int i;
+	for (i = 0; i < N; i++)
+	{
+		float expected_value = _alpha * _vec_x[niter][i] + 4.0;
+		if (fabs(_vec_y[niter][i] - expected_value) > expected_value * EPSILON)
+		{
+			FPRINTF(stderr,"[error for iter %d, indice %d], obtained value %f NOT expected value %f (%f*%f+%f)\n", niter, i, _vec_y[niter][i], expected_value, _alpha, _vec_x[niter][i], 4.0);
+			return EXIT_FAILURE;
+		}
+	}
+
+	return EXIT_SUCCESS;
+}
+
+int main(int argc, char **argv)
+{
+	int ret, exit_value = 0;
+	int iter;
+	int ncuda = 0;
+	int gpu_devid = -1;
+
+#warning temporary fix: skip test as cuda computation fails
+	return 77;
+
+#ifndef STARPU_HAVE_SETENV
+	return 77;
+#else
+	/* Have separate threads for streams */
+	setenv("STARPU_CUDA_THREAD_PER_WORKER", "1", 1);
+	setenv("STARPU_NWORKER_PER_CUDA", "2", 1);
+#endif
+
+	/* Initialize StarPU */
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+#ifdef STARPU_USE_CUDA
+	ncuda = starpu_worker_get_devids(STARPU_CUDA_WORKER, &gpu_devid, 1);
+	FPRINTF(stderr, "gpu_devid found %d \n", gpu_devid);
+#endif
+	if (ncuda == 0)
+	{
+		starpu_shutdown();
+		return 77;
+	}
+
+	for(iter = 0; iter < NITER; iter++)
+	{
+		/* This is equivalent to
+		   vec_a = malloc(N*sizeof(float));
+		   vec_b = malloc(N*sizeof(float));
+		*/
+		starpu_malloc((void **)&_vec_x[iter], N*sizeof(float));
+		assert(_vec_x[iter]);
+
+		starpu_malloc((void **)&_vec_y[iter], N*sizeof(float));
+		assert(_vec_y[iter]);
+
+		unsigned i;
+		for (i = 0; i < N; i++)
+		{
+			_vec_x[iter][i] = 1.0f; /*(float)starpu_drand48(); */
+			_vec_y[iter][i] = 4.0f; /*(float)starpu_drand48(); */
+		}
+
+		/* Declare the data to StarPU */
+		starpu_vector_data_register(&_handle_x[iter], STARPU_MAIN_RAM, (uintptr_t)_vec_x[iter], N, sizeof(float));
+		starpu_vector_data_register(&_handle_y[iter], STARPU_MAIN_RAM, (uintptr_t)_vec_y[iter], N, sizeof(float));
+	}
+
+	double start;
+	double end;
+#ifdef STARPU_USE_CUDA
+	unsigned nworkers = starpu_worker_get_count();
+	int stream_workerids[nworkers];
+
+	int nstreams = starpu_worker_get_stream_workerids(gpu_devid, stream_workerids, STARPU_CUDA_WORKER);
+
+	int s;
+	for(s = 0; s < nstreams; s++)
+		FPRINTF(stderr, "stream w %d \n", stream_workerids[s]);
+
+	int ncpus = starpu_cpu_worker_get_count();
+	int workers[ncpus+nstreams];
+	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, workers, ncpus);
+
+	int sched_ctxs[nstreams];
+	int nsms[nstreams];
+	nsms[0] = 6;
+	nsms[1] = 7;
+
+	for(s = 0; s < nstreams; s++)
+	{
+		sched_ctxs[s] = starpu_sched_ctx_create(&stream_workerids[s], 1, "subctx",  STARPU_SCHED_CTX_CUDA_NSMS, nsms[s], 0);
+		workers[ncpus+s] = stream_workerids[s];
+	}
+	unsigned sched_ctx1 = starpu_sched_ctx_create(workers, ncpus+nstreams, "ctx1", STARPU_SCHED_CTX_SUB_CTXS, sched_ctxs, nstreams, STARPU_SCHED_CTX_POLICY_NAME, "dmdas", 0);
+
+	FPRINTF(stderr, "parent ctx %d\n", sched_ctx1);
+	starpu_sched_ctx_set_context(&sched_ctx1);
+
+#endif
+	start = starpu_timing_now();
+
+	for (iter = 0; iter < NITER; iter++)
+	{
+		struct starpu_task *task = starpu_task_create();
+
+		task->cl = &axpy_cl;
+
+		task->cl_arg = &_alpha;
+		task->cl_arg_size = sizeof(_alpha);
+
+		task->handles[0] = _handle_x[iter];
+		task->handles[1] = _handle_y[iter];
+
+		ret = starpu_task_submit(task);
+		if (ret == -ENODEV)
+		{
+			exit_value = 77;
+			goto enodev;
+		}
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	starpu_task_wait_for_all();
+
+enodev:
+	for(iter = 0; iter < NITER; iter++)
+	{
+		starpu_data_unregister(_handle_x[iter]);
+		starpu_data_unregister(_handle_y[iter]);
+	}
+	end = starpu_timing_now();
+        double timing = end - start;
+
+	FPRINTF(stderr, "timing -> %2.2f us %2.2f MB/s\n", timing, 3*N*sizeof(float)/timing);
+
+//	FPRINTF(stderr, "AFTER y[0] = %2.2f (ALPHA = %2.2f)\n", _vec_y[iter][0], _alpha);
+
+	if (exit_value != 77)
+	{
+		for(iter = 0; iter < NITER; iter++)
+		{
+			exit_value = check(iter);
+			if(exit_value != EXIT_SUCCESS)
+				break;
+		}
+	}
+
+	for(iter = 0; iter < NITER; iter++)
+	{
+		starpu_free((void *)_vec_x[iter]);
+		starpu_free((void *)_vec_y[iter]);
+	}
+
+	/* Stop StarPU */
+	starpu_shutdown();
+
+	return exit_value;
+}

+ 4 - 0
examples/scheduler/dummy_sched.c

@@ -117,6 +117,10 @@ static struct starpu_task *pop_task_dummy(unsigned sched_ctx_id)
 	 * the calling worker. So we just take the head of the list and give it
 	 * the calling worker. So we just take the head of the list and give it
 	 * to the worker. */
 	 * to the worker. */
 	struct dummy_sched_data *data = (struct dummy_sched_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	struct dummy_sched_data *data = (struct dummy_sched_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+#ifdef STARPU_NON_BLOCKING_DRIVERS
+	if (starpu_task_list_empty(&data->sched_list))
+		return NULL;
+#endif
 	STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex);
 	struct starpu_task *task = starpu_task_list_pop_back(&data->sched_list);
 	struct starpu_task *task = starpu_task_list_pop_back(&data->sched_list);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);

+ 21 - 7
gcc-plugin/tests/register.c

@@ -15,6 +15,7 @@
    along with GCC-StarPU.  If not, see <http://www.gnu.org/licenses/>.  */
    along with GCC-StarPU.  If not, see <http://www.gnu.org/licenses/>.  */
 
 
 /* Test whether `#pragma starpu register ...' generates the right code.  */
 /* Test whether `#pragma starpu register ...' generates the right code.  */
+/* r19465 is modifying the test to avoid calling starpu_data_register twice with the same variable, starpu now checks that the same key is not entered twice in the same hashtable */
 
 
 #undef NDEBUG
 #undef NDEBUG
 
 
@@ -31,6 +32,7 @@ foo (void)
 #pragma starpu register x /* (warning "considered unsafe") */
 #pragma starpu register x /* (warning "considered unsafe") */
 }
 }
 
 
+#if 0
 static void
 static void
 bar (float *p, int s)
 bar (float *p, int s)
 {
 {
@@ -50,6 +52,7 @@ baz (int s, float *p)
   expected_register_arguments.element_size = sizeof *p;
   expected_register_arguments.element_size = sizeof *p;
 #pragma starpu register p s
 #pragma starpu register p s
 }
 }
+#endif
 
 
 /* Check the interaction between `register' and `heap_allocated'.  This test
 /* Check the interaction between `register' and `heap_allocated'.  This test
    assumes `heap_allocated' works as expected.  */
    assumes `heap_allocated' works as expected.  */
@@ -84,6 +87,7 @@ main (int argc, char *argv[])
 
 
   int x[123];
   int x[123];
   double *y;
   double *y;
+  double *yy;
   static char z[345];
   static char z[345];
   static float m[7][42];
   static float m[7][42];
   static float m3d[14][11][80];
   static float m3d[14][11][80];
@@ -91,6 +95,7 @@ main (int argc, char *argv[])
   size_t y_size = 234;
   size_t y_size = 234;
 
 
   y = malloc (234 * sizeof *y);
   y = malloc (234 * sizeof *y);
+  yy = malloc (234 * sizeof *yy);
 
 
   expected_register_arguments.pointer = x;
   expected_register_arguments.pointer = x;
   expected_register_arguments.elements = 123;
   expected_register_arguments.elements = 123;
@@ -102,10 +107,10 @@ main (int argc, char *argv[])
   expected_register_arguments.element_size = sizeof *y;
   expected_register_arguments.element_size = sizeof *y;
 #pragma starpu register y 234
 #pragma starpu register y 234
 
 
-  expected_register_arguments.pointer = y;
+  expected_register_arguments.pointer = yy;
   expected_register_arguments.elements = y_size;
   expected_register_arguments.elements = y_size;
-  expected_register_arguments.element_size = sizeof *y;
-#pragma starpu register y y_size
+  expected_register_arguments.element_size = sizeof *yy;
+#pragma starpu register yy y_size
 
 
   expected_register_arguments.pointer = z;
   expected_register_arguments.pointer = z;
   expected_register_arguments.elements = 345;
   expected_register_arguments.elements = 345;
@@ -122,6 +127,7 @@ main (int argc, char *argv[])
   expected_register_arguments.element_size = sizeof argv[0];
   expected_register_arguments.element_size = sizeof argv[0];
 #pragma starpu register argv 456
 #pragma starpu register argv 456
 
 
+#if 0
 #define ARGV argv
 #define ARGV argv
 #define N 456
 #define N 456
   expected_register_arguments.pointer = argv;
   expected_register_arguments.pointer = argv;
@@ -130,22 +136,25 @@ main (int argc, char *argv[])
 #pragma starpu register   ARGV /* hello, world! */  N
 #pragma starpu register   ARGV /* hello, world! */  N
 #undef ARGV
 #undef ARGV
 #undef N
 #undef N
+#endif
 
 
   foo ();
   foo ();
-  bar ((float *) argv, argc);
-  baz (argc, (float *) argv);
+  //  bar ((float *) argv, argc);
+  //  baz (argc, (float *) argv);
 
 
+#if 0
   expected_register_arguments.pointer = argv;
   expected_register_arguments.pointer = argv;
   expected_register_arguments.elements = argc;
   expected_register_arguments.elements = argc;
   expected_register_arguments.element_size = sizeof argv[0];
   expected_register_arguments.element_size = sizeof argv[0];
 
 
   int chbouib = argc;
   int chbouib = argc;
 #pragma starpu register argv chbouib
 #pragma starpu register argv chbouib
+#endif
 
 
-  expected_register_arguments.pointer = &argv[2];
+  expected_register_arguments.pointer = &argv[1];
   expected_register_arguments.elements = 3;
   expected_register_arguments.elements = 3;
   expected_register_arguments.element_size = sizeof argv[0];
   expected_register_arguments.element_size = sizeof argv[0];
-#pragma starpu register &argv[2] 3
+#pragma starpu register &argv[1] 3
 
 
   expected_register_arguments.pointer = &argv[argc + 3 / 2];
   expected_register_arguments.pointer = &argv[argc + 3 / 2];
   expected_register_arguments.elements = argc * 4;
   expected_register_arguments.elements = argc * 4;
@@ -172,9 +181,14 @@ main (int argc, char *argv[])
   expected_register_arguments.element_size = sizeof m3d[0];
   expected_register_arguments.element_size = sizeof m3d[0];
 #pragma starpu register m3d
 #pragma starpu register m3d
 
 
+#if 0
   assert (data_register_calls == 17);
   assert (data_register_calls == 17);
+#else
+  assert (data_register_calls == 13);
+#endif
 
 
   free (y);
   free (y);
+  free (yy);
 
 
   heap_alloc (42, 77);
   heap_alloc (42, 77);
   assert (free_calls == 1);
   assert (free_calls == 1);

+ 3 - 0
include/starpu_config.h.in

@@ -41,6 +41,8 @@
 #undef STARPU_SIMGRID
 #undef STARPU_SIMGRID
 #undef STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT
 #undef STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT
 #undef STARPU_HAVE_SIMGRID_MSG_H
 #undef STARPU_HAVE_SIMGRID_MSG_H
+#undef STARPU_HAVE_XBT_SYNCHRO_H
+#undef STARPU_NON_BLOCKING_DRIVERS
 
 
 #undef STARPU_HAVE_ICC
 #undef STARPU_HAVE_ICC
 
 
@@ -101,6 +103,7 @@
 
 
 #undef STARPU_HAVE_WINDOWS
 #undef STARPU_HAVE_WINDOWS
 #undef STARPU_LINUX_SYS
 #undef STARPU_LINUX_SYS
+#undef STARPU_HAVE_SETENV
 #undef STARPU_HAVE_UNSETENV
 #undef STARPU_HAVE_UNSETENV
 #undef STARPU_HAVE_UNISTD_H
 #undef STARPU_HAVE_UNISTD_H
 
 

+ 1 - 1
include/starpu_data.h

@@ -62,7 +62,7 @@ void starpu_data_invalidate_submit(starpu_data_handle_t handle);
 void starpu_data_advise_as_important(starpu_data_handle_t handle, unsigned is_important);
 void starpu_data_advise_as_important(starpu_data_handle_t handle, unsigned is_important);
 
 
 #define STARPU_ACQUIRE_NO_NODE -1
 #define STARPU_ACQUIRE_NO_NODE -1
-#define STARPU_ACQUIRE_ALL_NODES -2
+#define STARPU_ACQUIRE_NO_NODE_LOCK_ALL -2
 int starpu_data_acquire(starpu_data_handle_t handle, enum starpu_data_access_mode mode);
 int starpu_data_acquire(starpu_data_handle_t handle, enum starpu_data_access_mode mode);
 int starpu_data_acquire_on_node(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode);
 int starpu_data_acquire_on_node(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode);
 int starpu_data_acquire_cb(starpu_data_handle_t handle, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg);
 int starpu_data_acquire_cb(starpu_data_handle_t handle, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg);

+ 2 - 2
include/starpu_sched_component.h

@@ -112,8 +112,8 @@ int starpu_sched_component_worker_get_workerid(struct starpu_sched_component *wo
 int starpu_sched_component_is_worker(struct starpu_sched_component *component);
 int starpu_sched_component_is_worker(struct starpu_sched_component *component);
 int starpu_sched_component_is_simple_worker(struct starpu_sched_component *component);
 int starpu_sched_component_is_simple_worker(struct starpu_sched_component *component);
 int starpu_sched_component_is_combined_worker(struct starpu_sched_component *component);
 int starpu_sched_component_is_combined_worker(struct starpu_sched_component *component);
-void starpu_sched_component_worker_pre_exec_hook(struct starpu_task *task);
-void starpu_sched_component_worker_post_exec_hook(struct starpu_task *task);
+void starpu_sched_component_worker_pre_exec_hook(struct starpu_task *task, unsigned sched_ctx_id);
+void starpu_sched_component_worker_post_exec_hook(struct starpu_task *task, unsigned sched_ctx_id);
 
 
 struct starpu_sched_component_fifo_data
 struct starpu_sched_component_fifo_data
 {
 {

+ 8 - 1
include/starpu_sched_ctx.h

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010 - 2012  INRIA
  * Copyright (C) 2010 - 2012  INRIA
+ * Copyright (C) 2016  Uppsala University
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -33,6 +34,8 @@ extern "C"
 #define STARPU_SCHED_CTX_AWAKE_WORKERS           (7<<16)
 #define STARPU_SCHED_CTX_AWAKE_WORKERS           (7<<16)
 #define STARPU_SCHED_CTX_POLICY_INIT             (8<<16)
 #define STARPU_SCHED_CTX_POLICY_INIT             (8<<16)
 #define STARPU_SCHED_CTX_USER_DATA               (9<<16)
 #define STARPU_SCHED_CTX_USER_DATA               (9<<16)
+#define STARPU_SCHED_CTX_CUDA_NSMS               (10<<16)
+#define STARPU_SCHED_CTX_SUB_CTXS                (11<<16)
 
 
 unsigned starpu_sched_ctx_create(int *workerids_ctx, int nworkers_ctx, const char *sched_ctx_name, ...);
 unsigned starpu_sched_ctx_create(int *workerids_ctx, int nworkers_ctx, const char *sched_ctx_name, ...);
 
 
@@ -157,7 +160,7 @@ unsigned starpu_sched_ctx_master_get_context(int masterid);
 
 
 void starpu_sched_ctx_revert_task_counters(unsigned sched_ctx_id, double flops);
 void starpu_sched_ctx_revert_task_counters(unsigned sched_ctx_id, double flops);
 
 
-void starpu_sched_ctx_move_task_to_ctx(struct starpu_task *task, unsigned sched_ctx, unsigned manage_mutex);
+void starpu_sched_ctx_move_task_to_ctx(struct starpu_task *task, unsigned sched_ctx, unsigned manage_mutex, unsigned with_repush);
 
 
 int starpu_sched_ctx_get_worker_rank(unsigned sched_ctx_id);
 int starpu_sched_ctx_get_worker_rank(unsigned sched_ctx_id);
 
 
@@ -168,6 +171,10 @@ unsigned starpu_sched_ctx_has_starpu_scheduler(unsigned sched_ctx_id, unsigned *
 void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id);
 void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id);
 #endif /* STARPU_USE_SC_HYPERVISOR */
 #endif /* STARPU_USE_SC_HYPERVISOR */
 
 
+int starpu_sched_ctx_get_stream_worker(unsigned sub_ctx);
+int starpu_sched_ctx_get_nsms(unsigned sched_ctx);
+void starpu_sched_ctx_get_sms_interval(int stream_workerid, int *start, int *end);
+
 #ifdef __cplusplus
 #ifdef __cplusplus
 }
 }
 #endif
 #endif

+ 3 - 2
include/starpu_scheduler.h

@@ -2,6 +2,7 @@
  *
  *
  * Copyright (C) 2010-2016  Université de Bordeaux
  * Copyright (C) 2010-2016  Université de Bordeaux
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  Télécom-SudParis
+ * Copyright (C) 2016  Uppsala University
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -39,8 +40,8 @@ struct starpu_sched_policy
 	struct starpu_task *(*pop_every_task)(unsigned sched_ctx_id);
 	struct starpu_task *(*pop_every_task)(unsigned sched_ctx_id);
 
 
 	void (*submit_hook)(struct starpu_task *task);
 	void (*submit_hook)(struct starpu_task *task);
-	void (*pre_exec_hook)(struct starpu_task *);
-	void (*post_exec_hook)(struct starpu_task *);
+	void (*pre_exec_hook)(struct starpu_task *, unsigned sched_ctx_id);
+	void (*post_exec_hook)(struct starpu_task *, unsigned sched_ctx_id);
 
 
 	void (*do_schedule)(unsigned sched_ctx_id);
 	void (*do_schedule)(unsigned sched_ctx_id);
 
 

+ 2 - 0
include/starpu_task.h

@@ -4,6 +4,7 @@
  * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
  * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011, 2014, 2016  INRIA
  * Copyright (C) 2011, 2014, 2016  INRIA
+ * Copyright (C) 2016  Uppsala University
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -200,6 +201,7 @@ struct starpu_task
 	double flops;
 	double flops;
 	double predicted;
 	double predicted;
 	double predicted_transfer;
 	double predicted_transfer;
+	double predicted_start;
 
 
 	struct starpu_task *prev;
 	struct starpu_task *prev;
 	struct starpu_task *next;
 	struct starpu_task *next;

+ 4 - 0
include/starpu_thread.h

@@ -21,7 +21,11 @@
 #include <starpu_config.h>
 #include <starpu_config.h>
 #include <starpu_util.h>
 #include <starpu_util.h>
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
+#ifdef STARPU_HAVE_XBT_SYNCHRO_H
+#include <xbt/synchro.h>
+#else
 #include <xbt/synchro_core.h>
 #include <xbt/synchro_core.h>
+#endif
 #ifdef STARPU_HAVE_SIMGRID_MSG_H
 #ifdef STARPU_HAVE_SIMGRID_MSG_H
 #include <simgrid/msg.h>
 #include <simgrid/msg.h>
 #else
 #else

+ 7 - 1
include/starpu_worker.h

@@ -3,6 +3,7 @@
  * Copyright (C) 2009-2013, 2016  Université de Bordeaux
  * Copyright (C) 2009-2013, 2016  Université de Bordeaux
  * Copyright (C) 2010-2014  CNRS
  * Copyright (C) 2010-2014  CNRS
  * Copyright (C) 2016  INRIA
  * Copyright (C) 2016  INRIA
+ * Copyright (C) 2016  Uppsala University
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -108,7 +109,7 @@ enum starpu_worker_archtype starpu_worker_get_type(int id);
 
 
 int starpu_worker_get_count_by_type(enum starpu_worker_archtype type);
 int starpu_worker_get_count_by_type(enum starpu_worker_archtype type);
 
 
-int starpu_worker_get_ids_by_type(enum starpu_worker_archtype type, int *workerids, int maxsize);
+unsigned starpu_worker_get_ids_by_type(enum starpu_worker_archtype type, int *workerids, unsigned maxsize);
 
 
 int starpu_worker_get_by_type(enum starpu_worker_archtype type, int num);
 int starpu_worker_get_by_type(enum starpu_worker_archtype type, int num);
 
 
@@ -132,6 +133,11 @@ char *starpu_worker_get_type_as_string(enum starpu_worker_archtype type);
 
 
 int starpu_bindid_get_workerids(int bindid, int **workerids);
 int starpu_bindid_get_workerids(int bindid, int **workerids);
 
 
+int starpu_worker_get_devids(enum starpu_worker_archtype type, int *devids, int num);
+
+int starpu_worker_get_stream_workerids(unsigned devid, int *workerids, enum starpu_worker_archtype type);
+
+unsigned starpu_worker_get_sched_ctx_id_stream(unsigned stream_workerid);
 #ifdef __cplusplus
 #ifdef __cplusplus
 }
 }
 #endif
 #endif

+ 1 - 1
min-dgels/Makefile.in

@@ -5,7 +5,7 @@ all:
 	mkdir -p build
 	mkdir -p build
 	cd $(CLAPACK) && $(MAKE) blaslib
 	cd $(CLAPACK) && $(MAKE) blaslib
 	cd $(CLAPACK) && $(MAKE) f2clib
 	cd $(CLAPACK) && $(MAKE) f2clib
-	cd $(ADDITIONAL) && gcc -c -fPIC *.c && ar cr ../build/minlibdgels.a *.o && ranlib ../build/minlibdgels.a
+	cd $(ADDITIONAL) && $(CC) -c -fPIC *.c && ar cr ../build/minlibdgels.a *.o && ranlib ../build/minlibdgels.a
 
 
 install:
 install:
 
 

+ 3 - 2
min-dgels/base/make.inc

@@ -21,11 +21,12 @@ PLAT = _LINUX
 #
 #
 #######################################################
 #######################################################
 # This is used to compile C libary
 # This is used to compile C libary
-CC        = gcc
+#CC        = gcc
 # if no wrapping of the blas library is needed, uncomment next line
 # if no wrapping of the blas library is needed, uncomment next line
 #CC        = gcc -DNO_BLAS_WRAP
 #CC        = gcc -DNO_BLAS_WRAP
 CFLAGS    = -O3 -I$(TOPDIR)/INCLUDE -fPIC
 CFLAGS    = -O3 -I$(TOPDIR)/INCLUDE -fPIC
-LOADER    = gcc
+#LOADER    = gcc
+LOADER    = $(CC)
 LOADOPTS  =
 LOADOPTS  =
 NOOPT     = -O0 -I$(TOPDIR)/INCLUDE
 NOOPT     = -O0 -I$(TOPDIR)/INCLUDE
 DRVCFLAGS = $(CFLAGS)
 DRVCFLAGS = $(CFLAGS)

+ 2 - 2
mpi/examples/comm/comm.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2015  CNRS
+ * Copyright (C) 2015, 2016  CNRS
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -41,7 +41,7 @@ struct starpu_codelet mycodelet =
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-	int size, n, x=789;
+	int size, x=789;
 	int color;
 	int color;
 	MPI_Comm newcomm;
 	MPI_Comm newcomm;
 	int rank, newrank;
 	int rank, newrank;

+ 2 - 0
mpi/examples/matrix_decomposition/mpi_cholesky.c

@@ -29,7 +29,9 @@ int main(int argc, char **argv)
 	float ***bmat;
 	float ***bmat;
 	int rank, nodes, ret;
 	int rank, nodes, ret;
 	double timing, flops;
 	double timing, flops;
+#ifndef STARPU_SIMGRID
 	int correctness;
 	int correctness;
+#endif
 
 
 	ret = starpu_init(NULL);
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

+ 5 - 3
mpi/examples/matrix_decomposition/mpi_decomposition_matrix.c

@@ -2,7 +2,7 @@
  *
  *
  * Copyright (C) 2009-2012, 2015  Université de Bordeaux
  * Copyright (C) 2009-2012, 2015  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011, 2012, 2013, 2015  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016  CNRS
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -28,19 +28,21 @@ int my_distrib(int x, int y, int nb_nodes)
 
 
 void matrix_display(float ***bmat, int rank)
 void matrix_display(float ***bmat, int rank)
 {
 {
-	unsigned i,j,x,y;
-
 	if (display)
 	if (display)
 	{
 	{
+		unsigned y;
 		printf("[%d] Input :\n", rank);
 		printf("[%d] Input :\n", rank);
 
 
 		for(y=0 ; y<nblocks ; y++)
 		for(y=0 ; y<nblocks ; y++)
 		{
 		{
+			unsigned x;
 			for(x=0 ; x<nblocks ; x++)
 			for(x=0 ; x<nblocks ; x++)
 			{
 			{
+				unsigned j;
 				printf("Block %u,%u :\n", x, y);
 				printf("Block %u,%u :\n", x, y);
 				for (j = 0; j < BLOCKSIZE; j++)
 				for (j = 0; j < BLOCKSIZE; j++)
 				{
 				{
+					unsigned i;
 					for (i = 0; i < BLOCKSIZE; i++)
 					for (i = 0; i < BLOCKSIZE; i++)
 					{
 					{
 						if (i <= j)
 						if (i <= j)

+ 5 - 2
mpi/examples/mpi_lu/plu_example.c

@@ -458,10 +458,10 @@ int main(int argc, char **argv)
 	TYPE *a_r = NULL;
 	TYPE *a_r = NULL;
 //	STARPU_PLU(display_data_content)(a_r, size);
 //	STARPU_PLU(display_data_content)(a_r, size);
 
 
-	TYPE *x, *y;
-
 	if (check)
 	if (check)
 	{
 	{
+		TYPE *x, *y;
+
 		x = calloc(size, sizeof(TYPE));
 		x = calloc(size, sizeof(TYPE));
 		STARPU_ASSERT(x);
 		STARPU_ASSERT(x);
 
 
@@ -481,6 +481,9 @@ int main(int argc, char **argv)
 			STARPU_PLU(display_data_content)(a_r, size);
 			STARPU_PLU(display_data_content)(a_r, size);
 
 
 //		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
 //		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
+
+		free(x);
+		free(y);
 	}
 	}
 
 
 	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
 	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);

+ 6 - 3
mpi/examples/mpi_lu/plu_implicit_example.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010-2011, 2013  Université de Bordeaux
  * Copyright (C) 2010-2011, 2013  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -260,10 +260,10 @@ int main(int argc, char **argv)
 	TYPE *a_r = NULL;
 	TYPE *a_r = NULL;
 //	STARPU_PLU(display_data_content)(a_r, size);
 //	STARPU_PLU(display_data_content)(a_r, size);
 
 
-	TYPE *x, *y;
-
 	if (check)
 	if (check)
 	{
 	{
+		TYPE *x, *y;
+
 		x = calloc(size, sizeof(TYPE));
 		x = calloc(size, sizeof(TYPE));
 		STARPU_ASSERT(x);
 		STARPU_ASSERT(x);
 
 
@@ -283,6 +283,9 @@ int main(int argc, char **argv)
 			STARPU_PLU(display_data_content)(a_r, size);
 			STARPU_PLU(display_data_content)(a_r, size);
 
 
 //		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
 //		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
+
+		free(x);
+		free(y);
 	}
 	}
 
 
 	double timing = STARPU_PLU(plu_main)(nblocks, rank, world_size);
 	double timing = STARPU_PLU(plu_main)(nblocks, rank, world_size);

+ 6 - 3
mpi/examples/mpi_lu/plu_outofcore_example.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010-2011, 2013-2014  Université de Bordeaux
  * Copyright (C) 2010-2011, 2013-2014  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -278,10 +278,10 @@ int main(int argc, char **argv)
 	TYPE *a_r = NULL;
 	TYPE *a_r = NULL;
 //	STARPU_PLU(display_data_content)(a_r, size);
 //	STARPU_PLU(display_data_content)(a_r, size);
 
 
-	TYPE *x, *y;
-
 	if (check)
 	if (check)
 	{
 	{
+		TYPE *x, *y;
+
 		x = calloc(size, sizeof(TYPE));
 		x = calloc(size, sizeof(TYPE));
 		STARPU_ASSERT(x);
 		STARPU_ASSERT(x);
 
 
@@ -301,6 +301,9 @@ int main(int argc, char **argv)
 			STARPU_PLU(display_data_content)(a_r, size);
 			STARPU_PLU(display_data_content)(a_r, size);
 
 
 //		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
 //		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
+
+		free(x);
+		free(y);
 	}
 	}
 
 
 	double timing = STARPU_PLU(plu_main)(nblocks, rank, world_size);
 	double timing = STARPU_PLU(plu_main)(nblocks, rank, world_size);

+ 1 - 1
mpi/src/starpu_mpi.c

@@ -1303,7 +1303,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	MSG_process_create_with_arguments("main", smpi_simulated_main_, NULL, _starpu_simgrid_get_host_by_name("MAIN"), *(argc_argv->argc), argv_cpy);
 	MSG_process_create_with_arguments("main", smpi_simulated_main_, NULL, _starpu_simgrid_get_host_by_name("MAIN"), *(argc_argv->argc), argv_cpy);
 	/* And set TSD for us */
 	/* And set TSD for us */
 #ifdef HAVE_SMPI_PROCESS_SET_USER_DATA
 #ifdef HAVE_SMPI_PROCESS_SET_USER_DATA
-	smpi_process_set_user_data(calloc(MAX_TSD, sizeof(void*)));
+	smpi_process_set_user_data(calloc(MAX_TSD + 1, sizeof(void*)));
 #endif
 #endif
 #endif
 #endif
 #ifdef STARPU_USE_FXT
 #ifdef STARPU_USE_FXT

+ 5 - 1
mpi/tests/mpi_reduction.c

@@ -198,10 +198,14 @@ int main(int argc, char **argv)
 	starpu_mpi_shutdown();
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 	starpu_shutdown();
 
 
-#ifndef STARPU_SIMGRID
 	if (my_rank == 0)
 	if (my_rank == 0)
 	{
 	{
 		FPRINTF(stderr, "[%d] sum=%ld\n", my_rank, sum);
 		FPRINTF(stderr, "[%d] sum=%ld\n", my_rank, sum);
+	}
+
+#ifndef STARPU_SIMGRID
+	if (my_rank == 0)
+	{
 		FPRINTF(stderr, "[%d] dot=%ld\n", my_rank, dot);
 		FPRINTF(stderr, "[%d] dot=%ld\n", my_rank, dot);
 		FPRINTF(stderr, "%s when computing reduction\n", (sum == dot) ? "Success" : "Error");
 		FPRINTF(stderr, "%s when computing reduction\n", (sum == dot) ? "Success" : "Error");
 		if (sum != dot)
 		if (sum != dot)

+ 3 - 3
socl/src/cl_createcontextfromtype.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2012 University of Bordeaux
+ * Copyright (C) 2010-2012, 2016 University of Bordeaux
  * Copyright (C) 2012 CNRS
  * Copyright (C) 2012 CNRS
  * Copyright (C) 2012 Vincent Danjean <Vincent.Danjean@ens-lyon.org>
  * Copyright (C) 2012 Vincent Danjean <Vincent.Danjean@ens-lyon.org>
  *
  *
@@ -26,8 +26,8 @@ soclCreateContextFromType(const cl_context_properties * properties,
                         void *                        user_data,
                         void *                        user_data,
                         cl_int *                      errcode_ret) CL_API_SUFFIX__VERSION_1_0
                         cl_int *                      errcode_ret) CL_API_SUFFIX__VERSION_1_0
 {
 {
-   if( ! _starpu_init )
-      socl_init_starpu(); 
+    if (socl_init_starpu() < 0)
+      return NULL;
 
 
 
 
    //TODO: appropriate error messages
    //TODO: appropriate error messages

+ 2 - 2
socl/src/cl_enqueuendrangekernel.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010,2011 University of Bordeaux
+ * Copyright (C) 2010,2011, 2016 University of Bordeaux
  * Copyright (C) 2016  CNRS
  * Copyright (C) 2016  CNRS
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -174,7 +174,7 @@ soclEnqueueNDRangeKernel(cl_command_queue cq,
       cl_uint iter = 1;
       cl_uint iter = 1;
       cl_uint split_min = CL_UINT_MAX;
       cl_uint split_min = CL_UINT_MAX;
       cl_uint split_min_iter = 1;
       cl_uint split_min_iter = 1;
-      while (kernel->split_perfs[iter] != 0 && iter < kernel->split_space) {
+      while (iter < kernel->split_space && kernel->split_perfs[iter] != 0) {
          if (kernel->split_perfs[iter] < split_min) {
          if (kernel->split_perfs[iter] < split_min) {
             split_min = kernel->split_perfs[iter];
             split_min = kernel->split_perfs[iter];
             split_min_iter = iter;
             split_min_iter = iter;

+ 6 - 3
socl/src/cl_getdeviceids.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2012 University of Bordeaux
+ * Copyright (C) 2010-2012, 2016 University of Bordeaux
  * Copyright (C) 2012 CNRS
  * Copyright (C) 2012 CNRS
  * Copyright (C) 2012 Vincent Danjean <Vincent.Danjean@ens-lyon.org>
  * Copyright (C) 2012 Vincent Danjean <Vincent.Danjean@ens-lyon.org>
  *
  *
@@ -31,8 +31,11 @@ soclGetDeviceIDs(cl_platform_id   platform,
                cl_device_id *   devices,
                cl_device_id *   devices,
                cl_uint *        num_devices) CL_API_SUFFIX__VERSION_1_0
                cl_uint *        num_devices) CL_API_SUFFIX__VERSION_1_0
 {
 {
-   if( ! _starpu_init )
-      socl_init_starpu();
+    if (socl_init_starpu() < 0)
+    {
+       *num_devices = 0;
+       return CL_SUCCESS;
+    }
 
 
    if (_starpu_init_failed) {
    if (_starpu_init_failed) {
       *num_devices = 0;
       *num_devices = 0;

+ 54 - 24
socl/src/init.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2012 University of Bordeaux
+ * Copyright (C) 2010-2012, 2016 University of Bordeaux
  * Copyright (C) 2012,2014,2016 CNRS
  * Copyright (C) 2012,2014,2016 CNRS
  * Copyright (C) 2012 Vincent Danjean <Vincent.Danjean@ens-lyon.org>
  * Copyright (C) 2012 Vincent Danjean <Vincent.Danjean@ens-lyon.org>
  *
  *
@@ -17,42 +17,72 @@
  */
  */
 
 
 #include <stdlib.h>
 #include <stdlib.h>
+#include "../src/core/workers.h"
 #include "socl.h"
 #include "socl.h"
 #include "gc.h"
 #include "gc.h"
 #include "mem_objects.h"
 #include "mem_objects.h"
 
 
 int _starpu_init_failed;
 int _starpu_init_failed;
-volatile int _starpu_init = 0;
+static enum initialization _socl_init = UNINITIALIZED;
 static starpu_pthread_mutex_t _socl_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
 static starpu_pthread_mutex_t _socl_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
+static starpu_pthread_cond_t _socl_cond = STARPU_PTHREAD_COND_INITIALIZER;
+static pthread_t _socl_thread_init;
 static struct starpu_conf conf;
 static struct starpu_conf conf;
 
 
-void socl_init_starpu(void) {
+int socl_init_starpu(void) {
   STARPU_PTHREAD_MUTEX_LOCK(&_socl_mutex);
   STARPU_PTHREAD_MUTEX_LOCK(&_socl_mutex);
-  if( ! _starpu_init ){
-    starpu_conf_init(&conf);
-    conf.ncuda = 0;
-    conf.ncpus = 0;
-
+  if (_socl_init == INITIALIZED)
+  {
+    STARPU_PTHREAD_MUTEX_UNLOCK(&_socl_mutex);
+    return 0;
+  }
 
 
-    _starpu_init_failed = starpu_init(&conf);
-    if (_starpu_init_failed != 0)
+  if (_socl_init == CHANGING)
+  {
+    /* Avoid recursion when starpu_init calls hwloc initialization which uses its opencl plugin */
+    if (pthread_equal(_socl_thread_init, pthread_self()))
     {
     {
-       DEBUG_MSG("Error when calling starpu_init: %d\n", _starpu_init_failed);
-    }
-    else {
-       if (starpu_opencl_worker_get_count() == 0)
-       {
-	    DEBUG_MSG("StarPU didn't find any OpenCL device. Try disabling CUDA support in StarPU (export STARPU_NCUDA=0).\n");
-	    _starpu_init_failed = -ENODEV;
-       }
+      STARPU_PTHREAD_MUTEX_UNLOCK(&_socl_mutex);
+      return -1;
     }
     }
 
 
-    /* Disable dataflow implicit dependencies */
-    starpu_data_set_default_sequential_consistency_flag(0);
-    _starpu_init = 1;
+    /* Somebody else is initializing already, wait for him */
+    while (_socl_init != INITIALIZED)
+      STARPU_PTHREAD_COND_WAIT(&_socl_cond, &_socl_mutex);
+    STARPU_PTHREAD_MUTEX_UNLOCK(&_socl_mutex);
+    return 0;
   }
   }
+  _socl_init = CHANGING;
+  _socl_thread_init = pthread_self();
+  STARPU_PTHREAD_MUTEX_UNLOCK(&_socl_mutex);
+
+  starpu_conf_init(&conf);
+  conf.ncuda = 0;
+  conf.ncpus = 0;
+
+
+  _starpu_init_failed = starpu_init(&conf);
+  if (_starpu_init_failed != 0)
+  {
+     DEBUG_MSG("Error when calling starpu_init: %d\n", _starpu_init_failed);
+  }
+  else {
+     if (starpu_opencl_worker_get_count() == 0)
+     {
+	  DEBUG_MSG("StarPU didn't find any OpenCL device. Try disabling CUDA support in StarPU (export STARPU_NCUDA=0).\n");
+	  _starpu_init_failed = -ENODEV;
+     }
+  }
+
+  /* Disable dataflow implicit dependencies */
+  starpu_data_set_default_sequential_consistency_flag(0);
+
+  STARPU_PTHREAD_MUTEX_LOCK(&_socl_mutex);
+  _socl_init = INITIALIZED;
+  STARPU_PTHREAD_COND_BROADCAST(&_socl_cond);
   STARPU_PTHREAD_MUTEX_UNLOCK(&_socl_mutex);
   STARPU_PTHREAD_MUTEX_UNLOCK(&_socl_mutex);
 
 
+  return 0;
 }
 }
 /**
 /**
  * Initialize SOCL
  * Initialize SOCL
@@ -73,12 +103,12 @@ void soclShutdown() {
       shutdown = 1;
       shutdown = 1;
 
 
       STARPU_PTHREAD_MUTEX_LOCK(&_socl_mutex);
       STARPU_PTHREAD_MUTEX_LOCK(&_socl_mutex);
-      if( _starpu_init )
+      if( _socl_init )
          starpu_task_wait_for_all();
          starpu_task_wait_for_all();
 
 
       gc_stop();
       gc_stop();
 
 
-      if( _starpu_init )
+      if( _socl_init )
          starpu_task_wait_for_all();
          starpu_task_wait_for_all();
 
 
       int active_entities = gc_active_entity_count();
       int active_entities = gc_active_entity_count();
@@ -88,7 +118,7 @@ void soclShutdown() {
          gc_print_remaining_entities();
          gc_print_remaining_entities();
       }
       }
 
 
-      if( _starpu_init && _starpu_init_failed != -ENODEV)
+      if( _socl_init && _starpu_init_failed != -ENODEV)
          starpu_shutdown();
          starpu_shutdown();
       STARPU_PTHREAD_MUTEX_UNLOCK(&_socl_mutex);
       STARPU_PTHREAD_MUTEX_UNLOCK(&_socl_mutex);
 
 

+ 2 - 2
socl/src/init.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2012 University of Bordeaux
+ * Copyright (C) 2010-2012, 2016 University of Bordeaux
  * Copyright (C) 2012, 2014 CNRS
  * Copyright (C) 2012, 2014 CNRS
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -28,7 +28,7 @@ extern volatile int _starpu_init;
  * Initialize StarPU
  * Initialize StarPU
  */
  */
 
 
-void socl_init_starpu(void);
+int socl_init_starpu(void);
 void soclShutdown(void);
 void soclShutdown(void);
 
 
 #endif /* SOCL_INIT_H */
 #endif /* SOCL_INIT_H */

+ 21 - 10
src/common/graph.c

@@ -57,6 +57,7 @@ void _starpu_graph_init(void)
 	_starpu_graph_node_multilist_init_dropped(&dropped);
 	_starpu_graph_node_multilist_init_dropped(&dropped);
 }
 }
 
 
+/* LockWR the graph lock */
 void _starpu_graph_wrlock(void)
 void _starpu_graph_wrlock(void)
 {
 {
 	STARPU_PTHREAD_RWLOCK_WRLOCK(&graph_lock);
 	STARPU_PTHREAD_RWLOCK_WRLOCK(&graph_lock);
@@ -64,11 +65,12 @@ void _starpu_graph_wrlock(void)
 
 
 void _starpu_graph_drop_node(struct _starpu_graph_node *node);
 void _starpu_graph_drop_node(struct _starpu_graph_node *node);
 
 
+/* This flushes the list of nodes to be dropped. Both the dropped_lock and
+ * graph_lock mutexes have to be held on entry, and are released.  */
 void _starpu_graph_drop_dropped_nodes(void)
 void _starpu_graph_drop_dropped_nodes(void)
 {
 {
 	struct _starpu_graph_node_multilist_dropped dropping;
 	struct _starpu_graph_node_multilist_dropped dropping;
 
 
-	STARPU_PTHREAD_MUTEX_LOCK(&dropped_lock);
 	/* Pick up the list of dropped nodes */
 	/* Pick up the list of dropped nodes */
 	_starpu_graph_node_multilist_move_dropped(&dropped, &dropping);
 	_starpu_graph_node_multilist_move_dropped(&dropped, &dropping);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&dropped_lock);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&dropped_lock);
@@ -78,7 +80,6 @@ void _starpu_graph_drop_dropped_nodes(void)
 	{
 	{
 		struct _starpu_graph_node *node, *next;
 		struct _starpu_graph_node *node, *next;
 
 
-		STARPU_PTHREAD_RWLOCK_WRLOCK(&graph_lock);
 		for (node = _starpu_graph_node_multilist_begin_dropped(&dropping);
 		for (node = _starpu_graph_node_multilist_begin_dropped(&dropping);
 		     node != _starpu_graph_node_multilist_end_dropped(&dropping);
 		     node != _starpu_graph_node_multilist_end_dropped(&dropping);
 		     node = next)
 		     node = next)
@@ -86,24 +87,31 @@ void _starpu_graph_drop_dropped_nodes(void)
 			next = _starpu_graph_node_multilist_next_dropped(node);
 			next = _starpu_graph_node_multilist_next_dropped(node);
 			_starpu_graph_drop_node(node);
 			_starpu_graph_drop_node(node);
 		}
 		}
-		STARPU_PTHREAD_RWLOCK_UNLOCK(&graph_lock);
 	}
 	}
+	STARPU_PTHREAD_RWLOCK_UNLOCK(&graph_lock);
 }
 }
 
 
+/* UnlockWR the graph lock */
 void _starpu_graph_wrunlock(void)
 void _starpu_graph_wrunlock(void)
 {
 {
-	STARPU_PTHREAD_RWLOCK_UNLOCK(&graph_lock);
+	STARPU_PTHREAD_MUTEX_LOCK(&dropped_lock);
 	_starpu_graph_drop_dropped_nodes();
 	_starpu_graph_drop_dropped_nodes();
 }
 }
 
 
+/* LockRD the graph lock */
 void _starpu_graph_rdlock(void)
 void _starpu_graph_rdlock(void)
 {
 {
 	STARPU_PTHREAD_RWLOCK_RDLOCK(&graph_lock);
 	STARPU_PTHREAD_RWLOCK_RDLOCK(&graph_lock);
 }
 }
 
 
+/* UnlockRD the graph lock */
 void _starpu_graph_rdunlock(void)
 void _starpu_graph_rdunlock(void)
 {
 {
 	STARPU_PTHREAD_RWLOCK_UNLOCK(&graph_lock);
 	STARPU_PTHREAD_RWLOCK_UNLOCK(&graph_lock);
+	/* Take the opportunity to try to take it WR */
+	if (STARPU_PTHREAD_RWLOCK_TRYWRLOCK(&graph_lock) == 0)
+		/* Good, flush dropped nodes */
+		_starpu_graph_wrunlock();
 }
 }
 
 
 static void __starpu_graph_foreach(void (*func)(void *data, struct _starpu_graph_node *node), void *data)
 static void __starpu_graph_foreach(void (*func)(void *data, struct _starpu_graph_node *node), void *data)
@@ -163,6 +171,8 @@ void _starpu_graph_add_job_dep(struct _starpu_job *job, struct _starpu_job *prev
 	_starpu_graph_wrlock();
 	_starpu_graph_wrlock();
 	struct _starpu_graph_node *node = job->graph_node;
 	struct _starpu_graph_node *node = job->graph_node;
 	struct _starpu_graph_node *prev_node = prev_job->graph_node;
 	struct _starpu_graph_node *prev_node = prev_job->graph_node;
+	if (!node || !prev_node)
+		return;
 
 
 	if (_starpu_graph_node_multilist_queued_bottom(prev_node))
 	if (_starpu_graph_node_multilist_queued_bottom(prev_node))
 		/* Previous node is not at bottom any more */
 		/* Previous node is not at bottom any more */
@@ -217,6 +227,8 @@ void _starpu_graph_drop_job(struct _starpu_job *job)
 {
 {
 	struct _starpu_graph_node *node = job->graph_node;
 	struct _starpu_graph_node *node = job->graph_node;
 	job->graph_node = NULL;
 	job->graph_node = NULL;
+	if (!node)
+		return;
 
 
 	STARPU_PTHREAD_MUTEX_LOCK(&node->mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&node->mutex);
 	/* Will not be able to use the job any more */
 	/* Will not be able to use the job any more */
@@ -224,16 +236,15 @@ void _starpu_graph_drop_job(struct _starpu_job *job)
 	STARPU_PTHREAD_MUTEX_UNLOCK(&node->mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&node->mutex);
 
 
 	STARPU_PTHREAD_MUTEX_LOCK(&dropped_lock);
 	STARPU_PTHREAD_MUTEX_LOCK(&dropped_lock);
+	/* Queue for removal when lock becomes available */
+	_starpu_graph_node_multilist_push_back_dropped(&dropped, node);
 	if (STARPU_PTHREAD_RWLOCK_TRYWRLOCK(&graph_lock) == 0)
 	if (STARPU_PTHREAD_RWLOCK_TRYWRLOCK(&graph_lock) == 0)
 	{
 	{
-		/* Graph wrlock is available, drop node immediately */
-		_starpu_graph_drop_node(node);
-		STARPU_PTHREAD_RWLOCK_UNLOCK(&graph_lock);
+		/* Graph wrlock is available, drop nodes immediately */
+		_starpu_graph_drop_dropped_nodes();
 	}
 	}
 	else
 	else
-		/* Queue for removal when lock becomes available */
-		_starpu_graph_node_multilist_push_back_dropped(&dropped, node);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&dropped_lock);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&dropped_lock);
 }
 }
 
 
 static void _starpu_graph_set_n(void *data, struct _starpu_graph_node *node)
 static void _starpu_graph_set_n(void *data, struct _starpu_graph_node *node)

+ 1 - 1
src/common/list.h

@@ -271,7 +271,7 @@ static inline int ENAME##_multilist_queued_##MEMBER(TYPE *e) { \
 \
 \
 /* Test whether the list is empty.  */ \
 /* Test whether the list is empty.  */ \
 static inline int ENAME##_multilist_empty_##MEMBER(struct ENAME##_multilist_##MEMBER *head) { \
 static inline int ENAME##_multilist_empty_##MEMBER(struct ENAME##_multilist_##MEMBER *head) { \
-	return head->next != head; \
+	return head->next == head; \
 } \
 } \
 \
 \
 /* Return the first element of the list.  */ \
 /* Return the first element of the list.  */ \

+ 9 - 3
src/common/thread.c

@@ -20,7 +20,11 @@
 #include <core/workers.h>
 #include <core/workers.h>
 
 
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
+#ifdef STARPU_HAVE_XBT_SYNCHRO_H
+#include <xbt/synchro.h>
+#else
 #include <xbt/synchro_core.h>
 #include <xbt/synchro_core.h>
+#endif
 #include <smpi/smpi.h>
 #include <smpi/smpi.h>
 #else
 #else
 
 
@@ -53,7 +57,7 @@ int starpu_pthread_create_on(char *name, starpu_pthread_t *thread, const starpu_
 	_args[2] = NULL;
 	_args[2] = NULL;
 	if (!host)
 	if (!host)
 		host = MSG_get_host_by_name("MAIN");
 		host = MSG_get_host_by_name("MAIN");
-	*thread = MSG_process_create_with_arguments(name, _starpu_simgrid_thread_start, calloc(MAX_TSD, sizeof(void*)), host, 2, _args);
+	*thread = MSG_process_create_with_arguments(name, _starpu_simgrid_thread_start, calloc(MAX_TSD+1, sizeof(void*)), host, 2, _args);
 	return 0;
 	return 0;
 }
 }
 
 
@@ -181,6 +185,7 @@ int starpu_pthread_mutexattr_init(starpu_pthread_mutexattr_t *attr STARPU_ATTRIB
 }
 }
 
 
 
 
+/* Indexed by key-1 */
 static int used_key[MAX_TSD];
 static int used_key[MAX_TSD];
 
 
 int starpu_pthread_key_create(starpu_pthread_key_t *key, void (*destr_function) (void *) STARPU_ATTRIBUTE_UNUSED)
 int starpu_pthread_key_create(starpu_pthread_key_t *key, void (*destr_function) (void *) STARPU_ATTRIBUTE_UNUSED)
@@ -195,13 +200,14 @@ int starpu_pthread_key_create(starpu_pthread_key_t *key, void (*destr_function)
 			break;
 			break;
 		}
 		}
 	STARPU_ASSERT(i < MAX_TSD);
 	STARPU_ASSERT(i < MAX_TSD);
-	*key = i;
+	/* key 0 is for process pointer argument */
+	*key = i+1;
 	return 0;
 	return 0;
 }
 }
 
 
 int starpu_pthread_key_delete(starpu_pthread_key_t key)
 int starpu_pthread_key_delete(starpu_pthread_key_t key)
 {
 {
-	used_key[key] = 0;
+	used_key[key-1] = 0;
 	return 0;
 	return 0;
 }
 }
 
 

+ 13 - 0
src/common/uthash.h

@@ -147,9 +147,22 @@ do {
 #define HASH_ADD(hh,head,fieldname,keylen_in,add)                                \
 #define HASH_ADD(hh,head,fieldname,keylen_in,add)                                \
         HASH_ADD_KEYPTR(hh,head,&add->fieldname,keylen_in,add)
         HASH_ADD_KEYPTR(hh,head,&add->fieldname,keylen_in,add)
  
  
+#ifdef STARPU_DEBUG
+/* Check that we don't insert the same key several times */
+#define HASH_CHECK_KEY(hh,head,keyptr,keylen,out)                                \
+do {                                                                             \
+  __typeof__(out) _out;                                                          \
+  HASH_FIND(hh,head,keyptr,keylen,_out);                                         \
+  STARPU_ASSERT(!_out);                                                          \
+} while(0)
+#else
+#define HASH_CHECK_KEY(hh,head,keyptr,keylen,out)
+#endif
+
 #define HASH_ADD_KEYPTR(hh,head,keyptr,keylen_in,add)                            \
 #define HASH_ADD_KEYPTR(hh,head,keyptr,keylen_in,add)                            \
 do {                                                                             \
 do {                                                                             \
  unsigned _ha_bkt;                                                               \
  unsigned _ha_bkt;                                                               \
+ HASH_CHECK_KEY(hh,head,keyptr,keylen_in,add);                                   \
  (add)->hh.next = NULL;                                                          \
  (add)->hh.next = NULL;                                                          \
  (add)->hh.key = (char*)keyptr;                                                  \
  (add)->hh.key = (char*)keyptr;                                                  \
  (add)->hh.keylen = keylen_in;                                                   \
  (add)->hh.keylen = keylen_in;                                                   \

+ 11 - 26
src/common/utils.c

@@ -37,6 +37,9 @@
 #ifndef O_BINARY
 #ifndef O_BINARY
 #define O_BINARY 0
 #define O_BINARY 0
 #endif
 #endif
+#if !defined(O_DIRECT) && defined(F_NOCACHE)
+#define O_DIRECT F_NOCACHE
+#endif
 
 
 int _starpu_silent;
 int _starpu_silent;
 
 
@@ -157,14 +160,13 @@ char *_starpu_mktemp(const char *directory, int flags, int *fd)
 	*fd = open(baseCpy, flags);
 	*fd = open(baseCpy, flags);
 #elif defined (HAVE_MKOSTEMP)
 #elif defined (HAVE_MKOSTEMP)
 	*fd = mkostemp(baseCpy, flags);
 	*fd = mkostemp(baseCpy, flags);
-#elif defined (O_DIRECT)
+#else
+#  ifdef O_DIRECT
 	STARPU_ASSERT(flags == (O_RDWR | O_BINARY) || flags == (O_RDWR | O_BINARY | O_DIRECT));
 	STARPU_ASSERT(flags == (O_RDWR | O_BINARY) || flags == (O_RDWR | O_BINARY | O_DIRECT));
+#  else
+	STARPU_ASSERT(flags == (O_RDWR | O_BINARY));
+#  endif
 	*fd = mkstemp(baseCpy);
 	*fd = mkstemp(baseCpy);
-#elif defined (STARPU_HAVE_DARWIN) // MACOS
-	STARPU_ASSERT(flags == (O_RDWR | O_BINARY) || flags == (O_RDWR | O_BINARY | F_NOCACHE));
-	*fd = mkstemp(baseCpy);
-#else
-	/* nothing for now */
 #endif
 #endif
 
 
 	/* fail */
 	/* fail */
@@ -177,8 +179,8 @@ char *_starpu_mktemp(const char *directory, int flags, int *fd)
 		return NULL;
 		return NULL;
 	}
 	}
 
 
-#if !defined(STARPU_HAVE_WINDOWS) && !defined (HAVE_MKOSTEMP)
-#if defined (O_DIRECT)
+#if !defined(STARPU_HAVE_WINDOWS) && !defined (HAVE_MKOSTEMP) && defined(O_DIRECT)
+	/* Add O_DIRECT after the mkstemp call */
 	if ((flags & O_DIRECT) != 0)
 	if ((flags & O_DIRECT) != 0)
 	{
 	{
 		int flag = fcntl(*fd, F_GETFL);
 		int flag = fcntl(*fd, F_GETFL);
@@ -186,29 +188,12 @@ char *_starpu_mktemp(const char *directory, int flags, int *fd)
 		if (fcntl(*fd, F_SETFL, flag) < 0)
 		if (fcntl(*fd, F_SETFL, flag) < 0)
 		{
 		{
 			int err = errno;
 			int err = errno;
-			_STARPU_DISP("Could set O_DIRECT on the temporary file  in directory '%s', fcntl failed with error '%s'\n", directory, strerror(errno));
-			free(baseCpy);
-			errno = err;
-			return NULL;
-		}
-	}
-#elif defined (STARPU_HAVE_DARWIN) //MACOS
-	if ((flags & F_NOCACHE) != 0)
-	{
-		int flag = fcntl(*fd, F_GETFL);
-		//flag |= F_NOCACHE;
-		if (fcntl(*fd, F_SETFL, F_NOCACHE) < 0)
-		{
-			int err = errno;
-			_STARPU_DISP("Could set F_NOCACHE on the temporary file in  directory '%s', fcntl failed with error '%s'\n", directory, strerror(errno));
+			_STARPU_DISP("Could set O_DIRECT on the temporary file in directory '%s', fcntl failed with error '%s'\n", directory, strerror(errno));
 			free(baseCpy);
 			free(baseCpy);
 			errno = err;
 			errno = err;
 			return NULL;
 			return NULL;
 		}
 		}
 	}
 	}
-#else
-	/* nothing for now */
-#endif
 #endif
 #endif
 
 
 
 

+ 1 - 0
src/core/dependencies/tags.c

@@ -405,6 +405,7 @@ int starpu_tag_wait_array(unsigned ntags, starpu_tag_t *id)
 	/* It is forbidden to block within callbacks or codelets */
 	/* It is forbidden to block within callbacks or codelets */
 	STARPU_ASSERT_MSG(_starpu_worker_may_perform_blocking_calls(), "starpu_tag_wait must not be called from a task or callback");
 	STARPU_ASSERT_MSG(_starpu_worker_may_perform_blocking_calls(), "starpu_tag_wait must not be called from a task or callback");
 
 
+	starpu_do_schedule();
 	STARPU_PTHREAD_RWLOCK_WRLOCK(&tag_global_rwlock);
 	STARPU_PTHREAD_RWLOCK_WRLOCK(&tag_global_rwlock);
 	/* only wait the tags that are not done yet */
 	/* only wait the tags that are not done yet */
 	for (i = 0, current = 0; i < ntags; i++)
 	for (i = 0, current = 0; i < ntags; i++)

+ 2 - 2
src/core/perfmodel/multiple_regression.c

@@ -201,7 +201,7 @@ int dgels_multiple_reg_coeff(double *mpar, double *my, long nn, unsigned ncoeff,
 	for (i=0; i < m; i++)
 	for (i=0; i < m; i++)
 	{
 	{
 		Y[i] = my[i];
 		Y[i] = my[i];
-		X[i*n] = 1.;
+		X[i] = 1.;
 		for (j=1; j < n; j++)
 		for (j=1; j < n; j++)
 		{
 		{
 			coefficient = 1.;
 			coefficient = 1.;
@@ -209,7 +209,7 @@ int dgels_multiple_reg_coeff(double *mpar, double *my, long nn, unsigned ncoeff,
 			{
 			{
 				coefficient *= pow(mpar[i*nparameters+k],combinations[j-1][k]);
 				coefficient *= pow(mpar[i*nparameters+k],combinations[j-1][k]);
 			}
 			}
-			X[i*n+j] = coefficient;
+			X[i+j*m] = coefficient;
 		}
 		}
 	}
 	}
 
 

+ 4 - 0
src/core/perfmodel/perfmodel.c

@@ -4,6 +4,7 @@
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2016  Inria
  * Copyright (C) 2016  Inria
+ * Copyright (C) 2016  Uppsala University
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -59,6 +60,9 @@ struct starpu_perfmodel_arch* starpu_worker_get_perf_archtype(int workerid, unsi
 		unsigned child_sched_ctx = starpu_sched_ctx_worker_is_master_for_child_ctx(workerid, sched_ctx_id);
 		unsigned child_sched_ctx = starpu_sched_ctx_worker_is_master_for_child_ctx(workerid, sched_ctx_id);
 		if(child_sched_ctx != STARPU_NMAX_SCHED_CTXS)
 		if(child_sched_ctx != STARPU_NMAX_SCHED_CTXS)
 			return _starpu_sched_ctx_get_perf_archtype(child_sched_ctx);
 			return _starpu_sched_ctx_get_perf_archtype(child_sched_ctx);
+		struct _starpu_sched_ctx *stream_ctx = _starpu_worker_get_ctx_stream(workerid);
+		if(stream_ctx != NULL)
+			return _starpu_sched_ctx_get_perf_archtype(stream_ctx->id); 
 	}
 	}
 
 
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
 	struct _starpu_machine_config *config = _starpu_get_machine_config();

+ 124 - 17
src/core/sched_ctx.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2011, 2013  INRIA
  * Copyright (C) 2011, 2013  INRIA
+ * Copyright (C) 2016  Uppsala University
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -33,6 +34,7 @@ static size_t data_size[STARPU_NMAX_SCHED_CTXS][STARPU_NMAXWORKERS];
 static double hyp_actual_start_sample[STARPU_NMAX_SCHED_CTXS];
 static double hyp_actual_start_sample[STARPU_NMAX_SCHED_CTXS];
 static double window_size;
 static double window_size;
 static int nobind;
 static int nobind;
+static int occupied_sms = 0;
 
 
 static unsigned _starpu_get_first_free_sched_ctx(struct _starpu_machine_config *config);
 static unsigned _starpu_get_first_free_sched_ctx(struct _starpu_machine_config *config);
 static void _starpu_sched_ctx_add_workers_to_master(unsigned sched_ctx_id, int *workerids, int nworkers, int new_master);
 static void _starpu_sched_ctx_add_workers_to_master(unsigned sched_ctx_id, int *workerids, int nworkers, int new_master);
@@ -147,7 +149,7 @@ static void _starpu_add_workers_to_sched_ctx(struct _starpu_sched_ctx *sched_ctx
 					     int *added_workers, int *n_added_workers)
 					     int *added_workers, int *n_added_workers)
 {
 {
 	struct starpu_worker_collection *workers = sched_ctx->workers;
 	struct starpu_worker_collection *workers = sched_ctx->workers;
-	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
 
 
 	int nworkers_to_add = nworkers == -1 ? (int)config->topology.nworkers : nworkers;
 	int nworkers_to_add = nworkers == -1 ? (int)config->topology.nworkers : nworkers;
 	if (!nworkers_to_add)
 	if (!nworkers_to_add)
@@ -297,7 +299,10 @@ static void _starpu_add_workers_to_sched_ctx(struct _starpu_sched_ctx *sched_ctx
 			{
 			{
 				sched_ctx->perf_arch.devices[sched_ctx->perf_arch.ndevices].type = devices[dev1].type;
 				sched_ctx->perf_arch.devices[sched_ctx->perf_arch.ndevices].type = devices[dev1].type;
 				sched_ctx->perf_arch.devices[sched_ctx->perf_arch.ndevices].devid = devices[dev1].devid;
 				sched_ctx->perf_arch.devices[sched_ctx->perf_arch.ndevices].devid = devices[dev1].devid;
-				sched_ctx->perf_arch.devices[sched_ctx->perf_arch.ndevices].ncores = devices[dev1].ncores;
+				if (sched_ctx->stream_worker != -1)
+					sched_ctx->perf_arch.devices[sched_ctx->perf_arch.ndevices].ncores = sched_ctx->nsms;
+				else
+					sched_ctx->perf_arch.devices[sched_ctx->perf_arch.ndevices].ncores = devices[dev1].ncores;
 				sched_ctx->perf_arch.ndevices++;
 				sched_ctx->perf_arch.ndevices++;
 			}
 			}
 			else
 			else
@@ -472,9 +477,10 @@ struct _starpu_sched_ctx* _starpu_create_sched_ctx(struct starpu_sched_policy *p
 						   int max_prio_set, int max_prio,
 						   int max_prio_set, int max_prio,
 						   unsigned awake_workers,
 						   unsigned awake_workers,
 						   void (*sched_policy_init)(unsigned),
 						   void (*sched_policy_init)(unsigned),
-						   void * user_data)
+						   void * user_data,
+						   int nsub_ctxs, int *sub_ctxs, int nsms)
 {
 {
-	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
 
 
 	STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx_manag);
 	STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx_manag);
 	STARPU_ASSERT(config->topology.nsched_ctxs < STARPU_NMAX_SCHED_CTXS);
 	STARPU_ASSERT(config->topology.nsched_ctxs < STARPU_NMAX_SCHED_CTXS);
@@ -526,6 +532,24 @@ struct _starpu_sched_ctx* _starpu_create_sched_ctx(struct starpu_sched_policy *p
 	sched_ctx->perf_arch.ndevices = 0;
 	sched_ctx->perf_arch.ndevices = 0;
 	sched_ctx->init_sched = sched_policy_init;
 	sched_ctx->init_sched = sched_policy_init;
 	sched_ctx->user_data = user_data;
 	sched_ctx->user_data = user_data;
+	sched_ctx->sms_start_idx = 0;
+	sched_ctx->sms_end_idx = STARPU_NMAXSMS;
+	sched_ctx->nsms = nsms;
+	sched_ctx->stream_worker = -1;
+	if(nsms > 0)
+	{
+		STARPU_ASSERT_MSG(workerids, "workerids is needed when setting nsms");
+		sched_ctx->sms_start_idx = occupied_sms;
+		sched_ctx->sms_end_idx = occupied_sms+nsms;
+		occupied_sms += nsms;
+		_STARPU_DEBUG("ctx %d: stream worker %d nsms %d ocupied sms %d\n", sched_ctx->id, workerids[0], nsms, occupied_sms);
+		STARPU_ASSERT_MSG(occupied_sms <= STARPU_NMAXSMS , "STARPU:requested more sms than available");
+		_starpu_worker_set_stream_ctx(workerids[0], sched_ctx);
+		sched_ctx->stream_worker = workerids[0];
+	}
+
+	sched_ctx->nsub_ctxs = 0;
+
 	int w;
 	int w;
 	for(w = 0; w < nworkers; w++)
 	for(w = 0; w < nworkers; w++)
 	{
 	{
@@ -565,6 +589,15 @@ struct _starpu_sched_ctx* _starpu_create_sched_ctx(struct starpu_sched_policy *p
 		  }
 		  }
 	}
 	}
 
 
+        /*add sub_ctxs before add workers, in order to be able to associate them if necessary */
+	if(nsub_ctxs != 0)
+	{
+		int i;
+		for(i = 0; i < nsub_ctxs; i++)
+			sched_ctx->sub_ctxs[i] = sub_ctxs[i];
+		sched_ctx->nsub_ctxs = nsub_ctxs;
+	}
+	
 	/* after having an worker_collection on the ressources add them */
 	/* after having an worker_collection on the ressources add them */
 	_starpu_add_workers_to_sched_ctx(sched_ctx, workerids, nworkers_ctx, NULL, NULL);
 	_starpu_add_workers_to_sched_ctx(sched_ctx, workerids, nworkers_ctx, NULL, NULL);
 
 
@@ -595,7 +628,7 @@ static void _get_workers(int min, int max, int *workers, int *nw, enum starpu_wo
 	int npus = 0;
 	int npus = 0;
 	int i;
 	int i;
 
 
-	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
 	if(config->topology.nsched_ctxs == 1)
 	if(config->topology.nsched_ctxs == 1)
 	{
 	{
 		/*we have all available resources */
 		/*we have all available resources */
@@ -709,7 +742,7 @@ unsigned starpu_sched_ctx_create_inside_interval(const char *policy_name, const
 						 int min_ncpus, int max_ncpus, int min_ngpus, int max_ngpus,
 						 int min_ncpus, int max_ncpus, int min_ngpus, int max_ngpus,
 						 unsigned allow_overlap)
 						 unsigned allow_overlap)
 {
 {
-	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
 	struct starpu_sched_policy *selected_policy = _starpu_select_sched_policy(config, policy_name);
 	struct starpu_sched_policy *selected_policy = _starpu_select_sched_policy(config, policy_name);
 
 
 	struct _starpu_sched_ctx *sched_ctx = NULL;
 	struct _starpu_sched_ctx *sched_ctx = NULL;
@@ -724,7 +757,7 @@ unsigned starpu_sched_ctx_create_inside_interval(const char *policy_name, const
 	for(i = 0; i < nw; i++)
 	for(i = 0; i < nw; i++)
 		printf("%d ", workers[i]);
 		printf("%d ", workers[i]);
 	printf("\n");
 	printf("\n");
-	sched_ctx = _starpu_create_sched_ctx(selected_policy, workers, nw, 0, sched_ctx_name, 0, 0, 0, 0, 1, NULL, NULL);
+	sched_ctx = _starpu_create_sched_ctx(selected_policy, workers, nw, 0, sched_ctx_name, 0, 0, 0, 0, 1, NULL, NULL,0, NULL, 0);
 	sched_ctx->min_ncpus = min_ncpus;
 	sched_ctx->min_ncpus = min_ncpus;
 	sched_ctx->max_ncpus = max_ncpus;
 	sched_ctx->max_ncpus = max_ncpus;
 	sched_ctx->min_ngpus = min_ngpus;
 	sched_ctx->min_ngpus = min_ngpus;
@@ -742,6 +775,45 @@ unsigned starpu_sched_ctx_create_inside_interval(const char *policy_name, const
 
 
 }
 }
 
 
+int starpu_sched_ctx_get_nsms(unsigned sched_ctx)
+{
+	struct _starpu_sched_ctx *sc = _starpu_get_sched_ctx_struct(sched_ctx);
+	return sc->nsms;
+}
+
+void starpu_sched_ctx_get_sms_interval(int stream_workerid, int *start, int *end)
+{
+	struct _starpu_sched_ctx *sc = _starpu_worker_get_ctx_stream(stream_workerid);
+	*start = sc->sms_start_idx;
+	*end = sc->sms_end_idx;
+}
+
+int starpu_sched_ctx_get_sub_ctxs(unsigned sched_ctx, int *ctxs)
+{
+	struct _starpu_sched_ctx *sc = _starpu_get_sched_ctx_struct(sched_ctx);
+	int i;
+	for(i = 0; i < sc->nsub_ctxs; i++)
+		    ctxs[i] = sc->sub_ctxs[i];
+	return sc->nsub_ctxs;
+}
+
+int starpu_sched_ctx_get_stream_worker(unsigned sub_ctx)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sub_ctx);
+	struct starpu_worker_collection *workers = sched_ctx->workers;
+
+	struct starpu_sched_ctx_iterator it;
+	int worker = -1;
+	
+	workers->init_iterator(workers, &it);
+	if(workers->has_next(workers, &it))
+	{
+		worker = workers->get_next(workers, &it);
+	}
+
+	return worker;
+}
+
 unsigned starpu_sched_ctx_create(int *workerids, int nworkers, const char *sched_ctx_name, ...)
 unsigned starpu_sched_ctx_create(int *workerids, int nworkers, const char *sched_ctx_name, ...)
 {
 {
 	va_list varg_list;
 	va_list varg_list;
@@ -750,6 +822,9 @@ unsigned starpu_sched_ctx_create(int *workerids, int nworkers, const char *sched
 	int max_prio_set = 0;
 	int max_prio_set = 0;
 	int min_prio = 0;
 	int min_prio = 0;
 	int max_prio = 0;
 	int max_prio = 0;
+	int nsms = 0;
+        int *sub_ctxs = NULL;
+        int nsub_ctxs = 0;
 	void *user_data = NULL;
 	void *user_data = NULL;
 	struct starpu_sched_policy *sched_policy = NULL;
 	struct starpu_sched_policy *sched_policy = NULL;
 	unsigned hierarchy_level = 0;
 	unsigned hierarchy_level = 0;
@@ -763,7 +838,7 @@ unsigned starpu_sched_ctx_create(int *workerids, int nworkers, const char *sched
 		if (arg_type == STARPU_SCHED_CTX_POLICY_NAME)
 		if (arg_type == STARPU_SCHED_CTX_POLICY_NAME)
 		{
 		{
 			char *policy_name = va_arg(varg_list, char *);
 			char *policy_name = va_arg(varg_list, char *);
-			struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+			struct _starpu_machine_config *config = _starpu_get_machine_config();
 			sched_policy = _starpu_select_sched_policy(config, policy_name);
 			sched_policy = _starpu_select_sched_policy(config, policy_name);
 		}
 		}
 		else if (arg_type == STARPU_SCHED_CTX_POLICY_STRUCT)
 		else if (arg_type == STARPU_SCHED_CTX_POLICY_STRUCT)
@@ -800,6 +875,15 @@ unsigned starpu_sched_ctx_create(int *workerids, int nworkers, const char *sched
 		{
 		{
 			user_data = va_arg(varg_list, void *);
 			user_data = va_arg(varg_list, void *);
 		}
 		}
+		else if (arg_type == STARPU_SCHED_CTX_SUB_CTXS)
+		{
+			sub_ctxs = va_arg(varg_list, int*);
+			nsub_ctxs = va_arg(varg_list, int);
+		}
+		else if (arg_type == STARPU_SCHED_CTX_CUDA_NSMS)
+		{
+			nsms = va_arg(varg_list, int);
+		}
 		else
 		else
 		{
 		{
 			STARPU_ABORT_MSG("Unrecognized argument %d\n", arg_type);
 			STARPU_ABORT_MSG("Unrecognized argument %d\n", arg_type);
@@ -824,7 +908,7 @@ unsigned starpu_sched_ctx_create(int *workerids, int nworkers, const char *sched
 	}
 	}
 
 
 	struct _starpu_sched_ctx *sched_ctx = NULL;
 	struct _starpu_sched_ctx *sched_ctx = NULL;
-	sched_ctx = _starpu_create_sched_ctx(sched_policy, workerids, nworkers, 0, sched_ctx_name, min_prio_set, min_prio, max_prio_set, max_prio, awake_workers, init_sched, user_data);
+	sched_ctx = _starpu_create_sched_ctx(sched_policy, workerids, nworkers, 0, sched_ctx_name, min_prio_set, min_prio, max_prio_set, max_prio, awake_workers, init_sched, user_data, nsub_ctxs, sub_ctxs, nsms);
 	sched_ctx->hierarchy_level = hierarchy_level;
 	sched_ctx->hierarchy_level = hierarchy_level;
 	sched_ctx->nesting_sched_ctx = nesting_sched_ctx;
 	sched_ctx->nesting_sched_ctx = nesting_sched_ctx;
 
 
@@ -848,6 +932,9 @@ int fstarpu_sched_ctx_create(int *workerids, int nworkers, const char *sched_ctx
 	int max_prio_set = 0;
 	int max_prio_set = 0;
 	int min_prio = 0;
 	int min_prio = 0;
 	int max_prio = 0;
 	int max_prio = 0;
+	int nsms = 0;
+        int *sub_ctxs = NULL;
+        int nsub_ctxs = 0;
 	void *user_data = NULL;
 	void *user_data = NULL;
 	struct starpu_sched_policy *sched_policy = NULL;
 	struct starpu_sched_policy *sched_policy = NULL;
 	unsigned hierarchy_level = 0;
 	unsigned hierarchy_level = 0;
@@ -862,7 +949,7 @@ int fstarpu_sched_ctx_create(int *workerids, int nworkers, const char *sched_ctx
 		{
 		{
 			arg_i++;
 			arg_i++;
 			char *policy_name = arglist[arg_i];
 			char *policy_name = arglist[arg_i];
-			struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+			struct _starpu_machine_config *config = _starpu_get_machine_config();
 			sched_policy = _starpu_select_sched_policy(config, policy_name);
 			sched_policy = _starpu_select_sched_policy(config, policy_name);
 		}
 		}
 		else if (arg_type == STARPU_SCHED_CTX_POLICY_STRUCT)
 		else if (arg_type == STARPU_SCHED_CTX_POLICY_STRUCT)
@@ -910,6 +997,19 @@ int fstarpu_sched_ctx_create(int *workerids, int nworkers, const char *sched_ctx
 			arg_i++;
 			arg_i++;
 			user_data = arglist[arg_i];
 			user_data = arglist[arg_i];
 		}
 		}
+		else if (arg_type == STARPU_SCHED_CTX_SUB_CTXS)
+		{
+			arg_i++;
+			sub_ctxs = (int*)arglist[arg_i]; 
+			arg_i++;
+			nsub_ctxs = *(int*)arglist[arg_i]; 
+		}
+		else if (arg_type == STARPU_SCHED_CTX_CUDA_NSMS)
+		{
+			arg_i++;
+			nsms = *(int*)arglist[arg_i]; 
+		}
+
 		else
 		else
 		{
 		{
 			STARPU_ABORT_MSG("Unrecognized argument %d\n", arg_type);
 			STARPU_ABORT_MSG("Unrecognized argument %d\n", arg_type);
@@ -933,7 +1033,7 @@ int fstarpu_sched_ctx_create(int *workerids, int nworkers, const char *sched_ctx
 	}
 	}
 
 
 	struct _starpu_sched_ctx *sched_ctx = NULL;
 	struct _starpu_sched_ctx *sched_ctx = NULL;
-	sched_ctx = _starpu_create_sched_ctx(sched_policy, workerids, nworkers, 0, sched_ctx_name, min_prio_set, min_prio, max_prio_set, max_prio, awake_workers, init_sched, user_data);
+	sched_ctx = _starpu_create_sched_ctx(sched_policy, workerids, nworkers, 0, sched_ctx_name, min_prio_set, min_prio, max_prio_set, max_prio, awake_workers, init_sched, user_data, nsub_ctxs, sub_ctxs, nsms);
 	sched_ctx->hierarchy_level = hierarchy_level;
 	sched_ctx->hierarchy_level = hierarchy_level;
 	sched_ctx->nesting_sched_ctx = nesting_sched_ctx;
 	sched_ctx->nesting_sched_ctx = nesting_sched_ctx;
 
 
@@ -1014,6 +1114,8 @@ static void _starpu_delete_sched_ctx(struct _starpu_sched_ctx *sched_ctx)
 void starpu_sched_ctx_delete(unsigned sched_ctx_id)
 void starpu_sched_ctx_delete(unsigned sched_ctx_id)
 {
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	STARPU_ASSERT(sched_ctx);
+
 #ifdef STARPU_USE_SC_HYPERVISOR
 #ifdef STARPU_USE_SC_HYPERVISOR
 	if (sched_ctx_id != 0 && sched_ctx_id != STARPU_NMAX_SCHED_CTXS && sched_ctx->perf_counters != NULL)
 	if (sched_ctx_id != 0 && sched_ctx_id != STARPU_NMAX_SCHED_CTXS && sched_ctx->perf_counters != NULL)
 	{
 	{
@@ -1035,7 +1137,7 @@ void starpu_sched_ctx_delete(unsigned sched_ctx_id)
 
 
 	/*if both of them have all the ressources is pointless*/
 	/*if both of them have all the ressources is pointless*/
 	/*trying to transfer ressources from one ctx to the other*/
 	/*trying to transfer ressources from one ctx to the other*/
-	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
 	unsigned nworkers = config->topology.nworkers;
 	unsigned nworkers = config->topology.nworkers;
 
 
 	if(nworkers_ctx > 0 && inheritor_sched_ctx && inheritor_sched_ctx->id != STARPU_NMAX_SCHED_CTXS &&
 	if(nworkers_ctx > 0 && inheritor_sched_ctx && inheritor_sched_ctx->id != STARPU_NMAX_SCHED_CTXS &&
@@ -1062,6 +1164,7 @@ void starpu_sched_ctx_delete(unsigned sched_ctx_id)
 	   you don't use it anymore */
 	   you don't use it anymore */
 	free(workerids);
 	free(workerids);
 	_starpu_relock_mutex_if_prev_locked();
 	_starpu_relock_mutex_if_prev_locked();
+	occupied_sms -= sched_ctx->nsms;
 	return;
 	return;
 }
 }
 
 
@@ -1090,7 +1193,7 @@ void _starpu_delete_all_sched_ctxs()
 
 
 static void _starpu_check_workers(int *workerids, int nworkers)
 static void _starpu_check_workers(int *workerids, int nworkers)
 {
 {
-	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
 	int nworkers_conf = config->topology.nworkers;
 	int nworkers_conf = config->topology.nworkers;
 
 
 	int i;
 	int i;
@@ -1372,7 +1475,7 @@ int _starpu_wait_for_n_submitted_tasks_of_sched_ctx(unsigned sched_ctx_id, unsig
 
 
 void _starpu_decrement_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
 void _starpu_decrement_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
 {
 {
-	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
 #ifndef STARPU_SANITIZE_THREAD
 #ifndef STARPU_SANITIZE_THREAD
 	if (!config->watchdog_ok)
 	if (!config->watchdog_ok)
 		config->watchdog_ok = 1;
 		config->watchdog_ok = 1;
@@ -1811,7 +1914,7 @@ unsigned starpu_sched_ctx_contains_type_of_worker(enum starpu_worker_archtype ar
 
 
 unsigned _starpu_worker_belongs_to_a_sched_ctx(int workerid, unsigned sched_ctx_id)
 unsigned _starpu_worker_belongs_to_a_sched_ctx(int workerid, unsigned sched_ctx_id)
 {
 {
-	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
 	int i;
 	int i;
 	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
 	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
 	{
 	{
@@ -2092,7 +2195,8 @@ void starpu_sched_ctx_revert_task_counters(unsigned sched_ctx_id, double ready_f
         _starpu_decrement_nready_tasks_of_sched_ctx(sched_ctx_id, ready_flops);
         _starpu_decrement_nready_tasks_of_sched_ctx(sched_ctx_id, ready_flops);
 }
 }
 
 
-void starpu_sched_ctx_move_task_to_ctx(struct starpu_task *task, unsigned sched_ctx, unsigned manage_mutex)
+void starpu_sched_ctx_move_task_to_ctx(struct starpu_task *task, unsigned sched_ctx, unsigned manage_mutex, 
+				       unsigned with_repush)
 {
 {
 	/* TODO: make something cleaner which differentiates between calls
 	/* TODO: make something cleaner which differentiates between calls
 	   from push or pop (have mutex or not) and from another worker or not */
 	   from push or pop (have mutex or not) and from another worker or not */
@@ -2111,7 +2215,10 @@ void starpu_sched_ctx_move_task_to_ctx(struct starpu_task *task, unsigned sched_
 
 
 	_starpu_increment_nsubmitted_tasks_of_sched_ctx(j->task->sched_ctx);
 	_starpu_increment_nsubmitted_tasks_of_sched_ctx(j->task->sched_ctx);
 
 
-	_starpu_repush_task(j);
+	if(with_repush)
+		_starpu_repush_task(j);
+	else
+		_starpu_increment_nready_tasks_of_sched_ctx(j->task->sched_ctx, j->task->flops, j->task);
 
 
 	if(workerid != -1 && manage_mutex)
 	if(workerid != -1 && manage_mutex)
 		STARPU_PTHREAD_MUTEX_LOCK_SCHED(&worker->sched_mutex);
 		STARPU_PTHREAD_MUTEX_LOCK_SCHED(&worker->sched_mutex);

+ 14 - 2
src/core/sched_ctx.h

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2011, 2013  INRIA
  * Copyright (C) 2011, 2013  INRIA
+ * Copyright (C) 2016  Uppsala University
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -36,7 +37,7 @@
 #define DO_RESIZE 1
 #define DO_RESIZE 1
 
 
 #define STARPU_GLOBAL_SCHED_CTX 0
 #define STARPU_GLOBAL_SCHED_CTX 0
-
+#define STARPU_NMAXSMS 13
 struct _starpu_sched_ctx
 struct _starpu_sched_ctx
 {
 {
 	/* id of the context used in user mode*/
 	/* id of the context used in user mode*/
@@ -174,6 +175,16 @@ struct _starpu_sched_ctx
 
 
 	/* function called when initializing the scheduler */
 	/* function called when initializing the scheduler */
 	void (*init_sched)(unsigned);
 	void (*init_sched)(unsigned);
+
+	int sub_ctxs[STARPU_NMAXWORKERS];
+	int nsub_ctxs;
+
+	/* nr of SMs assigned to this ctx if we partition gpus*/
+	int nsms;
+	int sms_start_idx;
+	int sms_end_idx;
+
+	int stream_worker;
 };
 };
 
 
 struct _starpu_machine_config;
 struct _starpu_machine_config;
@@ -184,7 +195,8 @@ void _starpu_init_all_sched_ctxs(struct _starpu_machine_config *config);
 /* allocate all structures belonging to a context */
 /* allocate all structures belonging to a context */
 struct _starpu_sched_ctx*  _starpu_create_sched_ctx(struct starpu_sched_policy *policy, int *workerid, int nworkerids, unsigned is_init_sched, const char *sched_name,
 struct _starpu_sched_ctx*  _starpu_create_sched_ctx(struct starpu_sched_policy *policy, int *workerid, int nworkerids, unsigned is_init_sched, const char *sched_name,
 						    int min_prio_set, int min_prio,
 						    int min_prio_set, int min_prio,
-						    int max_prio_set, int max_prio, unsigned awake_workers, void (*sched_policy_init)(unsigned), void *user_data);
+						    int max_prio_set, int max_prio, unsigned awake_workers, void (*sched_policy_init)(unsigned), void *user_data,
+							int nsub_ctxs, int *sub_ctxs, int nsms);
 
 
 /* delete all sched_ctx */
 /* delete all sched_ctx */
 void _starpu_delete_all_sched_ctxs();
 void _starpu_delete_all_sched_ctxs();

+ 59 - 8
src/core/sched_policy.c

@@ -3,6 +3,7 @@
  * Copyright (C) 2010-2016  Université de Bordeaux
  * Copyright (C) 2010-2016  Université de Bordeaux
  * Copyright (C) 2010-2016  CNRS
  * Copyright (C) 2010-2016  CNRS
  * Copyright (C) 2011, 2016  INRIA
  * Copyright (C) 2011, 2016  INRIA
+ * Copyright (C) 2016  Uppsala University
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -31,12 +32,14 @@ static double idle_start[STARPU_NMAXWORKERS];
 long _starpu_task_break_on_push = -1;
 long _starpu_task_break_on_push = -1;
 long _starpu_task_break_on_pop = -1;
 long _starpu_task_break_on_pop = -1;
 long _starpu_task_break_on_sched = -1;
 long _starpu_task_break_on_sched = -1;
+static const char *starpu_idle_file;
 
 
 void _starpu_sched_init(void)
 void _starpu_sched_init(void)
 {
 {
 	_starpu_task_break_on_push = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_PUSH", -1);
 	_starpu_task_break_on_push = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_PUSH", -1);
 	_starpu_task_break_on_pop = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_POP", -1);
 	_starpu_task_break_on_pop = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_POP", -1);
 	_starpu_task_break_on_sched = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_SCHED", -1);
 	_starpu_task_break_on_sched = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_SCHED", -1);
+	starpu_idle_file = starpu_getenv("STARPU_IDLE_FILE");
 }
 }
 
 
 int starpu_get_prefetch_flag(void)
 int starpu_get_prefetch_flag(void)
@@ -882,11 +885,12 @@ pick:
 
 
 	if (!task)
 	if (!task)
 	{
 	{
-		idle_start[worker->workerid] = starpu_timing_now();
+		if (starpu_idle_file)
+			idle_start[worker->workerid] = starpu_timing_now();
 		return NULL;
 		return NULL;
 	}
 	}
 
 
-	if(idle_start[worker->workerid] != 0.0)
+	if(starpu_idle_file && idle_start[worker->workerid] != 0.0)
 	{
 	{
 		double idle_end = starpu_timing_now();
 		double idle_end = starpu_timing_now();
 		idle[worker->workerid] += (idle_end - idle_start[worker->workerid]);
 		idle[worker->workerid] += (idle_end - idle_start[worker->workerid]);
@@ -1009,9 +1013,34 @@ void _starpu_sched_pre_exec_hook(struct starpu_task *task)
 	if (sched_ctx->sched_policy && sched_ctx->sched_policy->pre_exec_hook)
 	if (sched_ctx->sched_policy && sched_ctx->sched_policy->pre_exec_hook)
 	{
 	{
 		_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 		_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
-		sched_ctx->sched_policy->pre_exec_hook(task);
+		sched_ctx->sched_policy->pre_exec_hook(task, sched_ctx_id);
 		_STARPU_TRACE_WORKER_SCHEDULING_POP;
 		_STARPU_TRACE_WORKER_SCHEDULING_POP;
 	}
 	}
+
+	if(!sched_ctx->sched_policy)
+	{
+		int workerid = starpu_worker_get_id();
+		struct _starpu_worker *worker =  _starpu_get_worker_struct(workerid);
+		struct _starpu_sched_ctx *other_sched_ctx;
+		struct _starpu_sched_ctx_elt *e = NULL;
+		struct _starpu_sched_ctx_list_iterator list_it;
+		
+		_starpu_sched_ctx_list_iterator_init(worker->sched_ctx_list, &list_it);
+		while (_starpu_sched_ctx_list_iterator_has_next(&list_it))
+		{
+			e = _starpu_sched_ctx_list_iterator_get_next(&list_it);
+			other_sched_ctx = _starpu_get_sched_ctx_struct(e->sched_ctx);
+			if (other_sched_ctx != sched_ctx && 
+			    other_sched_ctx->sched_policy != NULL && 
+			    other_sched_ctx->sched_policy->pre_exec_hook)
+			{
+				_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
+				other_sched_ctx->sched_policy->pre_exec_hook(task, other_sched_ctx->id);
+				_STARPU_TRACE_WORKER_SCHEDULING_POP;
+			}
+		}
+	}
+
 }
 }
 
 
 void _starpu_sched_post_exec_hook(struct starpu_task *task)
 void _starpu_sched_post_exec_hook(struct starpu_task *task)
@@ -1021,9 +1050,32 @@ void _starpu_sched_post_exec_hook(struct starpu_task *task)
 	if (sched_ctx->sched_policy && sched_ctx->sched_policy->post_exec_hook)
 	if (sched_ctx->sched_policy && sched_ctx->sched_policy->post_exec_hook)
 	{
 	{
 		_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 		_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
-		sched_ctx->sched_policy->post_exec_hook(task);
+		sched_ctx->sched_policy->post_exec_hook(task, sched_ctx_id);
 		_STARPU_TRACE_WORKER_SCHEDULING_POP;
 		_STARPU_TRACE_WORKER_SCHEDULING_POP;
 	}
 	}
+	if(!sched_ctx->sched_policy)
+	{
+		int workerid = starpu_worker_get_id();
+		struct _starpu_worker *worker =  _starpu_get_worker_struct(workerid);
+		struct _starpu_sched_ctx *other_sched_ctx;
+		struct _starpu_sched_ctx_elt *e = NULL;
+		struct _starpu_sched_ctx_list_iterator list_it;
+		
+		_starpu_sched_ctx_list_iterator_init(worker->sched_ctx_list, &list_it);
+		while (_starpu_sched_ctx_list_iterator_has_next(&list_it))
+		{
+			e = _starpu_sched_ctx_list_iterator_get_next(&list_it);
+			other_sched_ctx = _starpu_get_sched_ctx_struct(e->sched_ctx);
+			if (other_sched_ctx != sched_ctx && 
+			    other_sched_ctx->sched_policy != NULL && 
+			    other_sched_ctx->sched_policy->post_exec_hook)
+			{
+				_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
+				other_sched_ctx->sched_policy->post_exec_hook(task, other_sched_ctx->id);
+				_STARPU_TRACE_WORKER_SCHEDULING_POP;
+			}
+		}
+	}
 }
 }
 
 
 void _starpu_wait_on_sched_event(void)
 void _starpu_wait_on_sched_event(void)
@@ -1059,8 +1111,7 @@ int starpu_push_local_task(int workerid, struct starpu_task *task, int prio)
 
 
 void _starpu_print_idle_time()
 void _starpu_print_idle_time()
 {
 {
-	const char *sched_env = starpu_getenv("STARPU_IDLE_FILE");
-	if(!sched_env)
+	if(!starpu_idle_file)
 		return;
 		return;
 	double all_idle = 0.0;
 	double all_idle = 0.0;
 	int i = 0;
 	int i = 0;
@@ -1068,10 +1119,10 @@ void _starpu_print_idle_time()
 		all_idle += idle[i];
 		all_idle += idle[i];
 
 
 	FILE *f;
 	FILE *f;
-	f = fopen(sched_env, "a");
+	f = fopen(starpu_idle_file, "a");
 	if (!f)
 	if (!f)
 	{
 	{
-		fprintf(stderr, "couldn't open %s: %s\n", sched_env, strerror(errno));
+		fprintf(stderr, "couldn't open %s: %s\n", starpu_idle_file, strerror(errno));
 	}
 	}
 	else
 	else
 	{
 	{

+ 32 - 17
src/core/simgrid.c

@@ -227,10 +227,9 @@ struct main_args
 };
 };
 static int main_ret;
 static int main_ret;
 
 
-int do_starpu_main(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[])
+int do_starpu_main(int argc, char *argv[])
 {
 {
-	struct main_args *args = (void*) argv;
-	main_ret = starpu_main(args->argc, args->argv);
+	main_ret = starpu_main(argc, argv);
 	return main_ret;
 	return main_ret;
 }
 }
 
 
@@ -249,11 +248,12 @@ int main(int argc, char **argv)
 	start_simgrid(&argc, argv);
 	start_simgrid(&argc, argv);
 
 
 	/* Create a simgrid process for main */
 	/* Create a simgrid process for main */
-	struct main_args *args;
-	_STARPU_MALLOC(args, sizeof(*args));
-	args->argc = argc;
-	args->argv = argv;
-	MSG_process_create_with_arguments("main", &do_starpu_main, calloc(MAX_TSD, sizeof(void*)), MSG_get_host_by_name("MAIN"), 0, (char**) args);
+	char **argv_cpy;
+	_STARPU_MALLOC(argv_cpy, argc * sizeof(char*));
+	int i;
+	for (i = 0; i < argc; i++)
+		argv_cpy[i] = strdup(argv[i]);
+	MSG_process_create_with_arguments("main", &do_starpu_main, calloc(MAX_TSD+1, sizeof(void*)), MSG_get_host_by_name("MAIN"), argc, argv_cpy);
 
 
 	/* And run maestro in main thread */
 	/* And run maestro in main thread */
 	MSG_main();
 	MSG_main();
@@ -265,7 +265,7 @@ static void maestro(void *data STARPU_ATTRIBUTE_UNUSED)
 	MSG_main();
 	MSG_main();
 }
 }
 
 
-void _starpu_simgrid_init(int *argc, char ***argv)
+void _starpu_simgrid_init(int *argc STARPU_ATTRIBUTE_UNUSED, char ***argv STARPU_ATTRIBUTE_UNUSED)
 {
 {
 #ifdef HAVE_MSG_PROCESS_ATTACH
 #ifdef HAVE_MSG_PROCESS_ATTACH
 	if (!simgrid_started && !(smpi_main && smpi_simulated_main_ != _starpu_smpi_simulated_main_))
 	if (!simgrid_started && !(smpi_main && smpi_simulated_main_ != _starpu_smpi_simulated_main_))
@@ -336,9 +336,9 @@ struct task
 static struct task *last_task[STARPU_NMAXWORKERS];
 static struct task *last_task[STARPU_NMAXWORKERS];
 
 
 /* Actually execute the task.  */
 /* Actually execute the task.  */
-static int task_execute(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[])
+static int task_execute(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[] STARPU_ATTRIBUTE_UNUSED)
 {
 {
-	struct task *task = (void*) argv;
+	struct task *task = starpu_pthread_getspecific(0);
 	_STARPU_DEBUG("task %p started\n", task);
 	_STARPU_DEBUG("task %p started\n", task);
 	MSG_task_execute(task->task);
 	MSG_task_execute(task->task);
 	MSG_task_destroy(task->task);
 	MSG_task_destroy(task->task);
@@ -354,7 +354,11 @@ static int task_execute(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[])
 	if (last_task[task->workerid] == task)
 	if (last_task[task->workerid] == task)
 		last_task[task->workerid] = NULL;
 		last_task[task->workerid] = NULL;
 	if (task->next)
 	if (task->next)
-		MSG_process_create_with_arguments("task", task_execute, calloc(MAX_TSD, sizeof(void*)), MSG_host_self(), 0, (char**) task->next);
+	{
+		void **tsd = calloc(MAX_TSD+1, sizeof(void*));
+		tsd[0] = task->next;
+		MSG_process_create_with_arguments("task", task_execute, tsd, MSG_host_self(), 0, NULL);
+	}
 	/* Task is freed with process context */
 	/* Task is freed with process context */
 	return 0;
 	return 0;
 }
 }
@@ -433,8 +437,11 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
 		}
 		}
 		else
 		else
 		{
 		{
+			void **tsd;
 			last_task[workerid] = task;
 			last_task[workerid] = task;
-			MSG_process_create_with_arguments("task", task_execute, calloc(MAX_TSD, sizeof(void*)), MSG_host_self(), 0, (char**) task);
+			tsd = calloc(MAX_TSD+1, sizeof(void*));
+			tsd[0] = task;
+			MSG_process_create_with_arguments("task", task_execute, tsd, MSG_host_self(), 0, NULL);
 		}
 		}
 	}
 	}
 }
 }
@@ -517,9 +524,9 @@ static int transfers_are_sequential(struct transfer *new_transfer, struct transf
 }
 }
 
 
 /* Actually execute the transfer, and then start transfers waiting for this one.  */
 /* Actually execute the transfer, and then start transfers waiting for this one.  */
-static int transfer_execute(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[])
+static int transfer_execute(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[] STARPU_ATTRIBUTE_UNUSED)
 {
 {
-	struct transfer *transfer = (void*) argv;
+	struct transfer *transfer = starpu_pthread_getspecific(0);
 	unsigned i;
 	unsigned i;
 	_STARPU_DEBUG("transfer %p started\n", transfer);
 	_STARPU_DEBUG("transfer %p started\n", transfer);
 	MSG_task_execute(transfer->task);
 	MSG_task_execute(transfer->task);
@@ -543,8 +550,11 @@ static int transfer_execute(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[])
 		wake->nwait--;
 		wake->nwait--;
 		if (!wake->nwait)
 		if (!wake->nwait)
 		{
 		{
+			void **tsd;
 			_STARPU_DEBUG("triggering transfer %p\n", wake);
 			_STARPU_DEBUG("triggering transfer %p\n", wake);
-			MSG_process_create_with_arguments("transfer task", transfer_execute, calloc(MAX_TSD, sizeof(void*)), _starpu_simgrid_get_host_by_name("MAIN"), 0, (char**) wake);
+			tsd = calloc(MAX_TSD+1, sizeof(void*));
+			tsd[0] = wake;
+			MSG_process_create_with_arguments("transfer task", transfer_execute, tsd, _starpu_simgrid_get_host_by_name("MAIN"), 0, NULL);
 		}
 		}
 	}
 	}
 
 
@@ -581,8 +591,11 @@ static void transfer_submit(struct transfer *transfer)
 
 
 	if (!transfer->nwait)
 	if (!transfer->nwait)
 	{
 	{
+		void **tsd;
 		_STARPU_DEBUG("transfer %p waits for nobody, starting\n", transfer);
 		_STARPU_DEBUG("transfer %p waits for nobody, starting\n", transfer);
-		MSG_process_create_with_arguments("transfer task", transfer_execute, calloc(MAX_TSD, sizeof(void*)), _starpu_simgrid_get_host_by_name("MAIN"), 0, (char**) transfer);
+		tsd = calloc(MAX_TSD+1, sizeof(void*));
+		tsd[0] = transfer;
+		MSG_process_create_with_arguments("transfer task", transfer_execute, tsd, _starpu_simgrid_get_host_by_name("MAIN"), 0, NULL);
 	}
 	}
 }
 }
 
 
@@ -706,6 +719,7 @@ _starpu_simgrid_get_memnode_host(unsigned node)
 
 
 void _starpu_simgrid_count_ngpus(void)
 void _starpu_simgrid_count_ngpus(void)
 {
 {
+#if defined(HAVE_SG_LINK_NAME) && SIMGRID_VERSION_MAJOR >= 4 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR >= 13)
 	unsigned src, dst;
 	unsigned src, dst;
 	msg_host_t ramhost = _starpu_simgrid_get_host_by_name("RAM");
 	msg_host_t ramhost = _starpu_simgrid_get_host_by_name("RAM");
 
 
@@ -784,5 +798,6 @@ void _starpu_simgrid_count_ngpus(void)
 			_STARPU_DEBUG("%d->%d through %s, %u GPUs\n", src, dst, name, ngpus);
 			_STARPU_DEBUG("%d->%d through %s, %u GPUs\n", src, dst, name, ngpus);
 			starpu_bus_set_ngpus(busid, ngpus);
 			starpu_bus_set_ngpus(busid, ngpus);
 		}
 		}
+#endif
 }
 }
 #endif
 #endif

+ 21 - 11
src/core/task.c

@@ -4,6 +4,7 @@
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011, 2014, 2016  INRIA
  * Copyright (C) 2011, 2014, 2016  INRIA
+ * Copyright (C) 2016  Uppsala University
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -94,6 +95,7 @@ void starpu_task_init(struct starpu_task *task)
 
 
 	task->predicted = NAN;
 	task->predicted = NAN;
 	task->predicted_transfer = NAN;
 	task->predicted_transfer = NAN;
+	task->predicted_start = NAN;
 
 
 	task->magic = 42;
 	task->magic = 42;
 	task->sched_ctx = STARPU_NMAX_SCHED_CTXS;
 	task->sched_ctx = STARPU_NMAX_SCHED_CTXS;
@@ -231,6 +233,7 @@ int starpu_task_wait(struct starpu_task *task)
 
 
 	_STARPU_TRACE_TASK_WAIT_START(j);
 	_STARPU_TRACE_TASK_WAIT_START(j);
 
 
+	starpu_do_schedule();
 	_starpu_wait_job(j);
 	_starpu_wait_job(j);
 
 
 	/* as this is a synchronous task, the liberation of the job
 	/* as this is a synchronous task, the liberation of the job
@@ -843,7 +846,7 @@ int _starpu_task_wait_for_all_and_return_nb_waited_tasks(void)
 		_STARPU_DEBUG("Waiting for all tasks\n");
 		_STARPU_DEBUG("Waiting for all tasks\n");
 		STARPU_ASSERT_MSG(_starpu_worker_may_perform_blocking_calls(), "starpu_task_wait_for_all must not be called from a task or callback");
 		STARPU_ASSERT_MSG(_starpu_worker_may_perform_blocking_calls(), "starpu_task_wait_for_all must not be called from a task or callback");
 		STARPU_AYU_BARRIER();
 		STARPU_AYU_BARRIER();
-		struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+		struct _starpu_machine_config *config = _starpu_get_machine_config();
 		if(config->topology.nsched_ctxs == 1)
 		if(config->topology.nsched_ctxs == 1)
 		{
 		{
 			_starpu_sched_do_schedule(0);
 			_starpu_sched_do_schedule(0);
@@ -916,7 +919,7 @@ int starpu_task_wait_for_n_submitted(unsigned n)
 		_STARPU_DEBUG("Waiting for all tasks\n");
 		_STARPU_DEBUG("Waiting for all tasks\n");
 		STARPU_ASSERT_MSG(_starpu_worker_may_perform_blocking_calls(), "starpu_task_wait_for_n_submitted must not be called from a task or callback");
 		STARPU_ASSERT_MSG(_starpu_worker_may_perform_blocking_calls(), "starpu_task_wait_for_n_submitted must not be called from a task or callback");
 
 
-		struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+		struct _starpu_machine_config *config = _starpu_get_machine_config();
 		if(config->topology.nsched_ctxs == 1)
 		if(config->topology.nsched_ctxs == 1)
 			_starpu_wait_for_n_submitted_tasks_of_sched_ctx(0, n);
 			_starpu_wait_for_n_submitted_tasks_of_sched_ctx(0, n);
 		else
 		else
@@ -955,9 +958,12 @@ int starpu_task_wait_for_no_ready(void)
 {
 {
 	STARPU_ASSERT_MSG(_starpu_worker_may_perform_blocking_calls(), "starpu_task_wait_for_no_ready must not be called from a task or callback");
 	STARPU_ASSERT_MSG(_starpu_worker_may_perform_blocking_calls(), "starpu_task_wait_for_no_ready must not be called from a task or callback");
 
 
-	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
 	if(config->topology.nsched_ctxs == 1)
 	if(config->topology.nsched_ctxs == 1)
+	{
+		_starpu_sched_do_schedule(0);
 		_starpu_wait_for_no_ready_of_sched_ctx(0);
 		_starpu_wait_for_no_ready_of_sched_ctx(0);
+	}
 	else
 	else
 	{
 	{
 		int s;
 		int s;
@@ -965,6 +971,13 @@ int starpu_task_wait_for_no_ready(void)
 		{
 		{
 			if(config->sched_ctxs[s].id != STARPU_NMAX_SCHED_CTXS)
 			if(config->sched_ctxs[s].id != STARPU_NMAX_SCHED_CTXS)
 			{
 			{
+				_starpu_sched_do_schedule(config->sched_ctxs[s].id);
+			}
+		}
+		for(s = 0; s < STARPU_NMAX_SCHED_CTXS; s++)
+		{
+			if(config->sched_ctxs[s].id != STARPU_NMAX_SCHED_CTXS)
+			{
 				_starpu_wait_for_no_ready_of_sched_ctx(config->sched_ctxs[s].id);
 				_starpu_wait_for_no_ready_of_sched_ctx(config->sched_ctxs[s].id);
 			}
 			}
 		}
 		}
@@ -975,7 +988,7 @@ int starpu_task_wait_for_no_ready(void)
 
 
 void starpu_do_schedule(void)
 void starpu_do_schedule(void)
 {
 {
-	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
 	if(config->topology.nsched_ctxs == 1)
 	if(config->topology.nsched_ctxs == 1)
 		_starpu_sched_do_schedule(0);
 		_starpu_sched_do_schedule(0);
 	else
 	else
@@ -1021,7 +1034,7 @@ starpu_drivers_request_termination(void)
 int starpu_task_nsubmitted(void)
 int starpu_task_nsubmitted(void)
 {
 {
 	int nsubmitted = 0;
 	int nsubmitted = 0;
-	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
 	if(config->topology.nsched_ctxs == 1)
 	if(config->topology.nsched_ctxs == 1)
 		nsubmitted = _starpu_get_nsubmitted_tasks_of_sched_ctx(0);
 		nsubmitted = _starpu_get_nsubmitted_tasks_of_sched_ctx(0);
 	else
 	else
@@ -1042,7 +1055,7 @@ int starpu_task_nsubmitted(void)
 int starpu_task_nready(void)
 int starpu_task_nready(void)
 {
 {
 	int nready = 0;
 	int nready = 0;
-	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
 	if(config->topology.nsched_ctxs == 1)
 	if(config->topology.nsched_ctxs == 1)
 		nready = starpu_sched_ctx_get_nready_tasks(0);
 		nready = starpu_sched_ctx_get_nready_tasks(0);
 	else
 	else
@@ -1200,7 +1213,7 @@ static void *watchdog_func(void *arg)
 #else
 #else
 	timeout = ((float) atoll(timeout_env)) / 1000000;
 	timeout = ((float) atoll(timeout_env)) / 1000000;
 #endif
 #endif
-	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
 	starpu_pthread_setname("watchdog");
 	starpu_pthread_setname("watchdog");
 
 
 	STARPU_PTHREAD_MUTEX_LOCK(&config->submitted_mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&config->submitted_mutex);
@@ -1217,11 +1230,8 @@ static void *watchdog_func(void *arg)
 		{
 		{
 			starpu_sleep(1.);
 			starpu_sleep(1.);
 			if (!_starpu_machine_is_running())
 			if (!_starpu_machine_is_running())
-			{
 				/* Application finished, don't bother finishing the sleep */
 				/* Application finished, don't bother finishing the sleep */
-				STARPU_PTHREAD_MUTEX_UNLOCK(&config->submitted_mutex);
 				return NULL;
 				return NULL;
-			}
 		}
 		}
 		/* and one final sleep (of less than 1 s) with the rest (if needed) */
 		/* and one final sleep (of less than 1 s) with the rest (if needed) */
 		if (t > 0.)
 		if (t > 0.)
@@ -1249,7 +1259,7 @@ static void *watchdog_func(void *arg)
 
 
 void _starpu_watchdog_init(void)
 void _starpu_watchdog_init(void)
 {
 {
-	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
 	char *timeout_env = starpu_getenv("STARPU_WATCHDOG_TIMEOUT");
 	char *timeout_env = starpu_getenv("STARPU_WATCHDOG_TIMEOUT");
 
 
 	STARPU_PTHREAD_MUTEX_INIT(&config->submitted_mutex, NULL);
 	STARPU_PTHREAD_MUTEX_INIT(&config->submitted_mutex, NULL);

+ 35 - 10
src/core/topology.c

@@ -3,6 +3,7 @@
  * Copyright (C) 2009-2016  Université de Bordeaux
  * Copyright (C) 2009-2016  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016 CNRS
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016 CNRS
  * Copyright (C) 2011, 2016  INRIA
  * Copyright (C) 2011, 2016  INRIA
+ * Copyright (C) 2016  Uppsala University
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -92,11 +93,6 @@ _starpu_get_worker_from_driver(struct starpu_driver *d)
 	unsigned nworkers = starpu_worker_get_count();
 	unsigned nworkers = starpu_worker_get_count();
 	unsigned workerid;
 	unsigned workerid;
 
 
-#ifdef STARPU_USE_CUDA
-	if (d->type == STARPU_CUDA_WORKER)
-		return &cuda_worker_set[d->id.cuda_id];
-#endif
-
 	for (workerid = 0; workerid < nworkers; workerid++)
 	for (workerid = 0; workerid < nworkers; workerid++)
 	{
 	{
 		if (starpu_worker_get_type(workerid) == d->type)
 		if (starpu_worker_get_type(workerid) == d->type)
@@ -121,6 +117,16 @@ _starpu_get_worker_from_driver(struct starpu_driver *d)
 				break;
 				break;
 			}
 			}
 #endif
 #endif
+#ifdef STARPU_USE_CUDA
+			case STARPU_CUDA_WORKER:
+			{
+				if (worker->devid == d->id.cuda_id)
+					return worker->set;
+				break;
+
+			}
+#endif
+
 			default:
 			default:
 				_STARPU_DEBUG("Invalid device type\n");
 				_STARPU_DEBUG("Invalid device type\n");
 				return NULL;
 				return NULL;
@@ -1199,17 +1205,28 @@ _starpu_init_machine_config(struct _starpu_machine_config *config, int no_mp_con
 
 
 	_starpu_initialize_workers_cuda_gpuid(config);
 	_starpu_initialize_workers_cuda_gpuid(config);
 
 
+	/* allow having one worker per stream */
+	unsigned th_per_stream = starpu_get_env_number_default("STARPU_CUDA_THREAD_PER_WORKER", 0);
+
 	unsigned cudagpu;
 	unsigned cudagpu;
 	for (cudagpu = 0; cudagpu < topology->ncudagpus; cudagpu++)
 	for (cudagpu = 0; cudagpu < topology->ncudagpus; cudagpu++)
 	{
 	{
 		int devid = _starpu_get_next_cuda_gpuid(config);
 		int devid = _starpu_get_next_cuda_gpuid(config);
 		int worker_idx0 = topology->nworkers + cudagpu * nworker_per_cuda;
 		int worker_idx0 = topology->nworkers + cudagpu * nworker_per_cuda;
 		cuda_worker_set[devid].workers = &config->workers[worker_idx0];
 		cuda_worker_set[devid].workers = &config->workers[worker_idx0];
+
 		for (i = 0; i < nworker_per_cuda; i++)
 		for (i = 0; i < nworker_per_cuda; i++)
 		{
 		{
 			int worker_idx = worker_idx0 + i;
 			int worker_idx = worker_idx0 + i;
+			if(th_per_stream)
+			{
+				/* Just one worker in the set */
+				config->workers[worker_idx].set = (struct _starpu_worker_set *)calloc(1, sizeof(struct _starpu_worker_set));
+				config->workers[worker_idx].set->workers = &config->workers[worker_idx];
+			}
+			else
+				config->workers[worker_idx].set = &cuda_worker_set[devid];
 
 
-			config->workers[worker_idx].set = &cuda_worker_set[devid];
 			config->workers[worker_idx].arch = STARPU_CUDA_WORKER;
 			config->workers[worker_idx].arch = STARPU_CUDA_WORKER;
 			_STARPU_MALLOC(config->workers[worker_idx].perf_arch.devices, sizeof(struct starpu_perfmodel_device));
 			_STARPU_MALLOC(config->workers[worker_idx].perf_arch.devices, sizeof(struct starpu_perfmodel_device));
 			config->workers[worker_idx].perf_arch.ndevices = 1;
 			config->workers[worker_idx].perf_arch.ndevices = 1;
@@ -1224,9 +1241,13 @@ _starpu_init_machine_config(struct _starpu_machine_config *config, int no_mp_con
 			config->worker_mask |= STARPU_CUDA;
 			config->worker_mask |= STARPU_CUDA;
 
 
 			struct handle_entry *entry;
 			struct handle_entry *entry;
-			_STARPU_MALLOC(entry, sizeof(*entry));
-			entry->gpuid = devid;
-			HASH_ADD_INT(devices_using_cuda, gpuid, entry);
+			HASH_FIND_INT(devices_using_cuda, &devid, entry);
+			if (!entry)
+			{
+				_STARPU_MALLOC(entry, sizeof(*entry));
+				entry->gpuid = devid;
+				HASH_ADD_INT(devices_using_cuda, gpuid, entry);
+			}
 		}
 		}
 
 
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
@@ -1689,6 +1710,7 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 	unsigned cuda_init[STARPU_MAXCUDADEVS] = { };
 	unsigned cuda_init[STARPU_MAXCUDADEVS] = { };
 	unsigned cuda_memory_nodes[STARPU_MAXCUDADEVS];
 	unsigned cuda_memory_nodes[STARPU_MAXCUDADEVS];
 	unsigned cuda_bindid[STARPU_MAXCUDADEVS];
 	unsigned cuda_bindid[STARPU_MAXCUDADEVS];
+	unsigned th_per_stream = starpu_get_env_number_default("STARPU_CUDA_THREAD_PER_WORKER", 0);
 #endif
 #endif
 #if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
 #if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
 	unsigned opencl_init[STARPU_MAXOPENCLDEVS] = { };
 	unsigned opencl_init[STARPU_MAXOPENCLDEVS] = { };
@@ -1777,7 +1799,10 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 				{
 				{
 					memory_node = cuda_memory_nodes[devid];
 					memory_node = cuda_memory_nodes[devid];
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
-					workerarg->bindid = cuda_bindid[devid];
+					if (th_per_stream == 0)
+						workerarg->bindid = cuda_bindid[devid];
+					else
+						workerarg->bindid = _starpu_get_next_bindid(config, preferred_binding, npreferred);
 #endif /* SIMGRID */
 #endif /* SIMGRID */
 				}
 				}
 				else
 				else

+ 110 - 16
src/core/workers.c

@@ -5,6 +5,7 @@
  * Copyright (C) 2010, 2011  INRIA
  * Copyright (C) 2010, 2011  INRIA
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011-2012, 2016  INRIA
  * Copyright (C) 2011-2012, 2016  INRIA
+ * Copyright (C) 2016  Uppsala University
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -54,7 +55,7 @@
 static starpu_pthread_mutex_t init_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
 static starpu_pthread_mutex_t init_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
 static starpu_pthread_cond_t init_cond = STARPU_PTHREAD_COND_INITIALIZER;
 static starpu_pthread_cond_t init_cond = STARPU_PTHREAD_COND_INITIALIZER;
 static int init_count = 0;
 static int init_count = 0;
-static enum { UNINITIALIZED, CHANGING, INITIALIZED } initialized = UNINITIALIZED;
+static enum initialization initialized = UNINITIALIZED;
 
 
 int _starpu_keys_initialized STARPU_ATTRIBUTE_INTERNAL;
 int _starpu_keys_initialized STARPU_ATTRIBUTE_INTERNAL;
 starpu_pthread_key_t _starpu_worker_key STARPU_ATTRIBUTE_INTERNAL;
 starpu_pthread_key_t _starpu_worker_key STARPU_ATTRIBUTE_INTERNAL;
@@ -388,7 +389,7 @@ int starpu_worker_can_execute_task_first_impl(unsigned workerid, struct starpu_t
 	{
 	{
 		for (i = 0; i < STARPU_MAXIMPLEMENTATIONS; i++)
 		for (i = 0; i < STARPU_MAXIMPLEMENTATIONS; i++)
 			if (_starpu_can_use_nth_implementation(arch, cl, i)
 			if (_starpu_can_use_nth_implementation(arch, cl, i)
-			 && task->cl->can_execute(workerid, task, i))
+			 && (task->cl->can_execute(workerid, task, i)))
 			{
 			{
 				if (nimpl)
 				if (nimpl)
 					*nimpl = i;
 					*nimpl = i;
@@ -643,6 +644,10 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 #endif
 #endif
 	STARPU_AYU_INIT();
 	STARPU_AYU_INIT();
 
 
+#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
+	unsigned th_per_stream = starpu_get_env_number_default("STARPU_CUDA_THREAD_PER_WORKER", 0);
+#endif
+
 	for (worker = 0; worker < nworkers; worker++)
 	for (worker = 0; worker < nworkers; worker++)
 	{
 	{
 		struct _starpu_worker *workerarg = &pconfig->workers[worker];
 		struct _starpu_worker *workerarg = &pconfig->workers[worker];
@@ -693,21 +698,24 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 			case STARPU_CUDA_WORKER:
 			case STARPU_CUDA_WORKER:
 				driver.id.cuda_id = devid;
 				driver.id.cuda_id = devid;
 
 
-				/* We spawn only one thread per CUDA driver,
-				 * which will control all CUDA workers of this
-				 * driver. (by using a worker set). */
 				if (worker_set->workers != workerarg)
 				if (worker_set->workers != workerarg)
+					/* We are not the first worker of the
+					 * set, don't start a thread for it. */
 					break;
 					break;
 
 
-				worker_set->nworkers = starpu_get_env_number_default("STARPU_NWORKER_PER_CUDA", 1);
-
-#ifndef STARPU_NON_BLOCKING_DRIVERS
-				if (worker_set->nworkers > 1)
+				if(th_per_stream == 0)
 				{
 				{
-					_STARPU_DISP("Warning: reducing STARPU_NWORKER_PER_CUDA to 1 because blocking drivers are enabled\n");
-					worker_set->nworkers = 1;
-				}
+					worker_set->nworkers = starpu_get_env_number_default("STARPU_NWORKER_PER_CUDA", 1);
+#ifndef STARPU_NON_BLOCKING_DRIVERS
+					if (worker_set->nworkers > 1)
+					{
+						_STARPU_DISP("Warning: reducing STARPU_NWORKER_PER_CUDA to 1 because blocking drivers are enabled\n");
+						worker_set->nworkers = 1;
+					}
 #endif
 #endif
+				}
+				else
+					worker_set->nworkers = 1;
 
 
 				worker_set->set_is_initialized = 0;
 				worker_set->set_is_initialized = 0;
 
 
@@ -717,6 +725,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 					break;
 					break;
 				}
 				}
 
 
+
 				STARPU_PTHREAD_CREATE_ON(
 				STARPU_PTHREAD_CREATE_ON(
 					workerarg->name,
 					workerarg->name,
 					&worker_set->worker_thread,
 					&worker_set->worker_thread,
@@ -1372,11 +1381,14 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 	if (!is_a_sink)
 	if (!is_a_sink)
 	{
 	{
 		struct starpu_sched_policy *selected_policy = _starpu_select_sched_policy(&_starpu_config, _starpu_config.conf.sched_policy_name);
 		struct starpu_sched_policy *selected_policy = _starpu_select_sched_policy(&_starpu_config, _starpu_config.conf.sched_policy_name);
-		_starpu_create_sched_ctx(selected_policy, NULL, -1, 1, "init", (_starpu_config.conf.global_sched_ctx_min_priority != -1), _starpu_config.conf.global_sched_ctx_min_priority, (_starpu_config.conf.global_sched_ctx_min_priority != -1), _starpu_config.conf.global_sched_ctx_max_priority, 1, _starpu_config.conf.sched_policy_init, NULL);
+		_starpu_create_sched_ctx(selected_policy, NULL, -1, 1, "init", (_starpu_config.conf.global_sched_ctx_min_priority != -1), _starpu_config.conf.global_sched_ctx_min_priority, (_starpu_config.conf.global_sched_ctx_min_priority != -1), _starpu_config.conf.global_sched_ctx_max_priority, 1, _starpu_config.conf.sched_policy_init, NULL,  0, NULL, 0);
 	}
 	}
 
 
 	_starpu_initialize_registered_performance_models();
 	_starpu_initialize_registered_performance_models();
 
 
+#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
+	_starpu_cuda_init();
+#endif
 	/* Launch "basic" workers (ie. non-combined workers) */
 	/* Launch "basic" workers (ie. non-combined workers) */
 	if (!is_a_sink)
 	if (!is_a_sink)
 		_starpu_launch_drivers(&_starpu_config);
 		_starpu_launch_drivers(&_starpu_config);
@@ -1434,7 +1446,7 @@ static void _starpu_terminate_workers(struct _starpu_machine_config *pconfig)
 
 
 		/* in case StarPU termination code is called from a callback,
 		/* in case StarPU termination code is called from a callback,
  		 * we have to check if pthread_self() is the worker itself */
  		 * we have to check if pthread_self() is the worker itself */
-		if (set)
+		if (set && set->nworkers > 0)
 		{
 		{
 			if (set->started)
 			if (set->started)
 			{
 			{
@@ -1442,7 +1454,9 @@ static void _starpu_terminate_workers(struct _starpu_machine_config *pconfig)
 				status = starpu_pthread_join(set->worker_thread, NULL);
 				status = starpu_pthread_join(set->worker_thread, NULL);
 #else
 #else
 				if (!pthread_equal(pthread_self(), set->worker_thread))
 				if (!pthread_equal(pthread_self(), set->worker_thread))
+				{
 					status = starpu_pthread_join(set->worker_thread, NULL);
 					status = starpu_pthread_join(set->worker_thread, NULL);
+				}
 #endif
 #endif
 				if (status)
 				if (status)
 				{
 				{
@@ -1927,11 +1941,11 @@ enum starpu_worker_archtype starpu_worker_get_type(int id)
 	return _starpu_config.workers[id].arch;
 	return _starpu_config.workers[id].arch;
 }
 }
 
 
-int starpu_worker_get_ids_by_type(enum starpu_worker_archtype type, int *workerids, int maxsize)
+unsigned starpu_worker_get_ids_by_type(enum starpu_worker_archtype type, int *workerids, unsigned maxsize)
 {
 {
 	unsigned nworkers = starpu_worker_get_count();
 	unsigned nworkers = starpu_worker_get_count();
 
 
-	int cnt = 0;
+	unsigned cnt = 0;
 
 
 	unsigned id;
 	unsigned id;
 	for (id = 0; id < nworkers; id++)
 	for (id = 0; id < nworkers; id++)
@@ -1983,6 +1997,48 @@ int starpu_worker_get_by_devid(enum starpu_worker_archtype type, int devid)
 	return -1;
 	return -1;
 }
 }
 
 
+int starpu_worker_get_devids(enum starpu_worker_archtype type, int *devids, int num)
+{
+	int cnt = 0;
+	unsigned nworkers = starpu_worker_get_count();
+	int workerids[nworkers];
+
+	unsigned ndevice_workers = starpu_worker_get_ids_by_type(type, workerids, nworkers);
+
+	unsigned ndevids = 0;
+
+	if(ndevice_workers > 0)
+	{
+		unsigned id, devid;
+		int curr_devid = -1;
+		unsigned found = 0;
+		for(id = 0; id < ndevice_workers; id++)
+		{
+			curr_devid = _starpu_config.workers[workerids[id]].devid;
+			for(devid = 0; devid < ndevids; devid++)
+			{
+				if(curr_devid == devids[devid])
+				{
+					found = 1;
+					break;
+				}
+			}
+			if(!found)
+			{
+				devids[ndevids++] = curr_devid;
+				cnt++;
+			}
+			else
+				found = 0;
+
+			if(cnt == num)
+				break;
+		}
+	}
+
+	return ndevids;
+}
+
 void starpu_worker_get_name(int id, char *dst, size_t maxlen)
 void starpu_worker_get_name(int id, char *dst, size_t maxlen)
 {
 {
 	char *name = _starpu_config.workers[id].name;
 	char *name = _starpu_config.workers[id].name;
@@ -2003,6 +2059,19 @@ int starpu_bindid_get_workerids(int bindid, int **workerids)
 	return _starpu_config.bindid_workers[bindid].nworkers;
 	return _starpu_config.bindid_workers[bindid].nworkers;
 }
 }
 
 
+int starpu_worker_get_stream_workerids(unsigned devid, int *workerids, enum starpu_worker_archtype type)
+{
+	unsigned nworkers = starpu_worker_get_count();
+	int nw = 0;
+	unsigned id;
+	for (id = 0; id < nworkers; id++)
+	{
+		if (_starpu_config.workers[id].devid == devid && _starpu_config.workers[id].arch == type)
+			workerids[nw++] = id;
+	}
+	return nw;
+}
+
 void starpu_worker_get_sched_condition(int workerid, starpu_pthread_mutex_t **sched_mutex, starpu_pthread_cond_t **sched_cond)
 void starpu_worker_get_sched_condition(int workerid, starpu_pthread_mutex_t **sched_mutex, starpu_pthread_cond_t **sched_cond)
 {
 {
 	*sched_cond = &_starpu_config.workers[workerid].sched_cond;
 	*sched_cond = &_starpu_config.workers[workerid].sched_cond;
@@ -2284,3 +2353,28 @@ char *starpu_worker_get_type_as_string(enum starpu_worker_archtype type)
 	if (type == STARPU_ANY_WORKER) return "STARPU_ANY_WORKER";
 	if (type == STARPU_ANY_WORKER) return "STARPU_ANY_WORKER";
 	return "STARPU_unknown_WORKER";
 	return "STARPU_unknown_WORKER";
 }
 }
+
+void _starpu_worker_set_stream_ctx(unsigned workerid, struct _starpu_sched_ctx *sched_ctx)
+{
+	STARPU_ASSERT(workerid < starpu_worker_get_count());
+        struct _starpu_worker *w = _starpu_get_worker_struct(workerid);
+        w->stream_ctx = sched_ctx;
+}
+
+struct _starpu_sched_ctx* _starpu_worker_get_ctx_stream(unsigned stream_workerid)
+{
+	if (stream_workerid >= starpu_worker_get_count())
+		return NULL;
+        struct _starpu_worker *w = _starpu_get_worker_struct(stream_workerid);
+        return w->stream_ctx;
+}
+
+unsigned starpu_worker_get_sched_ctx_id_stream(unsigned stream_workerid)
+{
+	if (stream_workerid >= starpu_worker_get_count())
+		return STARPU_NMAX_SCHED_CTXS;
+        struct _starpu_worker *w = _starpu_get_worker_struct(stream_workerid);
+	return w->stream_ctx != NULL ? w->stream_ctx->id : STARPU_NMAX_SCHED_CTXS;
+}
+
+

+ 9 - 0
src/core/workers.h

@@ -3,6 +3,7 @@
  * Copyright (C) 2009-2016  Université de Bordeaux
  * Copyright (C) 2009-2016  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
  * Copyright (C) 2011, 2016  INRIA
  * Copyright (C) 2011, 2016  INRIA
+ * Copyright (C) 2016  Uppsala University
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -58,6 +59,8 @@
 
 
 #define STARPU_MAX_PIPELINE 4
 #define STARPU_MAX_PIPELINE 4
 
 
+enum initialization { UNINITIALIZED = 0, CHANGING, INITIALIZED };
+
 /* This is initialized from in _starpu_worker_init */
 /* This is initialized from in _starpu_worker_init */
 LIST_TYPE(_starpu_worker,
 LIST_TYPE(_starpu_worker,
 	struct _starpu_machine_config *config;
 	struct _starpu_machine_config *config;
@@ -136,6 +139,8 @@ LIST_TYPE(_starpu_worker,
 	/* bool to indicate if the worker is slave in a ctx */
 	/* bool to indicate if the worker is slave in a ctx */
 	unsigned is_slave_somewhere;
 	unsigned is_slave_somewhere;
 
 
+	struct _starpu_sched_ctx *stream_ctx;
+
 #ifdef __GLIBC__
 #ifdef __GLIBC__
 	cpu_set_t cpu_set;
 	cpu_set_t cpu_set;
 #endif /* __GLIBC__ */
 #endif /* __GLIBC__ */
@@ -602,4 +607,8 @@ static inline unsigned __starpu_worker_get_id_check(const char *f, int l)
 }
 }
 #define _starpu_worker_get_id_check(f,l) __starpu_worker_get_id_check(f,l)
 #define _starpu_worker_get_id_check(f,l) __starpu_worker_get_id_check(f,l)
 
 
+void _starpu_worker_set_stream_ctx(unsigned workerid, struct _starpu_sched_ctx *sched_ctx);
+
+struct _starpu_sched_ctx* _starpu_worker_get_ctx_stream(unsigned stream_workerid);
+
 #endif // __WORKERS_H__
 #endif // __WORKERS_H__

+ 1 - 1
src/datawizard/coherency.c

@@ -740,7 +740,7 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 		/* Take references which will be released by _starpu_release_data_on_node */
 		/* Take references which will be released by _starpu_release_data_on_node */
 		if (dst_replicate)
 		if (dst_replicate)
 			dst_replicate->refcnt++;
 			dst_replicate->refcnt++;
-		else if (node == STARPU_ACQUIRE_ALL_NODES)
+		else if (node == STARPU_ACQUIRE_NO_NODE_LOCK_ALL)
 		{
 		{
 			int i;
 			int i;
 			for (i = 0; i < STARPU_MAXNODES; i++)
 			for (i = 0; i < STARPU_MAXNODES; i++)

+ 7 - 0
src/datawizard/filters.c

@@ -195,6 +195,8 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 		STARPU_ASSERT(!ret);
 		STARPU_ASSERT(!ret);
 	}
 	}
 
 
+	_starpu_data_unregister_ram_pointer(initial_handle);
+
 	for (i = 0; i < nparts; i++)
 	for (i = 0; i < nparts; i++)
 	{
 	{
 		starpu_data_handle_t child;
 		starpu_data_handle_t child;
@@ -341,6 +343,7 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 	unsigned nworkers = starpu_worker_get_count();
 	unsigned nworkers = starpu_worker_get_count();
 	unsigned node;
 	unsigned node;
 	unsigned sizes[root_handle->nchildren];
 	unsigned sizes[root_handle->nchildren];
+	void *ptr;
 
 
 	_STARPU_TRACE_START_UNPARTITION(root_handle, gathering_node);
 	_STARPU_TRACE_START_UNPARTITION(root_handle, gathering_node);
 	_starpu_spin_lock(&root_handle->header_lock);
 	_starpu_spin_lock(&root_handle->header_lock);
@@ -428,6 +431,10 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 		_starpu_memory_stats_free(child_handle);
 		_starpu_memory_stats_free(child_handle);
 	}
 	}
 
 
+	ptr = starpu_data_handle_to_pointer(root_handle, STARPU_MAIN_RAM);
+	if (ptr != NULL)
+		_starpu_data_register_ram_pointer(root_handle, ptr);
+
 	/* the gathering_node should now have a valid copy of all the children.
 	/* the gathering_node should now have a valid copy of all the children.
 	 * For all nodes, if the node had all copies and none was locally
 	 * For all nodes, if the node had all copies and none was locally
 	 * allocated then the data is still valid there, else, it's invalidated
 	 * allocated then the data is still valid there, else, it's invalidated

+ 30 - 14
src/datawizard/interfaces/data_interface.c

@@ -147,7 +147,7 @@ struct starpu_data_interface_ops *_starpu_data_interface_get_ops(unsigned interf
  * some handle, the new mapping shadows the previous one.   */
  * some handle, the new mapping shadows the previous one.   */
 void _starpu_data_register_ram_pointer(starpu_data_handle_t handle, void *ptr)
 void _starpu_data_register_ram_pointer(starpu_data_handle_t handle, void *ptr)
 {
 {
-	struct handle_entry *entry;
+	struct handle_entry *entry, *old_entry;
 
 
 	_STARPU_MALLOC(entry, sizeof(*entry));
 	_STARPU_MALLOC(entry, sizeof(*entry));
 
 
@@ -174,11 +174,19 @@ void _starpu_data_register_ram_pointer(starpu_data_handle_t handle, void *ptr)
 #endif
 #endif
 	{
 	{
 		_starpu_spin_lock(&registered_handles_lock);
 		_starpu_spin_lock(&registered_handles_lock);
-		nregistered++;
-		if (nregistered > maxnregistered)
-			maxnregistered = nregistered;
-		HASH_ADD_PTR(registered_handles, pointer, entry);
-		_starpu_spin_unlock(&registered_handles_lock);
+		HASH_FIND_PTR(registered_handles, &ptr, old_entry);
+		if (old_entry) {
+			/* Already registered this pointer, avoid undefined
+			 * behavior of duplicate in hash table */
+			_starpu_spin_unlock(&registered_handles_lock);
+			free(entry);
+		} else {
+			nregistered++;
+			if (nregistered > maxnregistered)
+				maxnregistered = nregistered;
+			HASH_ADD_PTR(registered_handles, pointer, entry);
+			_starpu_spin_unlock(&registered_handles_lock);
+		}
 	}
 	}
 }
 }
 
 
@@ -544,9 +552,17 @@ void _starpu_data_unregister_ram_pointer(starpu_data_handle_t handle)
 
 
 			_starpu_spin_lock(&registered_handles_lock);
 			_starpu_spin_lock(&registered_handles_lock);
 			HASH_FIND_PTR(registered_handles, &ram_ptr, entry);
 			HASH_FIND_PTR(registered_handles, &ram_ptr, entry);
-			STARPU_ASSERT(entry != NULL);
-			nregistered--;
-			HASH_DEL(registered_handles, entry);
+			if (entry)
+			{
+				if (entry->handle == handle)
+				{
+					nregistered--;
+					HASH_DEL(registered_handles, entry);
+				}
+				else
+					/* don't free it, it's not ours */
+					entry = NULL;
+			}
 			_starpu_spin_unlock(&registered_handles_lock);
 			_starpu_spin_unlock(&registered_handles_lock);
 		}
 		}
 		free(entry);
 		free(entry);
@@ -908,7 +924,7 @@ static void _starpu_data_unregister_submit_cb(void *arg)
 	STARPU_ASSERT(handle->busy_count);
 	STARPU_ASSERT(handle->busy_count);
         _starpu_spin_unlock(&handle->header_lock);
         _starpu_spin_unlock(&handle->header_lock);
 
 
-	starpu_data_release_on_node(handle, STARPU_ACQUIRE_ALL_NODES);
+	starpu_data_release_on_node(handle, STARPU_ACQUIRE_NO_NODE_LOCK_ALL);
 }
 }
 
 
 void starpu_data_unregister_submit(starpu_data_handle_t handle)
 void starpu_data_unregister_submit(starpu_data_handle_t handle)
@@ -922,7 +938,7 @@ void starpu_data_unregister_submit(starpu_data_handle_t handle)
 	}
 	}
 
 
 	/* Wait for all task dependencies on this handle before putting it for free */
 	/* Wait for all task dependencies on this handle before putting it for free */
-	starpu_data_acquire_on_node_cb(handle, STARPU_ACQUIRE_ALL_NODES, STARPU_RW, _starpu_data_unregister_submit_cb, handle);
+	starpu_data_acquire_on_node_cb(handle, STARPU_ACQUIRE_NO_NODE_LOCK_ALL, STARPU_RW, _starpu_data_unregister_submit_cb, handle);
 }
 }
 
 
 static void _starpu_data_invalidate(void *data)
 static void _starpu_data_invalidate(void *data)
@@ -980,14 +996,14 @@ static void _starpu_data_invalidate(void *data)
 
 
 	_starpu_spin_unlock(&handle->header_lock);
 	_starpu_spin_unlock(&handle->header_lock);
 
 
-	starpu_data_release_on_node(handle, STARPU_ACQUIRE_ALL_NODES);
+	starpu_data_release_on_node(handle, STARPU_ACQUIRE_NO_NODE_LOCK_ALL);
 }
 }
 
 
 void starpu_data_invalidate(starpu_data_handle_t handle)
 void starpu_data_invalidate(starpu_data_handle_t handle)
 {
 {
 	STARPU_ASSERT(handle);
 	STARPU_ASSERT(handle);
 
 
-	starpu_data_acquire_on_node(handle, STARPU_ACQUIRE_ALL_NODES, STARPU_W);
+	starpu_data_acquire_on_node(handle, STARPU_ACQUIRE_NO_NODE_LOCK_ALL, STARPU_W);
 
 
 	_starpu_data_invalidate(handle);
 	_starpu_data_invalidate(handle);
 
 
@@ -998,7 +1014,7 @@ void starpu_data_invalidate_submit(starpu_data_handle_t handle)
 {
 {
 	STARPU_ASSERT(handle);
 	STARPU_ASSERT(handle);
 
 
-	starpu_data_acquire_on_node_cb(handle, STARPU_ACQUIRE_ALL_NODES, STARPU_W, _starpu_data_invalidate, handle);
+	starpu_data_acquire_on_node_cb(handle, STARPU_ACQUIRE_NO_NODE_LOCK_ALL, STARPU_W, _starpu_data_invalidate, handle);
 
 
 	handle->initialized = 0;
 	handle->initialized = 0;
 }
 }

+ 2 - 1
src/datawizard/malloc.c

@@ -137,7 +137,8 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)
 			starpu_memory_allocate(STARPU_MAIN_RAM, dim, flags | STARPU_MEMORY_OVERFLOW);
 			starpu_memory_allocate(STARPU_MAIN_RAM, dim, flags | STARPU_MEMORY_OVERFLOW);
 	}
 	}
 
 
-	if (flags & STARPU_MALLOC_PINNED && disable_pinning <= 0 && STARPU_RUNNING_ON_VALGRIND == 0)
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+	if (flags & STARPU_MALLOC_PINNED && disable_pinning <= 0 && STARPU_RUNNING_ON_VALGRIND == 0 && config->conf.ncuda != 0)
 	{
 	{
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
 		/* FIXME: CUDA seems to be taking 650µs every 1MiB.
 		/* FIXME: CUDA seems to be taking 650µs every 1MiB.

+ 3 - 3
src/datawizard/user_interactions.c

@@ -344,7 +344,7 @@ void starpu_data_release_on_node(starpu_data_handle_t handle, int node)
 	else
 	else
 	{
 	{
 		_starpu_spin_lock(&handle->header_lock);
 		_starpu_spin_lock(&handle->header_lock);
-		if (node == STARPU_ACQUIRE_ALL_NODES)
+		if (node == STARPU_ACQUIRE_NO_NODE_LOCK_ALL)
 		{
 		{
 			int i;
 			int i;
 			for (i = 0; i < STARPU_MAXNODES; i++)
 			for (i = 0; i < STARPU_MAXNODES; i++)
@@ -499,14 +499,14 @@ static void _starpu_data_wont_use(void *data)
 		}
 		}
 	}
 	}
 	_starpu_spin_unlock(&handle->header_lock);
 	_starpu_spin_unlock(&handle->header_lock);
-	starpu_data_release_on_node(handle, STARPU_ACQUIRE_ALL_NODES);
+	starpu_data_release_on_node(handle, STARPU_ACQUIRE_NO_NODE_LOCK_ALL);
 	if (handle->home_node != -1)
 	if (handle->home_node != -1)
 		starpu_data_idle_prefetch_on_node(handle, handle->home_node, 1);
 		starpu_data_idle_prefetch_on_node(handle, handle->home_node, 1);
 }
 }
 
 
 void starpu_data_wont_use(starpu_data_handle_t handle)
 void starpu_data_wont_use(starpu_data_handle_t handle)
 {
 {
-	starpu_data_acquire_on_node_cb(handle, STARPU_ACQUIRE_ALL_NODES, STARPU_R, _starpu_data_wont_use, handle);
+	starpu_data_acquire_on_node_cb(handle, STARPU_ACQUIRE_NO_NODE_LOCK_ALL, STARPU_R, _starpu_data_wont_use, handle);
 }
 }
 
 
 /*
 /*

+ 72 - 23
src/drivers/cuda/driver_cuda.c

@@ -4,6 +4,7 @@
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  Télécom-SudParis
+ * Copyright (C) 2016  Uppsala University
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -70,6 +71,26 @@ static starpu_pthread_mutex_t task_mutex[STARPU_NMAXWORKERS][STARPU_MAX_PIPELINE
 static starpu_pthread_cond_t task_cond[STARPU_NMAXWORKERS][STARPU_MAX_PIPELINE];
 static starpu_pthread_cond_t task_cond[STARPU_NMAXWORKERS][STARPU_MAX_PIPELINE];
 #endif /* STARPU_SIMGRID */
 #endif /* STARPU_SIMGRID */
 
 
+static enum initialization cuda_device_init[STARPU_MAXCUDADEVS];
+static int cuda_device_users[STARPU_MAXCUDADEVS];
+static starpu_pthread_mutex_t cuda_device_init_mutex[STARPU_MAXCUDADEVS];
+static starpu_pthread_cond_t cuda_device_init_cond[STARPU_MAXCUDADEVS];
+
+void _starpu_cuda_init(void)
+{
+	unsigned i;
+	for (i = 0; i < STARPU_MAXCUDADEVS; i++)
+	{
+		STARPU_PTHREAD_MUTEX_INIT(&cuda_device_init_mutex[i], NULL);
+		STARPU_PTHREAD_COND_INIT(&cuda_device_init_cond[i], NULL);
+	}
+}
+
+static size_t _starpu_cuda_get_global_mem_size(unsigned devid)
+{
+	return global_mem[devid];
+}
+
 void
 void
 _starpu_cuda_discover_devices (struct _starpu_machine_config *config)
 _starpu_cuda_discover_devices (struct _starpu_machine_config *config)
 {
 {
@@ -244,17 +265,34 @@ done:
 #endif
 #endif
 }
 }
 
 
-#ifndef STARPU_SIMGRID
-static void init_device_context(unsigned devid)
+static void init_device_context(unsigned devid, unsigned memnode)
 {
 {
 	unsigned i;
 	unsigned i;
 
 
+#ifndef STARPU_SIMGRID
 	cudaError_t cures;
 	cudaError_t cures;
 
 
 	/* TODO: cudaSetDeviceFlag(cudaDeviceMapHost) */
 	/* TODO: cudaSetDeviceFlag(cudaDeviceMapHost) */
 
 
 	starpu_cuda_set_device(devid);
 	starpu_cuda_set_device(devid);
+#endif /* !STARPU_SIMGRID */
 
 
+	STARPU_PTHREAD_MUTEX_LOCK(&cuda_device_init_mutex[devid]);
+	cuda_device_users[devid]++;
+	if (cuda_device_init[devid] == UNINITIALIZED)
+		/* Nobody started initialization yet, do it */
+		cuda_device_init[devid] = CHANGING;
+	else
+	{
+		/* Somebody else is doing initialization, wait for it */
+		while (cuda_device_init[devid] != INITIALIZED)
+			STARPU_PTHREAD_COND_WAIT(&cuda_device_init_cond[devid], &cuda_device_init_mutex[devid]);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_device_init_mutex[devid]);
+		return;
+	}
+	STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_device_init_mutex[devid]);
+
+#ifndef STARPU_SIMGRID
 #ifdef HAVE_CUDA_MEMCPY_PEER
 #ifdef HAVE_CUDA_MEMCPY_PEER
 	if (starpu_get_env_number("STARPU_ENABLE_CUDA_GPU_GPU_DIRECT") != 0)
 	if (starpu_get_env_number("STARPU_ENABLE_CUDA_GPU_GPU_DIRECT") != 0)
 	{
 	{
@@ -322,9 +360,17 @@ static void init_device_context(unsigned devid)
 		if (STARPU_UNLIKELY(cures))
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
 			STARPU_CUDA_REPORT_ERROR(cures);
 	}
 	}
-}
 #endif /* !STARPU_SIMGRID */
 #endif /* !STARPU_SIMGRID */
 
 
+	STARPU_PTHREAD_MUTEX_LOCK(&cuda_device_init_mutex[devid]);
+	cuda_device_init[devid] = INITIALIZED;
+	STARPU_PTHREAD_COND_BROADCAST(&cuda_device_init_cond[devid]);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_device_init_mutex[devid]);
+
+	_starpu_cuda_limit_gpu_mem_if_needed(devid);
+	_starpu_memory_manager_set_global_memory_size(memnode, _starpu_cuda_get_global_mem_size(devid));
+}
+
 static void init_worker_context(unsigned workerid)
 static void init_worker_context(unsigned workerid)
 {
 {
 	int j;
 	int j;
@@ -384,11 +430,6 @@ static void deinit_worker_context(unsigned workerid)
 #endif /* STARPU_SIMGRID */
 #endif /* STARPU_SIMGRID */
 }
 }
 
 
-static size_t _starpu_cuda_get_global_mem_size(unsigned devid)
-{
-	return global_mem[devid];
-}
-
 
 
 /* Return the number of devices usable in the system.
 /* Return the number of devices usable in the system.
  * The value returned cannot be greater than MAXCUDADEVS */
  * The value returned cannot be greater than MAXCUDADEVS */
@@ -597,9 +638,7 @@ int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 			/* Already initialized */
 			/* Already initialized */
 			continue;
 			continue;
 		lastdevid = devid;
 		lastdevid = devid;
-#ifndef STARPU_SIMGRID
-		init_device_context(devid);
-#endif
+		init_device_context(devid, memnode);
 
 
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
 		STARPU_ASSERT_MSG(worker_set->nworkers == 1, "Simgrid mode does not support concurrent kernel execution yet\n");
 		STARPU_ASSERT_MSG(worker_set->nworkers == 1, "Simgrid mode does not support concurrent kernel execution yet\n");
@@ -607,9 +646,6 @@ int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 		if (worker_set->nworkers > 1 && props[devid].concurrentKernels == 0)
 		if (worker_set->nworkers > 1 && props[devid].concurrentKernels == 0)
 			_STARPU_DISP("Warning: STARPU_NWORKER_PER_CUDA is %u, but the device does not support concurrent kernel execution!\n", worker_set->nworkers);
 			_STARPU_DISP("Warning: STARPU_NWORKER_PER_CUDA is %u, but the device does not support concurrent kernel execution!\n", worker_set->nworkers);
 #endif /* !STARPU_SIMGRID */
 #endif /* !STARPU_SIMGRID */
-
-		_starpu_cuda_limit_gpu_mem_if_needed(devid);
-		_starpu_memory_manager_set_global_memory_size(memnode, _starpu_cuda_get_global_mem_size(devid));
 	}
 	}
 
 
 	/* one more time to avoid hacks from third party lib :) */
 	/* one more time to avoid hacks from third party lib :) */
@@ -845,23 +881,36 @@ int _starpu_cuda_driver_deinit(struct _starpu_worker_set *worker_set)
 		struct _starpu_worker *worker = &worker_set->workers[i];
 		struct _starpu_worker *worker = &worker_set->workers[i];
 		unsigned devid = worker->devid;
 		unsigned devid = worker->devid;
 		unsigned memnode = worker->memory_node;
 		unsigned memnode = worker->memory_node;
+		unsigned usersleft;
 		if ((int) devid == lastdevid)
 		if ((int) devid == lastdevid)
 			/* Already initialized */
 			/* Already initialized */
 			continue;
 			continue;
 		lastdevid = devid;
 		lastdevid = devid;
 
 
-		_starpu_handle_all_pending_node_data_requests(memnode);
-
-		/* In case there remains some memory that was automatically
-		 * allocated by StarPU, we release it now. Note that data
-		 * coherency is not maintained anymore at that point ! */
-		_starpu_free_all_automatically_allocated_buffers(memnode);
-
-		_starpu_malloc_shutdown(memnode);
+		STARPU_PTHREAD_MUTEX_LOCK(&cuda_device_init_mutex[devid]);
+		usersleft = --cuda_device_users[devid];
+		STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_device_init_mutex[devid]);
+
+		if (!usersleft)
+                {
+			/* I'm last, deinitialize device */
+			_starpu_handle_all_pending_node_data_requests(memnode);
+			
+			/* In case there remains some memory that was automatically
+			 * allocated by StarPU, we release it now. Note that data
+			 * coherency is not maintained anymore at that point ! */
+			_starpu_free_all_automatically_allocated_buffers(memnode);
+			
+			_starpu_malloc_shutdown(memnode);
 
 
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
-		deinit_device_context(devid);
+			deinit_device_context(devid);
 #endif /* !STARPU_SIMGRID */
 #endif /* !STARPU_SIMGRID */
+                }
+		STARPU_PTHREAD_MUTEX_LOCK(&cuda_device_init_mutex[devid]);
+		cuda_device_init[devid] = UNINITIALIZED;
+		STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_device_init_mutex[devid]);
+
 	}
 	}
 
 
 	for (i = 0; i < worker_set->nworkers; i++)
 	for (i = 0; i < worker_set->nworkers; i++)

+ 1 - 0
src/drivers/cuda/driver_cuda.h

@@ -34,6 +34,7 @@
 
 
 #include <common/fxt.h>
 #include <common/fxt.h>
 
 
+void _starpu_cuda_init(void);
 unsigned _starpu_get_cuda_device_count(void);
 unsigned _starpu_get_cuda_device_count(void);
 extern int _starpu_cuda_bus_ids[STARPU_MAXCUDADEVS+1][STARPU_MAXCUDADEVS+1];
 extern int _starpu_cuda_bus_ids[STARPU_MAXCUDADEVS+1][STARPU_MAXCUDADEVS+1];
 
 

+ 2 - 3
src/drivers/gordon/driver_gordon.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009-2015  Université de Bordeaux
  * Copyright (C) 2009-2015  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2013  CNRS
+ * Copyright (C) 2010, 2011, 2013, 2016  CNRS
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  Télécom-SudParis
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -352,7 +352,6 @@ void *gordon_worker_inject(struct _starpu_worker_set *arg)
 		else
 		else
 		{
 		{
 #ifndef NOCHAIN
 #ifndef NOCHAIN
-			int ret = 0;
 #ifdef STARPU_DEVEL
 #ifdef STARPU_DEVEL
 #warning we should look into the local job list here !
 #warning we should look into the local job list here !
 #endif
 #endif
@@ -401,7 +400,7 @@ void *gordon_worker_inject(struct _starpu_worker_set *arg)
 						chunk_list = list;
 						chunk_list = list;
 					}
 					}
 
 
-					ret = inject_task_list(chunk_list, &arg->workers[0]);
+					inject_task_list(chunk_list, &arg->workers[0]);
 				}
 				}
 			}
 			}
 			else
 			else

+ 2 - 4
src/sched_policies/component_worker.c

@@ -841,14 +841,13 @@ int starpu_sched_component_worker_get_workerid(struct starpu_sched_component * w
 	return starpu_bitmap_first(worker_component->workers);
 	return starpu_bitmap_first(worker_component->workers);
 }
 }
 
 
-void starpu_sched_component_worker_pre_exec_hook(struct starpu_task * task)
+void starpu_sched_component_worker_pre_exec_hook(struct starpu_task * task, unsigned sched_ctx_id STARPU_ATTRIBUTE_UNUSED)
 {
 {
 	double model = task->predicted;
 	double model = task->predicted;
 	double transfer_model = task->predicted_transfer;
 	double transfer_model = task->predicted_transfer;
 
 
 	if(!isnan(task->predicted) || !isnan(task->predicted_transfer))
 	if(!isnan(task->predicted) || !isnan(task->predicted_transfer))
 	{
 	{
-		unsigned sched_ctx_id = task->sched_ctx;
 		struct _starpu_worker_task_list * list = _worker_get_list(sched_ctx_id);
 		struct _starpu_worker_task_list * list = _worker_get_list(sched_ctx_id);
 		STARPU_PTHREAD_MUTEX_LOCK(&list->mutex);
 		STARPU_PTHREAD_MUTEX_LOCK(&list->mutex);
 
 
@@ -875,11 +874,10 @@ void starpu_sched_component_worker_pre_exec_hook(struct starpu_task * task)
 	}
 	}
 }
 }
 
 
-void starpu_sched_component_worker_post_exec_hook(struct starpu_task * task)
+void starpu_sched_component_worker_post_exec_hook(struct starpu_task * task, unsigned sched_ctx_id STARPU_ATTRIBUTE_UNUSED)
 {
 {
 	if(task->execute_on_a_specific_worker)
 	if(task->execute_on_a_specific_worker)
 		return;
 		return;
-	unsigned sched_ctx_id = task->sched_ctx;
 	struct _starpu_worker_task_list * list = _worker_get_list(sched_ctx_id);
 	struct _starpu_worker_task_list * list = _worker_get_list(sched_ctx_id);
 	STARPU_PTHREAD_MUTEX_LOCK(&list->mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&list->mutex);
 	list->exp_start = starpu_timing_now();
 	list->exp_start = starpu_timing_now();

+ 87 - 48
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -4,6 +4,7 @@
  * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016  CNRS
  * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016  CNRS
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011-2012, 2016  INRIA
  * Copyright (C) 2011-2012, 2016  INRIA
+ * Copyright (C) 2016  Uppsala University
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -124,6 +125,63 @@ static int _normalize_prio(int priority, int num_priorities, unsigned sched_ctx_
 	return ((num_priorities-1)/(max-min)) * (priority - min);
 	return ((num_priorities-1)/(max-min)) * (priority - min);
 }
 }
 
 
+/* This is called when a transfer request is actually pushed to the worker */
+static void _starpu_fifo_task_transfer_started(struct _starpu_fifo_taskq *fifo, struct starpu_task *task, int num_priorities)
+{
+	double transfer_model = task->predicted_transfer;
+	if (isnan(transfer_model))
+		return;
+
+	/* We now start the transfer, move it from predicted to pipelined */
+	fifo->exp_len -= transfer_model;
+	fifo->pipeline_len += transfer_model;
+	fifo->exp_start = starpu_timing_now() + fifo->pipeline_len;
+	fifo->exp_end = fifo->exp_start + fifo->exp_len;
+	if(num_priorities != -1)
+	{
+		int i;
+		int task_prio = _normalize_prio(task->priority, num_priorities, task->sched_ctx);
+		for(i = 0; i <= task_prio; i++)
+			fifo->exp_len_per_priority[i] -= transfer_model;
+	}
+}
+
+/* This is called when a task is actually pushed to the worker (i.e. the transfer finished */
+static void _starpu_fifo_task_started(struct _starpu_fifo_taskq *fifo, struct starpu_task *task, int num_priorities)
+{
+	double model = task->predicted;
+	double transfer_model = task->predicted_transfer;
+	if(!isnan(transfer_model))
+		/* The transfer is over, remove it from pipelined */
+		fifo->pipeline_len -= transfer_model;
+
+	if(!isnan(model))
+	{
+		/* We now start the computation, move it from predicted to pipelined */
+		fifo->exp_len -= model;
+		fifo->pipeline_len += model;
+		fifo->exp_start = starpu_timing_now() + fifo->pipeline_len;
+                fifo->exp_end= fifo->exp_start + fifo->exp_len;
+		if(num_priorities != -1)
+		{
+			int i;
+			int task_prio = _normalize_prio(task->priority, num_priorities, task->sched_ctx);
+			for(i = 0; i <= task_prio; i++)
+				fifo->exp_len_per_priority[i] -= model;
+		}
+	}
+}
+
+/* This is called when a task is actually finished */
+static void _starpu_fifo_task_finished(struct _starpu_fifo_taskq *fifo, struct starpu_task *task, int num_priorities STARPU_ATTRIBUTE_UNUSED)
+{
+	if(!isnan(task->predicted))
+		/* The execution is over, remove it from pipelined */
+		fifo->pipeline_len -= task->predicted;
+}
+
+
+
 static struct starpu_task *_starpu_fifo_pop_first_ready_task(struct _starpu_fifo_taskq *fifo_queue, unsigned node, int num_priorities)
 static struct starpu_task *_starpu_fifo_pop_first_ready_task(struct _starpu_fifo_taskq *fifo_queue, unsigned node, int num_priorities)
 {
 {
 	struct starpu_task *task = NULL, *current;
 	struct starpu_task *task = NULL, *current;
@@ -196,6 +254,8 @@ static struct starpu_task *dmda_pop_ready_task(unsigned sched_ctx_id)
 	task = _starpu_fifo_pop_first_ready_task(fifo, node, dt->num_priorities);
 	task = _starpu_fifo_pop_first_ready_task(fifo, node, dt->num_priorities);
 	if (task)
 	if (task)
 	{
 	{
+		_starpu_fifo_task_transfer_started(fifo, task, dt->num_priorities);
+
 		starpu_sched_ctx_list_task_counters_decrement(sched_ctx_id, workerid);
 		starpu_sched_ctx_list_task_counters_decrement(sched_ctx_id, workerid);
 
 
 #ifdef STARPU_VERBOSE
 #ifdef STARPU_VERBOSE
@@ -230,8 +290,10 @@ static struct starpu_task *dmda_pop_task(unsigned sched_ctx_id)
 	task = _starpu_fifo_pop_local_task(fifo);
 	task = _starpu_fifo_pop_local_task(fifo);
 	if (task)
 	if (task)
 	{
 	{
-		starpu_sched_ctx_list_task_counters_decrement(sched_ctx_id, workerid);
+		_starpu_fifo_task_transfer_started(fifo, task, dt->num_priorities);
 
 
+		starpu_sched_ctx_list_task_counters_decrement(sched_ctx_id, workerid);
+		  
 #ifdef STARPU_VERBOSE
 #ifdef STARPU_VERBOSE
 		if (task->cl)
 		if (task->cl)
 		{
 		{
@@ -251,7 +313,7 @@ static struct starpu_task *dmda_pop_every_task(unsigned sched_ctx_id)
 {
 {
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 
 
-	struct starpu_task *new_list;
+	struct starpu_task *new_list, *task;
 
 
 	unsigned workerid = starpu_worker_get_id_check();
 	unsigned workerid = starpu_worker_get_id_check();
 	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
 	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
@@ -268,6 +330,9 @@ static struct starpu_task *dmda_pop_every_task(unsigned sched_ctx_id)
 
 
 	starpu_sched_ctx_list_task_counters_reset(sched_ctx_id, workerid);
 	starpu_sched_ctx_list_task_counters_reset(sched_ctx_id, workerid);
 
 
+	for (task = new_list; task; task = task->next)
+		_starpu_fifo_task_transfer_started(fifo, task, dt->num_priorities);
+
 	return new_list;
 	return new_list;
 }
 }
 
 
@@ -282,7 +347,7 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
 
         if(child_sched_ctx != STARPU_NMAX_SCHED_CTXS)
         if(child_sched_ctx != STARPU_NMAX_SCHED_CTXS)
         {
         {
-                starpu_sched_ctx_move_task_to_ctx(task, child_sched_ctx, 0);
+                starpu_sched_ctx_move_task_to_ctx(task, child_sched_ctx, 0, 1);
 		starpu_sched_ctx_revert_task_counters(sched_ctx_id, task->flops);
 		starpu_sched_ctx_revert_task_counters(sched_ctx_id, task->flops);
                 return 0;
                 return 0;
         }
         }
@@ -300,7 +365,7 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 	STARPU_PTHREAD_MUTEX_LOCK_SCHED(sched_mutex);
 	STARPU_PTHREAD_MUTEX_LOCK_SCHED(sched_mutex);
 
 
         /* Sometimes workers didn't take the tasks as early as we expected */
         /* Sometimes workers didn't take the tasks as early as we expected */
-	fifo->exp_start = isnan(fifo->exp_start) ? starpu_timing_now() : STARPU_MAX(fifo->exp_start, starpu_timing_now());
+	fifo->exp_start = isnan(fifo->exp_start) ? starpu_timing_now() + fifo->pipeline_len : STARPU_MAX(fifo->exp_start, starpu_timing_now());
 	fifo->exp_end = fifo->exp_start + fifo->exp_len;
 	fifo->exp_end = fifo->exp_start + fifo->exp_len;
 
 
 	if ((starpu_timing_now() + predicted_transfer) < fifo->exp_end)
 	if ((starpu_timing_now() + predicted_transfer) < fifo->exp_end)
@@ -362,6 +427,13 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 	}
 	}
 
 
 	STARPU_AYU_ADDTOTASKQUEUE(_starpu_get_job_associated_to_task(task)->job_id, best_workerid);
 	STARPU_AYU_ADDTOTASKQUEUE(_starpu_get_job_associated_to_task(task)->job_id, best_workerid);
+	unsigned stream_ctx_id = starpu_worker_get_sched_ctx_id_stream(best_workerid);
+	if(stream_ctx_id != STARPU_NMAX_SCHED_CTXS)
+	{
+		starpu_sched_ctx_move_task_to_ctx(task, stream_ctx_id, 0, 0);
+		starpu_sched_ctx_revert_task_counters(sched_ctx_id, task->flops);
+	}
+
 	int ret = 0;
 	int ret = 0;
 	if (prio)
 	if (prio)
 	{
 	{
@@ -434,7 +506,7 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker, sched_ctx_id);
 		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker, sched_ctx_id);
 
 
 		/* Sometimes workers didn't take the tasks as early as we expected */
 		/* Sometimes workers didn't take the tasks as early as we expected */
-		double exp_start = isnan(fifo->exp_start) ? starpu_timing_now() : STARPU_MAX(fifo->exp_start, starpu_timing_now());
+		double exp_start = isnan(fifo->exp_start) ? starpu_timing_now() + fifo->pipeline_len : STARPU_MAX(fifo->exp_start, starpu_timing_now());
 
 
 		if (!starpu_worker_can_execute_task_impl(worker, task, &impl_mask))
 		if (!starpu_worker_can_execute_task_impl(worker, task, &impl_mask))
 			continue;
 			continue;
@@ -583,7 +655,8 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 		unsigned memory_node = starpu_worker_get_memory_node(worker);
 		unsigned memory_node = starpu_worker_get_memory_node(worker);
 
 
 		/* Sometimes workers didn't take the tasks as early as we expected */
 		/* Sometimes workers didn't take the tasks as early as we expected */
-		double exp_start = isnan(fifo->exp_start) ? starpu_timing_now() : STARPU_MAX(fifo->exp_start, starpu_timing_now());
+		double exp_start = isnan(fifo->exp_start) ? starpu_timing_now() + fifo->pipeline_len : STARPU_MAX(fifo->exp_start, starpu_timing_now());
+
 		if (!starpu_worker_can_execute_task_impl(worker, task, &impl_mask))
 		if (!starpu_worker_can_execute_task_impl(worker, task, &impl_mask))
 			continue;
 			continue;
 
 
@@ -852,8 +925,6 @@ static double _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned
 	}
 	}
 	else
 	else
 	{
 	{
-//		double max_len = (max_exp_end - starpu_timing_now());
-		/* printf("%d: dmda max_exp_end %lf best_exp_end %lf max_len %lf \n", sched_ctx_id, max_exp_end/1000000.0, best_exp_end/1000000.0, max_len/1000000.0);	 */
 		return exp_end[best_in_ctx][selected_impl] ;
 		return exp_end[best_in_ctx][selected_impl] ;
 	}
 	}
 }
 }
@@ -1022,14 +1093,11 @@ static void deinitialize_dmda_policy(unsigned sched_ctx_id)
 /* dmda_pre_exec_hook is called right after the data transfer is done and right
 /* dmda_pre_exec_hook is called right after the data transfer is done and right
  * before the computation to begin, it is useful to update more precisely the
  * before the computation to begin, it is useful to update more precisely the
  * value of the expected start, end, length, etc... */
  * value of the expected start, end, length, etc... */
-static void dmda_pre_exec_hook(struct starpu_task *task)
+static void dmda_pre_exec_hook(struct starpu_task *task, unsigned sched_ctx_id)
 {
 {
-	unsigned sched_ctx_id = starpu_sched_ctx_get_ctx_for_task(task);
 	unsigned workerid = starpu_worker_get_id_check();
 	unsigned workerid = starpu_worker_get_id_check();
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
 	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
-	double model = task->predicted;
-	double transfer_model = task->predicted_transfer;
 
 
 	starpu_pthread_mutex_t *sched_mutex;
 	starpu_pthread_mutex_t *sched_mutex;
 	starpu_pthread_cond_t *sched_cond;
 	starpu_pthread_cond_t *sched_cond;
@@ -1039,40 +1107,11 @@ static void dmda_pre_exec_hook(struct starpu_task *task)
 	 * of work. */
 	 * of work. */
 	STARPU_PTHREAD_MUTEX_LOCK_SCHED(sched_mutex);
 	STARPU_PTHREAD_MUTEX_LOCK_SCHED(sched_mutex);
 
 
-	/* Take the opportunity to update start time */
-	fifo->exp_start = STARPU_MAX(starpu_timing_now(), fifo->exp_start);
+	_starpu_fifo_task_started(fifo, task, dt->num_priorities);
 
 
-	if(!isnan(transfer_model))
-	{
-		/* The transfer is over, get rid of it in the completion
-		 * prediction */
-		fifo->exp_len -= transfer_model;
-		if(dt->num_priorities != -1)
-		{
-			int i;
-			int task_prio = _normalize_prio(task->priority, dt->num_priorities, task->sched_ctx);
-			for(i = 0; i <= task_prio; i++)
-				fifo->exp_len_per_priority[i] -= transfer_model;
-		}
-
-	}
-
-	if(!isnan(model))
-	{
-		/* We now start the computation, get rid of it in the completion
-		 * prediction */
-		fifo->exp_len -= model;
-		fifo->exp_start += model;
-		if(dt->num_priorities != -1)
-		{
-			int i;
-			int task_prio = _normalize_prio(task->priority, dt->num_priorities, task->sched_ctx);
-			for(i = 0; i <= task_prio; i++)
-				fifo->exp_len_per_priority[i] -= model;
-		}
-	}
+	/* Take the opportunity to update start time */
+	fifo->exp_start = STARPU_MAX(starpu_timing_now() + fifo->pipeline_len, fifo->exp_start);
 
 
-	fifo->exp_end = fifo->exp_start + fifo->exp_len;
 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(sched_mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(sched_mutex);
 }
 }
 
 
@@ -1096,7 +1135,7 @@ static void dmda_push_task_notify(struct starpu_task *task, int workerid, int pe
 	/* Update the predictions */
 	/* Update the predictions */
 	STARPU_PTHREAD_MUTEX_LOCK_SCHED(sched_mutex);
 	STARPU_PTHREAD_MUTEX_LOCK_SCHED(sched_mutex);
 	/* Sometimes workers didn't take the tasks as early as we expected */
 	/* Sometimes workers didn't take the tasks as early as we expected */
-	fifo->exp_start = isnan(fifo->exp_start) ? starpu_timing_now() : STARPU_MAX(fifo->exp_start, starpu_timing_now());
+	fifo->exp_start = isnan(fifo->exp_start) ? starpu_timing_now() + fifo->pipeline_len : STARPU_MAX(fifo->exp_start, starpu_timing_now());
 	fifo->exp_end = fifo->exp_start + fifo->exp_len;
 	fifo->exp_end = fifo->exp_start + fifo->exp_len;
 
 
 	/* If there is no prediction available, we consider the task has a null length */
 	/* If there is no prediction available, we consider the task has a null length */
@@ -1155,9 +1194,8 @@ static void dmda_push_task_notify(struct starpu_task *task, int workerid, int pe
 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(sched_mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(sched_mutex);
 }
 }
 
 
-static void dmda_post_exec_hook(struct starpu_task * task)
+static void dmda_post_exec_hook(struct starpu_task * task, unsigned sched_ctx_id)
 {
 {
-	unsigned sched_ctx_id = starpu_sched_ctx_get_ctx_for_task(task);
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	unsigned workerid = starpu_worker_get_id_check();
 	unsigned workerid = starpu_worker_get_id_check();
 	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
 	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
@@ -1165,7 +1203,8 @@ static void dmda_post_exec_hook(struct starpu_task * task)
 	starpu_pthread_cond_t *sched_cond;
 	starpu_pthread_cond_t *sched_cond;
 	starpu_worker_get_sched_condition(workerid, &sched_mutex, &sched_cond);
 	starpu_worker_get_sched_condition(workerid, &sched_mutex, &sched_cond);
 	STARPU_PTHREAD_MUTEX_LOCK_SCHED(sched_mutex);
 	STARPU_PTHREAD_MUTEX_LOCK_SCHED(sched_mutex);
-	fifo->exp_start = starpu_timing_now();
+	_starpu_fifo_task_finished(fifo, task, dt->num_priorities);
+	fifo->exp_start = STARPU_MAX(starpu_timing_now() + fifo->pipeline_len, fifo->exp_start);
 	fifo->exp_end = fifo->exp_start + fifo->exp_len;
 	fifo->exp_end = fifo->exp_start + fifo->exp_len;
 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(sched_mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(sched_mutex);
 }
 }

+ 2 - 1
src/sched_policies/eager_central_policy.c

@@ -3,6 +3,7 @@
  * Copyright (C) 2010-2016  Université de Bordeaux
  * Copyright (C) 2010-2016  Université de Bordeaux
  * Copyright (C) 2010-2013, 2016  CNRS
  * Copyright (C) 2010-2013, 2016  CNRS
  * Copyright (C) 2011  INRIA
  * Copyright (C) 2011  INRIA
+ * Copyright (C) 2016  Uppsala University
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -182,7 +183,7 @@ static struct starpu_task *pop_task_eager_policy(unsigned sched_ctx_id)
 		unsigned child_sched_ctx = starpu_sched_ctx_worker_is_master_for_child_ctx(workerid, sched_ctx_id);
 		unsigned child_sched_ctx = starpu_sched_ctx_worker_is_master_for_child_ctx(workerid, sched_ctx_id);
 		if(child_sched_ctx != STARPU_NMAX_SCHED_CTXS)
 		if(child_sched_ctx != STARPU_NMAX_SCHED_CTXS)
 		{
 		{
-			starpu_sched_ctx_move_task_to_ctx(chosen_task, child_sched_ctx, 1);
+			starpu_sched_ctx_move_task_to_ctx(chosen_task, child_sched_ctx, 1, 1);
 			starpu_sched_ctx_revert_task_counters(sched_ctx_id, chosen_task->flops);
 			starpu_sched_ctx_revert_task_counters(sched_ctx_id, chosen_task->flops);
 			return NULL;
 			return NULL;
 		}
 		}

+ 2 - 1
src/sched_policies/eager_central_priority_policy.c

@@ -3,6 +3,7 @@
  * Copyright (C) 2010-2016  Université de Bordeaux
  * Copyright (C) 2010-2016  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016  CNRS
  * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016  CNRS
  * Copyright (C) 2011  INRIA
  * Copyright (C) 2011  INRIA
+ * Copyright (C) 2016  Uppsala University
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -289,7 +290,7 @@ static struct starpu_task *_starpu_priority_pop_task(unsigned sched_ctx_id)
                 unsigned child_sched_ctx = starpu_sched_ctx_worker_is_master_for_child_ctx(workerid, sched_ctx_id);
                 unsigned child_sched_ctx = starpu_sched_ctx_worker_is_master_for_child_ctx(workerid, sched_ctx_id);
 		if(child_sched_ctx != STARPU_NMAX_SCHED_CTXS)
 		if(child_sched_ctx != STARPU_NMAX_SCHED_CTXS)
 		{
 		{
-			starpu_sched_ctx_move_task_to_ctx(chosen_task, child_sched_ctx, 1);
+			starpu_sched_ctx_move_task_to_ctx(chosen_task, child_sched_ctx, 1, 1);
 			starpu_sched_ctx_revert_task_counters(sched_ctx_id, chosen_task->flops);
 			starpu_sched_ctx_revert_task_counters(sched_ctx_id, chosen_task->flops);
 			return NULL;
 			return NULL;
 		}
 		}

+ 2 - 0
src/sched_policies/fifo_queues.c

@@ -3,6 +3,7 @@
  * Copyright (C) 2010-2016  Université de Bordeaux
  * Copyright (C) 2010-2016  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2013, 2016  CNRS
  * Copyright (C) 2010, 2011, 2013, 2016  CNRS
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  Télécom-SudParis
+ * Copyright (C) 2016  Uppsala University
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -56,6 +57,7 @@ struct _starpu_fifo_taskq *_starpu_create_fifo(void)
 	fifo->exp_len = 0.0;
 	fifo->exp_len = 0.0;
 	fifo->exp_end = fifo->exp_start;
 	fifo->exp_end = fifo->exp_start;
 	fifo->exp_len_per_priority = NULL;
 	fifo->exp_len_per_priority = NULL;
+	fifo->pipeline_len = 0.0;
 
 
 	return fifo;
 	return fifo;
 }
 }

+ 2 - 0
src/sched_policies/fifo_queues.h

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010-2013, 2016  Université de Bordeaux
  * Copyright (C) 2010-2013, 2016  Université de Bordeaux
+ * Copyright (C) 2016  Uppsala University
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -42,6 +43,7 @@ struct _starpu_fifo_taskq
 	double exp_end; /* Expected end date of last task in the queue */
 	double exp_end; /* Expected end date of last task in the queue */
 	double exp_len; /* Expected duration of the set of tasks in the queue */
 	double exp_len; /* Expected duration of the set of tasks in the queue */
 	double *exp_len_per_priority; /* Expected duration of the set of tasks in the queue corresponding to each priority */
 	double *exp_len_per_priority; /* Expected duration of the set of tasks in the queue corresponding to each priority */
+	double pipeline_len; /* the expected duration of what is already pushed to the worker */
 };
 };
 
 
 struct _starpu_fifo_taskq*_starpu_create_fifo(void) STARPU_ATTRIBUTE_MALLOC;
 struct _starpu_fifo_taskq*_starpu_create_fifo(void) STARPU_ATTRIBUTE_MALLOC;

+ 2 - 1
src/sched_policies/heteroprio.c

@@ -2,6 +2,7 @@
  *
  *
  * Copyright (C) 2015  INRIA
  * Copyright (C) 2015  INRIA
  * Copyright (C) 2016  CNRS
  * Copyright (C) 2016  CNRS
+ * Copyright (C) 2016  Uppsala University
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -609,7 +610,7 @@ done:		;
 		unsigned child_sched_ctx = starpu_sched_ctx_worker_is_master_for_child_ctx(workerid, sched_ctx_id);
 		unsigned child_sched_ctx = starpu_sched_ctx_worker_is_master_for_child_ctx(workerid, sched_ctx_id);
 		if(child_sched_ctx != STARPU_NMAX_SCHED_CTXS)
 		if(child_sched_ctx != STARPU_NMAX_SCHED_CTXS)
 		{
 		{
-			starpu_sched_ctx_move_task_to_ctx(task, child_sched_ctx, 1);
+			starpu_sched_ctx_move_task_to_ctx(task, child_sched_ctx, 1, 1);
 			starpu_sched_ctx_revert_task_counters(sched_ctx_id, task->flops);
 			starpu_sched_ctx_revert_task_counters(sched_ctx_id, task->flops);
 			return NULL;
 			return NULL;
 		}
 		}

+ 2 - 2
src/sched_policies/parallel_heft.c

@@ -72,7 +72,7 @@ static int ntasks[STARPU_NMAXWORKERS];
   from the workers available to the program, and not to the context !!!!!!!!!!!!!!!!!!!!!!!
   from the workers available to the program, and not to the context !!!!!!!!!!!!!!!!!!!!!!!
 */
 */
 
 
-static void parallel_heft_pre_exec_hook(struct starpu_task *task)
+static void parallel_heft_pre_exec_hook(struct starpu_task *task, unsigned sched_ctx_id STARPU_ATTRIBUTE_UNUSED)
 {
 {
 	if (!task->cl || task->execute_on_a_specific_worker)
 	if (!task->cl || task->execute_on_a_specific_worker)
 		return;
 		return;
@@ -521,7 +521,7 @@ static void parallel_heft_add_workers(__attribute__((unused)) unsigned sched_ctx
 	_starpu_sched_find_worker_combinations(workerids, nworkers);
 	_starpu_sched_find_worker_combinations(workerids, nworkers);
 
 
 // start_unclear_part: not very clear where this is used
 // start_unclear_part: not very clear where this is used
-/* 	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config(); */
+/* 	struct _starpu_machine_config *config = _starpu_get_machine_config(); */
 /* 	ncombinedworkers = config->topology.ncombinedworkers; */
 /* 	ncombinedworkers = config->topology.ncombinedworkers; */
 
 
 /* 	/\* We pre-compute an array of all the perfmodel archs that are applicable *\/ */
 /* 	/\* We pre-compute an array of all the perfmodel archs that are applicable *\/ */

+ 3 - 1
src/sched_policies/work_stealing_policy.c

@@ -60,12 +60,14 @@
 #define MAX_LOCALITY 8
 #define MAX_LOCALITY 8
 
 
 /* Entry for queued_tasks_per_data: records that a queued task is accessing the data with locality flag */
 /* Entry for queued_tasks_per_data: records that a queued task is accessing the data with locality flag */
+#ifdef USE_LOCALITY_TASKS
 struct locality_entry
 struct locality_entry
 {
 {
 	UT_hash_handle hh;
 	UT_hash_handle hh;
 	starpu_data_handle_t data;
 	starpu_data_handle_t data;
 	struct starpu_task *task;
 	struct starpu_task *task;
 };
 };
+#endif
 
 
 struct _starpu_work_stealing_data_per_worker
 struct _starpu_work_stealing_data_per_worker
 {
 {
@@ -722,7 +724,7 @@ static int lws_select_victim(struct _starpu_work_stealing_data *ws, unsigned sch
 	{
 	{
 		int neighbor = ws->per_worker[workerid].proxlist[i];
 		int neighbor = ws->per_worker[workerid].proxlist[i];
 		int ntasks = ws->per_worker[neighbor].queue_array->ntasks;
 		int ntasks = ws->per_worker[neighbor].queue_array->ntasks;
-		if (ntasks && ws->per_worker[workerid].busy)
+		if (ntasks && ws->per_worker[neighbor].busy)
 			return neighbor;
 			return neighbor;
 	}
 	}
 	return -1;
 	return -1;

+ 1 - 1
src/util/fstarpu.c

@@ -521,7 +521,7 @@ int fstarpu_worker_get_count_by_type(intptr_t type)
 	return starpu_worker_get_count_by_type((enum starpu_worker_archtype)type);
 	return starpu_worker_get_count_by_type((enum starpu_worker_archtype)type);
 }
 }
 
 
-int fstarpu_worker_get_ids_by_type(intptr_t type, int *workerids, int maxsize)
+unsigned fstarpu_worker_get_ids_by_type(intptr_t type, int *workerids, unsigned maxsize)
 {
 {
 	return starpu_worker_get_ids_by_type((enum starpu_worker_archtype)type, workerids, maxsize);
 	return starpu_worker_get_ids_by_type((enum starpu_worker_archtype)type, workerids, maxsize);
 }
 }

+ 1 - 1
src/util/openmp_runtime_support.h

@@ -376,7 +376,7 @@ struct starpu_omp_global
 	struct starpu_omp_thread *hash_workers;
 	struct starpu_omp_thread *hash_workers;
 	struct _starpu_spinlock hash_workers_lock;
 	struct _starpu_spinlock hash_workers_lock;
 	struct starpu_arbiter *default_arbiter;
 	struct starpu_arbiter *default_arbiter;
-	int nb_starpu_cpu_workers;
+	unsigned nb_starpu_cpu_workers;
 	int *starpu_cpu_worker_ids;
 	int *starpu_cpu_worker_ids;
 };
 };
 
 

+ 1 - 1
src/util/openmp_runtime_support_environment.c

@@ -539,7 +539,6 @@ static void free_places(struct starpu_omp_place *places)
 
 
 static void read_proc_bind_var()
 static void read_proc_bind_var()
 {
 {
-	static const char *strings[] = { "false", "true", "master", "close", "spread", NULL };
 	const int max_levels = _initial_icv_values.max_active_levels_var + 1;
 	const int max_levels = _initial_icv_values.max_active_levels_var + 1;
 	int *bind_list = NULL;
 	int *bind_list = NULL;
 	char *env;
 	char *env;
@@ -549,6 +548,7 @@ static void read_proc_bind_var()
 	env = starpu_getenv("OMP_PROC_BIND");
 	env = starpu_getenv("OMP_PROC_BIND");
 	if (env)
 	if (env)
 	{
 	{
+		static const char *strings[] = { "false", "true", "master", "close", "spread", NULL };
 		char *saveptr, *token;
 		char *saveptr, *token;
 		int level = 0;
 		int level = 0;
 
 

+ 1 - 0
tests/datawizard/dsm_stress.c

@@ -236,6 +236,7 @@ int main(int argc, char **argv)
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	}
 	}
 
 
+	starpu_do_schedule();
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	if (!finished)
 	if (!finished)
 		STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
 		STARPU_PTHREAD_COND_WAIT(&cond, &mutex);

+ 1 - 0
tests/datawizard/sync_with_data_with_mem_non_blocking_implicit.c

@@ -137,6 +137,7 @@ int main(int argc, char **argv)
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire_cb");
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire_cb");
 		}
 		}
 
 
+		starpu_do_schedule();
 		/* Wait for all buffers to be available */
 		/* Wait for all buffers to be available */
 		STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 		STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 
 

+ 1 - 0
tests/main/driver_api/init_run_deinit.c

@@ -48,6 +48,7 @@ run(struct starpu_task *task, struct starpu_driver *d)
 {
 {
 	int ret;
 	int ret;
 	ret = starpu_task_submit(task);
 	ret = starpu_task_submit(task);
+	starpu_do_schedule();
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	while (!starpu_task_finished(task))
 	while (!starpu_task_finished(task))
 	{
 	{

+ 1 - 0
tests/main/regenerate.c

@@ -114,6 +114,7 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) goto enodev;
 	if (ret == -ENODEV) goto enodev;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
 
+	starpu_do_schedule();
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	if (!completed)
 	if (!completed)
 		STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
 		STARPU_PTHREAD_COND_WAIT(&cond, &mutex);

+ 1 - 0
tests/main/regenerate_pipeline.c

@@ -141,6 +141,7 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) goto enodev;
 	if (ret == -ENODEV) goto enodev;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
 
+	starpu_do_schedule();
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	while (completed < 3)
 	while (completed < 3)
 		STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
 		STARPU_PTHREAD_COND_WAIT(&cond, &mutex);

+ 1 - 0
tests/main/subgraph_repeat.c

@@ -147,6 +147,7 @@ int main(int argc, char **argv)
 	ret = starpu_task_submit(&taskC); if (ret == -ENODEV) goto enodev; STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	ret = starpu_task_submit(&taskC); if (ret == -ENODEV) goto enodev; STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	ret = starpu_task_submit(&taskD); if (ret == -ENODEV) goto enodev; STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	ret = starpu_task_submit(&taskD); if (ret == -ENODEV) goto enodev; STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
 
+	starpu_do_schedule();
 	/* Wait for the termination of all loops */
 	/* Wait for the termination of all loops */
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	if (loop_cnt < niter)
 	if (loop_cnt < niter)

+ 1 - 0
tests/main/subgraph_repeat_regenerate.c

@@ -167,6 +167,7 @@ int main(int argc, char **argv)
 	ret = starpu_task_submit(&taskC); if (ret == -ENODEV) goto enodev; STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	ret = starpu_task_submit(&taskC); if (ret == -ENODEV) goto enodev; STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	ret = starpu_task_submit(&taskD); if (ret == -ENODEV) goto enodev; STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	ret = starpu_task_submit(&taskD); if (ret == -ENODEV) goto enodev; STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
 
+	starpu_do_schedule();
 	/* Wait for the termination of all loops */
 	/* Wait for the termination of all loops */
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	while (loop_cntD < niter)
 	while (loop_cntD < niter)

+ 1 - 0
tests/main/subgraph_repeat_regenerate_tag.c

@@ -206,6 +206,7 @@ int main(int argc, char **argv)
 
 
 	starpu_tag_notify_from_apps((starpu_tag_t) TAG_START);
 	starpu_tag_notify_from_apps((starpu_tag_t) TAG_START);
 
 
+	starpu_do_schedule();
 	/* Wait for the termination of all loops */
 	/* Wait for the termination of all loops */
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	if (loop_cnt < niter)
 	if (loop_cnt < niter)

+ 1 - 0
tests/main/subgraph_repeat_tag.c

@@ -176,6 +176,7 @@ int main(int argc, char **argv)
 	ret = starpu_task_submit(&taskC); if (ret == -ENODEV) goto enodev; STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	ret = starpu_task_submit(&taskC); if (ret == -ENODEV) goto enodev; STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	ret = starpu_task_submit(&taskD); if (ret == -ENODEV) goto enodev; STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	ret = starpu_task_submit(&taskD); if (ret == -ENODEV) goto enodev; STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
 
+	starpu_do_schedule();
 	/* Wait for the termination of all loops */
 	/* Wait for the termination of all loops */
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	if (loop_cnt < niter)
 	if (loop_cnt < niter)

+ 8 - 1
tests/microbenchs/tasks_size_overhead.c

@@ -91,7 +91,7 @@ static struct starpu_codelet codelet =
 static void parse_args(int argc, char **argv)
 static void parse_args(int argc, char **argv)
 {
 {
 	int c;
 	int c;
-	while ((c = getopt(argc, argv, "i:b:B:c:C:t:T:f:h")) != -1)
+	while ((c = getopt(argc, argv, "i:b:B:c:C:s:t:T:f:h")) != -1)
 	switch(c)
 	switch(c)
 	{
 	{
 		case 'i':
 		case 'i':
@@ -171,6 +171,13 @@ int main(int argc, char **argv)
 	starpu_shutdown();
 	starpu_shutdown();
 #endif
 #endif
 
 
+#ifdef STARPU_HAVE_UNSETENV
+	/* That was useful to force the max number of cpus to use, but now we
+	 * want to make it vary */
+	unsetenv("STARPU_NCPUS");
+	unsetenv("STARPU_NCPU");
+#endif
+
 	parse_args(argc, argv);
 	parse_args(argc, argv);
 
 
 	float *buffers[total_nbuffers?total_nbuffers:1];
 	float *buffers[total_nbuffers?total_nbuffers:1];

+ 2 - 0
tests/sched_policies/simple_cpu_gpu_sched.c

@@ -263,6 +263,8 @@ main(void)
 #ifdef STARPU_HAVE_UNSETENV
 #ifdef STARPU_HAVE_UNSETENV
 	unsetenv("STARPU_SCHED");
 	unsetenv("STARPU_SCHED");
 #endif
 #endif
+	if (starpu_get_env_number_default("STARPU_NWORKER_PER_CUDA", 1) != 1)
+		return STARPU_TEST_SKIPPED;
 
 
 	int i;
 	int i;
 	int n_policies = sizeof(policies)/sizeof(policies[0]);
 	int n_policies = sizeof(policies)/sizeof(policies[0]);