8 years ago · 15103b9723
--- a/doc/doxygen/chapters/330_scheduling_contexts.doxy
+++ b/doc/doxygen/chapters/330_scheduling_contexts.doxy
@@ -96,6 +96,36 @@ int id_ctx = starpu_sched_ctx_create(workerids, 3, "my_ctx", STARPU_SCHED_CTX_PO
 
																 /* .... */
															
 
																 \endcode
															
 
																+\section CreatingAContext Creating A Context To Partition a GPU
															
 
																+
															
 
																+The contexts can also be used to group a set of SMs of an NVIDIA GPU in order to isolate
															
 
																+the parallel kernels and allow them to co-execute on a specified partition of the GPU.
															
 
																+
															
 
																+Each context will be mapped to a stream and the user can indicate the number of SMs.
															
 
																+The context can be added to a larger context already grouping CPU cores. 
															
 
																+This larger context can use a scheduling policy that assigns tasks to both CPUs and contexts (partitions of the GPU)
															
 
																+based on performance models adjusted to the number of SMs.
															
 
																+
															
 
																+The GPU implementation of the task has to be modified accordingly and receive as a parameter the number of SMs.
															
 
																+
															
 
																+\code{.c}
															
 
																+/* get the available streams (suppose we have nstreams = 2, set by specifying STARPU_NWORKER_PER_CUDA=2) */
															
 
																+int nstreams = starpu_worker_get_stream_workerids(gpu_devid, stream_workerids, STARPU_CUDA_WORKER);
															
 
																+
															
 
																+int sched_ctx[nstreams];
															
 
																+sched_ctx[0] = starpu_sched_ctx_create(&stream_workerids[0], 1, "subctx",  STARPU_SCHED_CTX_CUDA_NSMS, 6, 0);
															
 
																+sched_ctx[1] = starpu_sched_ctx_create(&stream_workerids[1], 1, "subctx",  STARPU_SCHED_CTX_CUDA_NSMS, 7, 0);
															
 
																+
															
 
																+int ncpus = 4;
															
 
																+int workers[ncpus+nstreams];
															
 
																+workers[ncpus+0] = stream_workerids[0];
															
 
																+workers[ncpus+1] = stream_workerids[1];
															
 
																+
															
 
																+big_sched_ctx = starpu_sched_ctx_create(workers, ncpus+nstreams, "ctx1", STARPU_SCHED_CTX_SUB_CTXS, sched_ctx, nstreams, STARPU_SCHED_CTX_POLICY_NAME, "dmdas", 0);
															
 
																+
															
 
																+starpu_task_submit_to_ctx(task, big_sched_ctx);
															
 
																+
															
 
																+\endcode
															
 
																 \section ModifyingAContext Modifying A Context
															
--- a/doc/doxygen/chapters/501_environment_variables.doxy
+++ b/doc/doxygen/chapters/501_environment_variables.doxy
@@ -51,6 +51,15 @@ Specify the number of workers per CUDA device, and thus the number of kernels
 
																 which will be concurrently running on the devices. The default value is 1.
															
 
																 </dd>
															
 
																+<dt>STARPU_ONE_THREAD_PER_STREAM</dt>
															
 
																+<dd>
															
 
																+\anchor STARPU_ONE_THREAD_PER_STREAM
															
 
																+\addindex __env__STARPU_ONE_THREAD_PER_STREAM
															
 
																+Specify if the CUDA driver should provide a thread per stream or a single thread
															
 
																+dealing with all the streams. 0 if one thread per stream, 1 otherwise. The default 
															
 
																+value is 1.
															
 
																+</dd>
															
 
																+
															
 
																 <dt>STARPU_CUDA_PIPELINE</dt>
															
 
																 <dd>
															
 
																 \anchor STARPU_CUDA_PIPELINE
															
--- a/doc/doxygen/chapters/api/scheduling_contexts.doxy
+++ b/doc/doxygen/chapters/api/scheduling_contexts.doxy
@@ -106,6 +106,17 @@ function pointer allowing to initialize the scheduling policy.
 
																 This macro is used when calling starpu_sched_ctx_create() to specify a
															
 
																 pointer to some user data related to the context being created.
															
 
																+\def STARPU_SCHED_CTX_SUB_CTXS
															
 
																+\ingroup API_Scheduling_Contexts
															
 
																+This macro is used when calling starpu_sched_ctx_create() to specify 
															
 
																+a list of sub-contexts of the current context (an array of context ids followed by its length).
															
 
																+
															
 
																+\def STARPU_SCHED_CTX_CUDA_NSMS
															
 
																+\ingroup API_Scheduling_Contexts
															
 
																+This macro is used when calling starpu_sched_ctx_create() to create
															
 
																+a context on an NVIDIA GPU; it specifies the number of SMs
															
 
																+the context should have.
															
 
																+
															
 
																 \fn unsigned starpu_sched_ctx_create_inside_interval(const char *policy_name, const char *sched_ctx_name, int min_ncpus, int max_ncpus, int min_ngpus, int max_ngpus, unsigned allow_overlap)
															
 
																 \ingroup API_Scheduling_Contexts
															
 
																 Create a context indicating an approximate interval of resources
															
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -73,7 +73,10 @@ EXTRA_DIST = 					\
 
																 	reductions/dot_product_opencl_kernels.cl	\
															
 
																 	scheduler/schedulers.sh				\
															
 
																 	scheduler/schedulers_context.sh			\
															
 
																-	fortran/Makefile
															
 
																+	fortran/Makefile				\
															
 
																+	sched_ctx/axpy_partition_gpu.h				\
															
 
																+	sched_ctx/axpy_partition_gpu.cu		
															
 
																+
															
 
																 CLEANFILES = *.gcno *.gcda *.linkinfo *.mod starpu_idle_microsec.log
															
@@ -138,7 +141,8 @@ noinst_HEADERS = 				\
 
																 	pi/SobolQRNG/sobol_gpu.h		\
															
 
																 	pi/SobolQRNG/sobol_primitives.h         \
															
 
																 	reductions/dot_product.h                \
															
 
																-	basic_examples/vector_scal_cpu_template.h
															
 
																+	basic_examples/vector_scal_cpu_template.h \
															
 
																+	sched_ctx/axpy_partition_gpu.h				
															
 
																 #####################################
															
 
																 # What to install and what to check #
															
@@ -229,7 +233,8 @@ STARPU_EXAMPLES +=				\
 
																 	sched_ctx/dummy_sched_with_ctx		\
															
 
																 	worker_collections/worker_tree_example  \
															
 
																 	reductions/dot_product			\
															
 
																-	reductions/minmax_reduction
															
 
																+	reductions/minmax_reduction		\
															
 
																+	sched_ctx/gpu_partition
															
 
																 endif
															
@@ -337,6 +342,14 @@ endif
 
																 endif !STARPU_SIMGRID
															
 
																+sched_ctx_gpu_partition_SOURCES =		\
															
 
																+	sched_ctx/gpu_partition.c
															
 
																+
															
 
																+if STARPU_USE_CUDA
															
 
																+sched_ctx_gpu_partition_SOURCES +=		\
															
 
																+	sched_ctx/axpy_partition_gpu.cu
															
 
																+endif
															
 
																+
															
 
																 ##################
															
 
																 # Basic examples #
															
 
																 ##################
															
@@ -851,7 +864,7 @@ endif
 
																 cpp_add_vectors_SOURCES	=	\
															
 
																 	cpp/add_vectors.cpp
															
 
																-	
															
 
																+
															
 
																 if STARPU_HAVE_CXX11
															
 
																 cpp_add_vectors_cpp11_SOURCES	=	\
															
 
																 	cpp/add_vectors_cpp11.cpp
															
--- a/include/starpu_sched_ctx.h
+++ b/include/starpu_sched_ctx.h
@@ -33,6 +33,8 @@ extern "C"
 
																 #define STARPU_SCHED_CTX_AWAKE_WORKERS           (7<<16)
															
 
																 #define STARPU_SCHED_CTX_POLICY_INIT             (8<<16)
															
 
																 #define STARPU_SCHED_CTX_USER_DATA               (9<<16)
															
 
																+#define STARPU_SCHED_CTX_CUDA_NSMS               (10<<16)
															
 
																+#define STARPU_SCHED_CTX_SUB_CTXS                (11<<16)
															
 
																 unsigned starpu_sched_ctx_create(int *workerids_ctx, int nworkers_ctx, const char *sched_ctx_name, ...);
															
@@ -157,7 +159,7 @@ unsigned starpu_sched_ctx_master_get_context(int masterid);
 
																 void starpu_sched_ctx_revert_task_counters(unsigned sched_ctx_id, double flops);
															
 
																-void starpu_sched_ctx_move_task_to_ctx(struct starpu_task *task, unsigned sched_ctx, unsigned manage_mutex);
															
 
																+void starpu_sched_ctx_move_task_to_ctx(struct starpu_task *task, unsigned sched_ctx, unsigned manage_mutex, unsigned with_repush);
															
 
																 int starpu_sched_ctx_get_worker_rank(unsigned sched_ctx_id);
															
@@ -168,6 +170,10 @@ unsigned starpu_sched_ctx_has_starpu_scheduler(unsigned sched_ctx_id, unsigned *
 
																 void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id);
															
 
																 #endif /* STARPU_USE_SC_HYPERVISOR */
															
 
																+int starpu_sched_ctx_get_stream_worker(unsigned sub_ctx);
															
 
																+int starpu_sched_ctx_get_nsms(unsigned sched_ctx);
															
 
																+void starpu_sched_ctx_get_sms_interval(int stream_workerid, int *start, int *end);
															
 
																+
															
 
																 #ifdef __cplusplus
															
 
																 }
															
 
																 #endif
															
--- a/include/starpu_scheduler.h
+++ b/include/starpu_scheduler.h
@@ -39,8 +39,8 @@ struct starpu_sched_policy
 
																 	struct starpu_task *(*pop_every_task)(unsigned sched_ctx_id);
															
 
																 	void (*submit_hook)(struct starpu_task *task);
															
 
																-	void (*pre_exec_hook)(struct starpu_task *);
															
 
																-	void (*post_exec_hook)(struct starpu_task *);
															
 
																+	void (*pre_exec_hook)(struct starpu_task *, unsigned sched_ctx_id);
															
 
																+	void (*post_exec_hook)(struct starpu_task *, unsigned sched_ctx_id);
															
 
																 	void (*do_schedule)(unsigned sched_ctx_id);
															
--- a/include/starpu_task.h
+++ b/include/starpu_task.h
@@ -196,6 +196,7 @@ struct starpu_task
 
																 	double flops;
															
 
																 	double predicted;
															
 
																 	double predicted_transfer;
															
 
																+	double predicted_start;
															
 
																 	struct starpu_task *prev;
															
 
																 	struct starpu_task *next;
															
--- a/include/starpu_worker.h
+++ b/include/starpu_worker.h
@@ -129,6 +129,11 @@ char *starpu_worker_get_type_as_string(enum starpu_worker_archtype type);
 
																 int starpu_bindid_get_workerids(int bindid, int **workerids);
															
 
																+int starpu_worker_get_devids(enum starpu_worker_archtype type, int *devids, int num);
															
 
																+
															
 
																+int starpu_worker_get_stream_workerids(int devid, int *workerids, enum starpu_worker_archtype type);
															
 
																+
															
 
																+unsigned starpu_worker_get_sched_ctx_id_stream(int stream_workerid);
															
 
																 #ifdef __cplusplus
															
 
																 }
															
 
																 #endif
															
--- a/src/core/perfmodel/perfmodel.c
+++ b/src/core/perfmodel/perfmodel.c
@@ -59,6 +59,9 @@ struct starpu_perfmodel_arch* starpu_worker_get_perf_archtype(int workerid, unsi
 
																 		unsigned child_sched_ctx = starpu_sched_ctx_worker_is_master_for_child_ctx(workerid, sched_ctx_id);
															
 
																 		if(child_sched_ctx != STARPU_NMAX_SCHED_CTXS)
															
 
																 			return _starpu_sched_ctx_get_perf_archtype(child_sched_ctx);
															
 
																+		struct _starpu_sched_ctx *stream_ctx = _starpu_worker_get_ctx_stream(workerid);
															
 
																+		if(stream_ctx != NULL)
															
 
																+			return _starpu_sched_ctx_get_perf_archtype(stream_ctx->id); 
															
 
																 	}
															
 
																 	struct _starpu_machine_config *config = _starpu_get_machine_config();
															
--- a/src/core/sched_ctx.c
+++ b/src/core/sched_ctx.c
@@ -33,6 +33,7 @@ static size_t data_size[STARPU_NMAX_SCHED_CTXS][STARPU_NMAXWORKERS];
 
																 static double hyp_actual_start_sample[STARPU_NMAX_SCHED_CTXS];
															
 
																 static double window_size;
															
 
																 static int nobind;
															
 
																+static int occupied_sms = 0;
															
 
																 static unsigned _starpu_get_first_free_sched_ctx(struct _starpu_machine_config *config);
															
 
																 static void _starpu_sched_ctx_add_workers_to_master(unsigned sched_ctx_id, int *workerids, int nworkers, int new_master);
															
@@ -297,7 +298,10 @@ static void _starpu_add_workers_to_sched_ctx(struct _starpu_sched_ctx *sched_ctx
 
																 			{
															
 
																 				sched_ctx->perf_arch.devices[sched_ctx->perf_arch.ndevices].type = devices[dev1].type;
															
 
																 				sched_ctx->perf_arch.devices[sched_ctx->perf_arch.ndevices].devid = devices[dev1].devid;
															
 
																-				sched_ctx->perf_arch.devices[sched_ctx->perf_arch.ndevices].ncores = devices[dev1].ncores;
															
 
																+				if (sched_ctx->stream_worker != -1)
															
 
																+					sched_ctx->perf_arch.devices[sched_ctx->perf_arch.ndevices].ncores = sched_ctx->nsms;
															
 
																+				else
															
 
																+					sched_ctx->perf_arch.devices[sched_ctx->perf_arch.ndevices].ncores = devices[dev1].ncores;
															
 
																 				sched_ctx->perf_arch.ndevices++;
															
 
																 			}
															
 
																 			else
															
@@ -472,7 +476,8 @@ struct _starpu_sched_ctx* _starpu_create_sched_ctx(struct starpu_sched_policy *p
 
																 						   int max_prio_set, int max_prio,
															
 
																 						   unsigned awake_workers,
															
 
																 						   void (*sched_policy_init)(unsigned),
															
 
																-						   void * user_data)
															
 
																+						   void * user_data,
															
 
																+						   int nsub_ctxs, int *sub_ctxs, int nsms)
															
 
																 {
															
 
																 	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
															
@@ -526,6 +531,23 @@ struct _starpu_sched_ctx* _starpu_create_sched_ctx(struct starpu_sched_policy *p
 
																 	sched_ctx->perf_arch.ndevices = 0;
															
 
																 	sched_ctx->init_sched = sched_policy_init;
															
 
																 	sched_ctx->user_data = user_data;
															
 
																+	sched_ctx->sms_start_idx = 0;
															
 
																+	sched_ctx->sms_end_idx = STARPU_NMAXSMS;
															
 
																+	sched_ctx->nsms = nsms;
															
 
																+	sched_ctx->stream_worker = -1;
															
 
																+	if(nsms > 0)
															
 
																+	{
															
 
																+		sched_ctx->sms_start_idx = occupied_sms;
															
 
																+		sched_ctx->sms_end_idx = occupied_sms+nsms;
															
 
																+		occupied_sms += nsms;
															
 
																+		printf("ctx %d: stream worker %d nsms %d ocupied sms %d\n", sched_ctx->id, workerids[0], nsms, occupied_sms);
															
 
																+		STARPU_ASSERT_MSG(occupied_sms <= STARPU_NMAXSMS , "STARPU:requested more sms than available");
															
 
																+		_starpu_worker_set_stream_ctx(workerids[0], sched_ctx);
															
 
																+		sched_ctx->stream_worker = workerids[0];
															
 
																+	}
															
 
																+
															
 
																+	sched_ctx->nsub_ctxs = 0;
															
 
																+
															
 
																 	int w;
															
 
																 	for(w = 0; w < nworkers; w++)
															
 
																 	{
															
@@ -565,6 +587,15 @@ struct _starpu_sched_ctx* _starpu_create_sched_ctx(struct starpu_sched_policy *p
 
																 		  }
															
 
																 	}
															
 
																+        /*add sub_ctxs before add workers, in order to be able to associate them if necessary */
															
 
																+	if(nsub_ctxs != 0)
															
 
																+	{
															
 
																+		int i;
															
 
																+		for(i = 0; i < nsub_ctxs; i++)
															
 
																+			sched_ctx->sub_ctxs[i] = sub_ctxs[i];
															
 
																+		sched_ctx->nsub_ctxs = nsub_ctxs;
															
 
																+	}
															
 
																+	
															
 
																 	/* after having an worker_collection on the ressources add them */
															
 
																 	_starpu_add_workers_to_sched_ctx(sched_ctx, workerids, nworkers_ctx, NULL, NULL);
															
@@ -724,7 +755,7 @@ unsigned starpu_sched_ctx_create_inside_interval(const char *policy_name, const
 
																 	for(i = 0; i < nw; i++)
															
 
																 		printf("%d ", workers[i]);
															
 
																 	printf("\n");
															
 
																-	sched_ctx = _starpu_create_sched_ctx(selected_policy, workers, nw, 0, sched_ctx_name, 0, 0, 0, 0, 1, NULL, NULL);
															
 
																+	sched_ctx = _starpu_create_sched_ctx(selected_policy, workers, nw, 0, sched_ctx_name, 0, 0, 0, 0, 1, NULL, NULL,0, NULL, 0);
															
 
																 	sched_ctx->min_ncpus = min_ncpus;
															
 
																 	sched_ctx->max_ncpus = max_ncpus;
															
 
																 	sched_ctx->min_ngpus = min_ngpus;
															
@@ -742,6 +773,45 @@ unsigned starpu_sched_ctx_create_inside_interval(const char *policy_name, const
 
																 }
															
 
																+int starpu_sched_ctx_get_nsms(unsigned sched_ctx)
															
 
																+{
															
 
																+	struct _starpu_sched_ctx *sc = _starpu_get_sched_ctx_struct(sched_ctx);
															
 
																+	return sc->nsms;
															
 
																+}
															
 
																+
															
 
																+void starpu_sched_ctx_get_sms_interval(int stream_workerid, int *start, int *end)
															
 
																+{
															
 
																+	struct _starpu_sched_ctx *sc = _starpu_worker_get_ctx_stream(stream_workerid);
															
 
																+	*start = sc->sms_start_idx;
															
 
																+	*end = sc->sms_end_idx;
															
 
																+}
															
 
																+
															
 
																+int starpu_sched_ctx_get_sub_ctxs(unsigned sched_ctx, int *ctxs)
															
 
																+{
															
 
																+	struct _starpu_sched_ctx *sc = _starpu_get_sched_ctx_struct(sched_ctx);
															
 
																+	int i;
															
 
																+	for(i = 0; i < sc->nsub_ctxs; i++)
															
 
																+		    ctxs[i] = sc->sub_ctxs[i];
															
 
																+	return sc->nsub_ctxs;
															
 
																+}
															
 
																+
															
 
																+int starpu_sched_ctx_get_stream_worker(unsigned sub_ctx)
															
 
																+{
															
 
																+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sub_ctx);
															
 
																+	struct starpu_worker_collection *workers = sched_ctx->workers;
															
 
																+
															
 
																+	struct starpu_sched_ctx_iterator it;
															
 
																+	int worker = -1;
															
 
																+	
															
 
																+	workers->init_iterator(workers, &it);
															
 
																+	if(workers->has_next(workers, &it))
															
 
																+	{
															
 
																+		worker = workers->get_next(workers, &it);
															
 
																+	}
															
 
																+
															
 
																+	return worker;
															
 
																+}
															
 
																+
															
 
																 unsigned starpu_sched_ctx_create(int *workerids, int nworkers, const char *sched_ctx_name, ...)
															
 
																 {
															
 
																 	va_list varg_list;
															
@@ -750,6 +820,9 @@ unsigned starpu_sched_ctx_create(int *workerids, int nworkers, const char *sched
 
																 	int max_prio_set = 0;
															
 
																 	int min_prio = 0;
															
 
																 	int max_prio = 0;
															
 
																+	int nsms = 0;
															
 
																+        int *sub_ctxs = NULL;
															
 
																+        int nsub_ctxs = 0;
															
 
																 	void *user_data = NULL;
															
 
																 	struct starpu_sched_policy *sched_policy = NULL;
															
 
																 	unsigned hierarchy_level = 0;
															
@@ -800,6 +873,15 @@ unsigned starpu_sched_ctx_create(int *workerids, int nworkers, const char *sched
 
																 		{
															
 
																 			user_data = va_arg(varg_list, void *);
															
 
																 		}
															
 
																+		else if (arg_type == STARPU_SCHED_CTX_SUB_CTXS)
															
 
																+		{
															
 
																+			sub_ctxs = va_arg(varg_list, int*);
															
 
																+			nsub_ctxs = va_arg(varg_list, int);
															
 
																+		}
															
 
																+		else if (arg_type == STARPU_SCHED_CTX_CUDA_NSMS)
															
 
																+		{
															
 
																+			nsms = va_arg(varg_list, int);
															
 
																+		}
															
 
																 		else
															
 
																 		{
															
 
																 			STARPU_ABORT_MSG("Unrecognized argument %d\n", arg_type);
															
@@ -824,7 +906,7 @@ unsigned starpu_sched_ctx_create(int *workerids, int nworkers, const char *sched
 
																 	}
															
 
																 	struct _starpu_sched_ctx *sched_ctx = NULL;
															
 
																-	sched_ctx = _starpu_create_sched_ctx(sched_policy, workerids, nworkers, 0, sched_ctx_name, min_prio_set, min_prio, max_prio_set, max_prio, awake_workers, init_sched, user_data);
															
 
																+	sched_ctx = _starpu_create_sched_ctx(sched_policy, workerids, nworkers, 0, sched_ctx_name, min_prio_set, min_prio, max_prio_set, max_prio, awake_workers, init_sched, user_data, nsub_ctxs, sub_ctxs, nsms);
															
 
																 	sched_ctx->hierarchy_level = hierarchy_level;
															
 
																 	sched_ctx->nesting_sched_ctx = nesting_sched_ctx;
															
@@ -848,6 +930,9 @@ int fstarpu_sched_ctx_create(int *workerids, int nworkers, const char *sched_ctx
 
																 	int max_prio_set = 0;
															
 
																 	int min_prio = 0;
															
 
																 	int max_prio = 0;
															
 
																+	int nsms = 0;
															
 
																+        int *sub_ctxs = NULL;
															
 
																+        int nsub_ctxs = 0;
															
 
																 	void *user_data = NULL;
															
 
																 	struct starpu_sched_policy *sched_policy = NULL;
															
 
																 	unsigned hierarchy_level = 0;
															
@@ -910,6 +995,19 @@ int fstarpu_sched_ctx_create(int *workerids, int nworkers, const char *sched_ctx
 
																 			arg_i++;
															
 
																 			user_data = arglist[arg_i];
															
 
																 		}
															
 
																+		else if (arg_type == STARPU_SCHED_CTX_SUB_CTXS)
															
 
																+		{
															
 
																+			arg_i++;
															
 
																+			sub_ctxs = (int*)arglist[arg_i]; 
															
 
																+			arg_i++;
															
 
																+			nsub_ctxs = *(int*)arglist[arg_i]; 
															
 
																+		}
															
 
																+		else if (arg_type == STARPU_SCHED_CTX_CUDA_NSMS)
															
 
																+		{
															
 
																+			arg_i++;
															
 
																+			nsms = *(int*)arglist[arg_i]; 
															
 
																+		}
															
 
																+
															
 
																 		else
															
 
																 		{
															
 
																 			STARPU_ABORT_MSG("Unrecognized argument %d\n", arg_type);
															
@@ -933,7 +1031,7 @@ int fstarpu_sched_ctx_create(int *workerids, int nworkers, const char *sched_ctx
 
																 	}
															
 
																 	struct _starpu_sched_ctx *sched_ctx = NULL;
															
 
																-	sched_ctx = _starpu_create_sched_ctx(sched_policy, workerids, nworkers, 0, sched_ctx_name, min_prio_set, min_prio, max_prio_set, max_prio, awake_workers, init_sched, user_data);
															
 
																+	sched_ctx = _starpu_create_sched_ctx(sched_policy, workerids, nworkers, 0, sched_ctx_name, min_prio_set, min_prio, max_prio_set, max_prio, awake_workers, init_sched, user_data, nsub_ctxs, sub_ctxs, nsms);
															
 
																 	sched_ctx->hierarchy_level = hierarchy_level;
															
 
																 	sched_ctx->nesting_sched_ctx = nesting_sched_ctx;
															
@@ -1015,7 +1113,8 @@ void starpu_sched_ctx_delete(unsigned sched_ctx_id)
 
																 {
															
 
																 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
															
 
																 #ifdef STARPU_USE_SC_HYPERVISOR
															
 
																-	if (sched_ctx_id != 0 && sched_ctx_id != STARPU_NMAX_SCHED_CTXS && sched_ctx->perf_counters != NULL)
															
 
																+	if(sched_ctx != NULL && sched_ctx_id != 0 && sched_ctx_id != STARPU_NMAX_SCHED_CTXS
															
 
																+	   && sched_ctx->perf_counters != NULL)
															
 
																 	{
															
 
																 		_STARPU_TRACE_HYPERVISOR_BEGIN();
															
 
																 		sched_ctx->perf_counters->notify_delete_context(sched_ctx_id);
															
@@ -1062,6 +1161,7 @@ void starpu_sched_ctx_delete(unsigned sched_ctx_id)
 
																 	   you don't use it anymore */
															
 
																 	free(workerids);
															
 
																 	_starpu_relock_mutex_if_prev_locked();
															
 
																+	occupied_sms -= sched_ctx->nsms;
															
 
																 	return;
															
 
																 }
															
@@ -2092,7 +2192,8 @@ void starpu_sched_ctx_revert_task_counters(unsigned sched_ctx_id, double ready_f
 
																         _starpu_decrement_nready_tasks_of_sched_ctx(sched_ctx_id, ready_flops);
															
 
																 }
															
 
																-void starpu_sched_ctx_move_task_to_ctx(struct starpu_task *task, unsigned sched_ctx, unsigned manage_mutex)
															
 
																+void starpu_sched_ctx_move_task_to_ctx(struct starpu_task *task, unsigned sched_ctx, unsigned manage_mutex, 
															
 
																+				       unsigned with_repush)
															
 
																 {
															
 
																 	/* TODO: make something cleaner which differentiates between calls
															
 
																 	   from push or pop (have mutex or not) and from another worker or not */
															
@@ -2111,7 +2212,10 @@ void starpu_sched_ctx_move_task_to_ctx(struct starpu_task *task, unsigned sched_
 
																 	_starpu_increment_nsubmitted_tasks_of_sched_ctx(j->task->sched_ctx);
															
 
																-	_starpu_repush_task(j);
															
 
																+	if(with_repush)
															
 
																+		_starpu_repush_task(j);
															
 
																+	else
															
 
																+		_starpu_increment_nready_tasks_of_sched_ctx(j->task->sched_ctx, j->task->flops, j->task);
															
 
																 	if(workerid != -1 && manage_mutex)
															
 
																 		STARPU_PTHREAD_MUTEX_LOCK_SCHED(&worker->sched_mutex);
															
--- a/src/core/sched_ctx.h
+++ b/src/core/sched_ctx.h
@@ -36,7 +36,7 @@
 
																 #define DO_RESIZE 1
															
 
																 #define STARPU_GLOBAL_SCHED_CTX 0
															
 
																-
															
 
																+#define STARPU_NMAXSMS 13
															
 
																 struct _starpu_sched_ctx
															
 
																 {
															
 
																 	/* id of the context used in user mode*/
															
@@ -174,6 +174,16 @@ struct _starpu_sched_ctx
 
																 	/* function called when initializing the scheduler */
															
 
																 	void (*init_sched)(unsigned);
															
 
																+
															
 
																+	int sub_ctxs[STARPU_NMAXWORKERS];
															
 
																+	int nsub_ctxs;
															
 
																+
															
 
																+	/* nr of SMs assigned to this ctx if we partition gpus*/
															
 
																+	int nsms;
															
 
																+	int sms_start_idx;
															
 
																+	int sms_end_idx;
															
 
																+
															
 
																+	int stream_worker;
															
 
																 };
															
 
																 struct _starpu_machine_config;
															
@@ -184,7 +194,8 @@ void _starpu_init_all_sched_ctxs(struct _starpu_machine_config *config);
 
																 /* allocate all structures belonging to a context */
															
 
																 struct _starpu_sched_ctx*  _starpu_create_sched_ctx(struct starpu_sched_policy *policy, int *workerid, int nworkerids, unsigned is_init_sched, const char *sched_name,
															
 
																 						    int min_prio_set, int min_prio,
															
 
																-						    int max_prio_set, int max_prio, unsigned awake_workers, void (*sched_policy_init)(unsigned), void *user_data);
															
 
																+						    int max_prio_set, int max_prio, unsigned awake_workers, void (*sched_policy_init)(unsigned), void *user_data,
															
 
																+							int nsub_ctxs, int *sub_ctxs, int nsms);
															
 
																 /* delete all sched_ctx */
															
 
																 void _starpu_delete_all_sched_ctxs();
															
--- a/src/core/sched_policy.c
+++ b/src/core/sched_policy.c
@@ -1009,9 +1009,34 @@ void _starpu_sched_pre_exec_hook(struct starpu_task *task)
 
																 	if (sched_ctx->sched_policy && sched_ctx->sched_policy->pre_exec_hook)
															
 
																 	{
															
 
																 		_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
															
 
																-		sched_ctx->sched_policy->pre_exec_hook(task);
															
 
																+		sched_ctx->sched_policy->pre_exec_hook(task, sched_ctx_id);
															
 
																 		_STARPU_TRACE_WORKER_SCHEDULING_POP;
															
 
																 	}
															
 
																+
															
 
																+	if(!sched_ctx->sched_policy)
															
 
																+	{
															
 
																+		int workerid = starpu_worker_get_id();
															
 
																+		struct _starpu_worker *worker =  _starpu_get_worker_struct(workerid);
															
 
																+		struct _starpu_sched_ctx *other_sched_ctx;
															
 
																+		struct _starpu_sched_ctx_elt *e = NULL;
															
 
																+		struct _starpu_sched_ctx_list_iterator list_it;
															
 
																+		
															
 
																+		_starpu_sched_ctx_list_iterator_init(worker->sched_ctx_list, &list_it);
															
 
																+		while (_starpu_sched_ctx_list_iterator_has_next(&list_it))
															
 
																+		{
															
 
																+			e = _starpu_sched_ctx_list_iterator_get_next(&list_it);
															
 
																+			other_sched_ctx = _starpu_get_sched_ctx_struct(e->sched_ctx);
															
 
																+			if (other_sched_ctx != sched_ctx && 
															
 
																+			    other_sched_ctx->sched_policy != NULL && 
															
 
																+			    other_sched_ctx->sched_policy->pre_exec_hook)
															
 
																+			{
															
 
																+				_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
															
 
																+				other_sched_ctx->sched_policy->pre_exec_hook(task, other_sched_ctx->id);
															
 
																+				_STARPU_TRACE_WORKER_SCHEDULING_POP;
															
 
																+			}
															
 
																+		}
															
 
																+	}
															
 
																+
															
 
																 }
															
 
																 void _starpu_sched_post_exec_hook(struct starpu_task *task)
															
@@ -1021,9 +1046,32 @@ void _starpu_sched_post_exec_hook(struct starpu_task *task)
 
																 	if (sched_ctx->sched_policy && sched_ctx->sched_policy->post_exec_hook)
															
 
																 	{
															
 
																 		_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
															
 
																-		sched_ctx->sched_policy->post_exec_hook(task);
															
 
																+		sched_ctx->sched_policy->post_exec_hook(task, sched_ctx_id);
															
 
																 		_STARPU_TRACE_WORKER_SCHEDULING_POP;
															
 
																 	}
															
 
																+	if(!sched_ctx->sched_policy)
															
 
																+	{
															
 
																+		int workerid = starpu_worker_get_id();
															
 
																+		struct _starpu_worker *worker =  _starpu_get_worker_struct(workerid);
															
 
																+		struct _starpu_sched_ctx *other_sched_ctx;
															
 
																+		struct _starpu_sched_ctx_elt *e = NULL;
															
 
																+		struct _starpu_sched_ctx_list_iterator list_it;
															
 
																+		
															
 
																+		_starpu_sched_ctx_list_iterator_init(worker->sched_ctx_list, &list_it);
															
 
																+		while (_starpu_sched_ctx_list_iterator_has_next(&list_it))
															
 
																+		{
															
 
																+			e = _starpu_sched_ctx_list_iterator_get_next(&list_it);
															
 
																+			other_sched_ctx = _starpu_get_sched_ctx_struct(e->sched_ctx);
															
 
																+			if (other_sched_ctx != sched_ctx && 
															
 
																+			    other_sched_ctx->sched_policy != NULL && 
															
 
																+			    other_sched_ctx->sched_policy->post_exec_hook)
															
 
																+			{
															
 
																+				_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
															
 
																+				other_sched_ctx->sched_policy->post_exec_hook(task, other_sched_ctx->id);
															
 
																+				_STARPU_TRACE_WORKER_SCHEDULING_POP;
															
 
																+			}
															
 
																+		}
															
 
																+	}
															
 
																 }
															
 
																 void _starpu_wait_on_sched_event(void)
															
--- a/src/core/task.c
+++ b/src/core/task.c
@@ -94,6 +94,7 @@ void starpu_task_init(struct starpu_task *task)
 
																 	task->predicted = NAN;
															
 
																 	task->predicted_transfer = NAN;
															
 
																+	task->predicted_start = NAN;
															
 
																 	task->magic = 42;
															
 
																 	task->sched_ctx = STARPU_NMAX_SCHED_CTXS;
															
--- a/src/core/topology.c
+++ b/src/core/topology.c
@@ -89,7 +89,12 @@ _starpu_get_worker_from_driver(struct starpu_driver *d)
 
																 #ifdef STARPU_USE_CUDA
															
 
																 	if (d->type == STARPU_CUDA_WORKER)
															
 
																-		return &cuda_worker_set[d->id.cuda_id];
															
 
																+	{
															
 
																+		unsigned th_per_stream = starpu_get_env_number_default("STARPU_ONE_THREAD_PER_STREAM", 1);
															
 
																+		if(th_per_stream == 0)
															
 
																+			return &cuda_worker_set[d->id.cuda_id];
															
 
																+
															
 
																+	}
															
 
																 #endif
															
 
																 	for (workerid = 0; workerid < nworkers; workerid++)
															
@@ -116,6 +121,16 @@ _starpu_get_worker_from_driver(struct starpu_driver *d)
 
																 				break;
															
 
																 			}
															
 
																 #endif
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+			case STARPU_CUDA_WORKER:
															
 
																+			{
															
 
																+				if (worker->devid == d->id.cuda_id)
															
 
																+					return &worker->set;
															
 
																+				break;
															
 
																+
															
 
																+			}
															
 
																+#endif
															
 
																+
															
 
																 			default:
															
 
																 				_STARPU_DEBUG("Invalid device type\n");
															
 
																 				return NULL;
															
@@ -1038,17 +1053,27 @@ _starpu_init_machine_config(struct _starpu_machine_config *config, int no_mp_con
 
																 	_starpu_initialize_workers_cuda_gpuid(config);
															
 
																+	/* allow having one worker per stream */
															
 
																+	unsigned th_per_stream = starpu_get_env_number_default("STARPU_WORKER_PER_STREAM", 1);
															
 
																+
															
 
																 	unsigned cudagpu;
															
 
																 	for (cudagpu = 0; cudagpu < topology->ncudagpus; cudagpu++)
															
 
																 	{
															
 
																 		int devid = _starpu_get_next_cuda_gpuid(config);
															
 
																 		int worker_idx0 = topology->nworkers + cudagpu * nworker_per_cuda;
															
 
																 		cuda_worker_set[devid].workers = &config->workers[worker_idx0];
															
 
																+
															
 
																 		for (i = 0; i < nworker_per_cuda; i++)
															
 
																 		{
															
 
																 			int worker_idx = worker_idx0 + i;
															
 
																+			if(th_per_stream)
															
 
																+			{
															
 
																+				config->workers[worker_idx].set = (struct _starpu_worker_set *)malloc(sizeof(struct _starpu_worker_set));
															
 
																+				config->workers[worker_idx].set->workers = &config->workers[worker_idx];
															
 
																+			}
															
 
																+			else
															
 
																+				config->workers[worker_idx].set = &cuda_worker_set[devid];
															
 
																-			config->workers[worker_idx].set = &cuda_worker_set[devid];
															
 
																 			config->workers[worker_idx].arch = STARPU_CUDA_WORKER;
															
 
																 			_STARPU_MALLOC(config->workers[worker_idx].perf_arch.devices, sizeof(struct starpu_perfmodel_device));
															
 
																 			config->workers[worker_idx].perf_arch.ndevices = 1;
															
@@ -1554,7 +1579,6 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
																 		int *preferred_binding = NULL;
															
 
																 		int npreferred = 0;
															
 
																 #endif
															
 
																-
															
 
																 		/* select the memory node that contains worker's memory */
															
 
																 		switch (workerarg->arch)
															
 
																 		{
															
@@ -1601,7 +1625,7 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
																 				{
															
 
																 					memory_node = cuda_memory_nodes[devid];
															
 
																 #ifndef STARPU_SIMGRID
															
 
																-					workerarg->bindid = cuda_bindid[devid];
															
 
																+					workerarg->bindid = _starpu_get_next_bindid(config, preferred_binding, npreferred);//cuda_bindid[devid];
															
 
																 #endif /* SIMGRID */
															
 
																 				}
															
 
																 				else
															
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -371,7 +371,7 @@ int starpu_worker_can_execute_task_first_impl(unsigned workerid, struct starpu_t
 
																 	{
															
 
																 		for (i = 0; i < STARPU_MAXIMPLEMENTATIONS; i++)
															
 
																 			if (_starpu_can_use_nth_implementation(arch, cl, i)
															
 
																-			 && task->cl->can_execute(workerid, task, i))
															
 
																+			 && (!task->cl->can_execute || task->cl->can_execute(workerid, task, i)))
															
 
																 			{
															
 
																 				if (nimpl)
															
 
																 					*nimpl = i;
															
@@ -676,12 +676,16 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 
																 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
															
 
																 			case STARPU_CUDA_WORKER:
															
 
																 				driver.id.cuda_id = devid;
															
 
																-
															
 
																-				/* We spawn only one thread per CUDA driver,
															
 
																-				 * which will control all CUDA workers of this
															
 
																-				 * driver. (by using a worker set). */
															
 
																-				if (worker_set->workers != workerarg)
															
 
																-					break;
															
 
																+				/* allow having one worker per stream */
															
 
																+				unsigned th_per_stream = starpu_get_env_number_default("STARPU_ONE_THREAD_PER_STREAM", 1);
															
 
																+				if(th_per_stream == 0)
															
 
																+				{
															
 
																+					/* We spawn only one thread per CUDA driver,
															
 
																+					 * which will control all CUDA workers of this
															
 
																+					 * driver. (by using a worker set). */
															
 
																+					if (worker_set->workers != workerarg)
															
 
																+						break;
															
 
																+				}
															
 
																 				worker_set->nworkers = starpu_get_env_number_default("STARPU_NWORKER_PER_CUDA", 1);
															
@@ -701,19 +705,53 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 
																 					break;
															
 
																 				}
															
 
																-				STARPU_PTHREAD_CREATE_ON(
															
 
																-					workerarg->name,
															
 
																-					&worker_set->worker_thread,
															
 
																-					NULL,
															
 
																-					_starpu_cuda_worker,
															
 
																-					worker_set,
															
 
																-					_starpu_simgrid_get_host_by_worker(workerarg));
															
 
																+
															
 
																+				if(th_per_stream == 0)
															
 
																+				{
															
 
																+					STARPU_PTHREAD_CREATE_ON(
															
 
																+						workerarg->name,
															
 
																+						&worker_set->worker_thread,
															
 
																+						NULL,
															
 
																+						_starpu_cuda_worker,
															
 
																+						worker_set,
															
 
																+						_starpu_simgrid_get_host_by_worker(workerarg));
															
 
																+				}
															
 
																+				else
															
 
																+				{
															
 
																+					worker_set->nworkers = 1;
															
 
																+					STARPU_PTHREAD_CREATE_ON(
															
 
																+						workerarg->name,
															
 
																+						&workerarg->worker_thread,
															
 
																+						NULL,
															
 
																+						_starpu_cuda_worker,
															
 
																+//						workerarg,
															
 
																+						worker_set,
															
 
																+						_starpu_simgrid_get_host_by_worker(workerarg));
															
 
																+				}
															
 
																 #ifdef STARPU_USE_FXT
															
 
																 				STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
															
 
																 				while (!workerarg->worker_is_running)
															
 
																 					STARPU_PTHREAD_COND_WAIT(&workerarg->started_cond, &workerarg->mutex);
															
 
																 				STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
															
 
																 #endif
															
 
																+
															
 
																+				if(th_per_stream == 0)
															
 
																+				{
															
 
																+
															
 
																+					STARPU_PTHREAD_MUTEX_LOCK(&worker_set->mutex);
															
 
																+					while (!worker_set->set_is_initialized)
															
 
																+						STARPU_PTHREAD_COND_WAIT(&worker_set->ready_cond,
															
 
																+									 &worker_set->mutex);
															
 
																+					STARPU_PTHREAD_MUTEX_UNLOCK(&worker_set->mutex);
															
 
																+				}
															
 
																+				else
															
 
																+				{
															
 
																+					STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
															
 
																+					while (!workerarg->worker_is_initialized)
															
 
																+						STARPU_PTHREAD_COND_WAIT(&workerarg->ready_cond, &workerarg->mutex);
															
 
																+					STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
															
 
																+				}
															
 
																+				worker_set->started = 1;
															
 
																 				break;
															
 
																 #endif
															
 
																 #if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
															
@@ -809,9 +847,6 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 
																 		struct starpu_driver driver;
															
 
																 		unsigned devid = workerarg->devid;
															
 
																 		driver.type = workerarg->arch;
															
 
																-#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
															
 
																-		struct _starpu_worker_set *worker_set = workerarg->set;
															
 
																-#endif
															
 
																 		switch (workerarg->arch)
															
 
																 		{
															
@@ -827,19 +862,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 
																 				break;
															
 
																 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
															
 
																 			case STARPU_CUDA_WORKER:
															
 
																-#ifndef STARPU_SIMGRID
															
 
																-				driver.id.cuda_id = devid;
															
 
																-				if (!_starpu_may_launch_driver(&pconfig->conf, &driver))
															
 
																-					break;
															
 
																-#endif
															
 
																-				_STARPU_DEBUG("waiting for worker %u initialization\n", worker);
															
 
																-				STARPU_PTHREAD_MUTEX_LOCK(&worker_set->mutex);
															
 
																-				while (!worker_set->set_is_initialized)
															
 
																-					STARPU_PTHREAD_COND_WAIT(&worker_set->ready_cond,
															
 
																-								 &worker_set->mutex);
															
 
																-				STARPU_PTHREAD_MUTEX_UNLOCK(&worker_set->mutex);
															
 
																-				worker_set->started = 1;
															
 
																-
															
 
																+				/* Already waited above */
															
 
																 				break;
															
 
																 #endif
															
 
																 #if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
															
@@ -1248,7 +1271,7 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 
																 	if (!is_a_sink)
															
 
																 	{
															
 
																 		struct starpu_sched_policy *selected_policy = _starpu_select_sched_policy(&_starpu_config, _starpu_config.conf.sched_policy_name);
															
 
																-		_starpu_create_sched_ctx(selected_policy, NULL, -1, 1, "init", (_starpu_config.conf.global_sched_ctx_min_priority != -1), _starpu_config.conf.global_sched_ctx_min_priority, (_starpu_config.conf.global_sched_ctx_min_priority != -1), _starpu_config.conf.global_sched_ctx_max_priority, 1, _starpu_config.conf.sched_policy_init, NULL);
															
 
																+		_starpu_create_sched_ctx(selected_policy, NULL, -1, 1, "init", (_starpu_config.conf.global_sched_ctx_min_priority != -1), _starpu_config.conf.global_sched_ctx_min_priority, (_starpu_config.conf.global_sched_ctx_min_priority != -1), _starpu_config.conf.global_sched_ctx_max_priority, 1, _starpu_config.conf.sched_policy_init, NULL,  0, NULL, 0);
															
 
																 	}
															
 
																 	_starpu_initialize_registered_performance_models();
															
@@ -1310,7 +1333,7 @@ static void _starpu_terminate_workers(struct _starpu_machine_config *pconfig)
 
																 		/* in case StarPU termination code is called from a callback,
															
 
																  		 * we have to check if pthread_self() is the worker itself */
															
 
																-		if (set)
															
 
																+		if (set && set->nworkers > 1)
															
 
																 		{
															
 
																 			if (set->started)
															
 
																 			{
															
@@ -1842,6 +1865,47 @@ int starpu_worker_get_by_devid(enum starpu_worker_archtype type, int devid)
 
																 	return -1;
															
 
																 }
															
 
																+/* Collect into devids the distinct device ids of the workers of the given
																+ * type, storing at most num entries.
																+ *
																+ * Returns the number of device ids stored. Returns 0 when num is not
																+ * positive, when the temporary worker-id buffer cannot be allocated, or when
																+ * starpu_worker_get_ids_by_type() reports no worker (or an error). */
																+int starpu_worker_get_devids(enum starpu_worker_archtype type, int *devids, int num)
																+{
																+	/* Nothing can be stored: bail out before touching devids. */
																+	if (num <= 0)
																+		return 0;
																+
																+	unsigned nworkers = starpu_worker_get_count();
																+	int *workerids = (int *)malloc(nworkers*sizeof(int));
																+	if (workerids == NULL)
																+		return 0;
																+
																+	int ndevice_workers = starpu_worker_get_ids_by_type(type, workerids, nworkers);
																+	int ndevids = 0;
																+	int w;
																+
																+	/* A non-positive worker count simply skips the loop; the loop also
																+	 * stops as soon as devids is full. */
																+	for (w = 0; w < ndevice_workers && ndevids < num; w++)
																+	{
																+		int curr_devid = _starpu_config.workers[workerids[w]].devid;
																+		int d;
																+
																+		/* Several workers may share a device: report each device once. */
																+		for (d = 0; d < ndevids; d++)
																+			if (devids[d] == curr_devid)
																+				break;
																+
																+		if (d == ndevids)
																+			devids[ndevids++] = curr_devid;
																+	}
																+
																+	/* The worker-id buffer is only needed for the scan above. */
																+	free(workerids);
																+	return ndevids;
																+}
															
 
																+
															
 
																 void starpu_worker_get_name(int id, char *dst, size_t maxlen)
															
 
																 {
															
 
																 	char *name = _starpu_config.workers[id].name;
															
@@ -1862,6 +1926,19 @@ int starpu_bindid_get_workerids(int bindid, int **workerids)
 
																 	return _starpu_config.bindid_workers[bindid].nworkers;
															
 
																 }
															
 
																+/* Store in workerids the ids of every worker whose architecture is type and
																+ * whose device id is devid, and return how many were stored.
																+ * No capacity is checked here: the caller must supply an array large enough
																+ * for all matching workers. */
																+int starpu_worker_get_stream_workerids(int devid, int *workerids, enum starpu_worker_archtype type)
																+{
																+	unsigned total = starpu_worker_get_count();
																+	int nfound = 0;
																+	unsigned w;
																+
																+	for (w = 0; w < total; w++)
																+	{
																+		/* Skip workers of another architecture or on another device. */
																+		if (_starpu_config.workers[w].arch != type)
																+			continue;
																+		if (_starpu_config.workers[w].devid != devid)
																+			continue;
																+
																+		workerids[nfound] = w;
																+		nfound++;
																+	}
																+
																+	return nfound;
																+}
															
 
																+
															
 
																 void starpu_worker_get_sched_condition(int workerid, starpu_pthread_mutex_t **sched_mutex, starpu_pthread_cond_t **sched_cond)
															
 
																 {
															
 
																 	*sched_cond = &_starpu_config.workers[workerid].sched_cond;
															
@@ -2142,3 +2219,23 @@ char *starpu_worker_get_type_as_string(enum starpu_worker_archtype type)
 
																 	if (type == STARPU_ANY_WORKER) return "STARPU_ANY_WORKER";
															
 
																 	return "STARPU_unknown_WORKER";
															
 
																 }
															
 
																+
															
 
																+/* Record sched_ctx as the stream context of the worker identified by
																+ * workerid. */
																+void _starpu_worker_set_stream_ctx(int workerid, struct _starpu_sched_ctx *sched_ctx)
																+{
																+	_starpu_get_worker_struct(workerid)->stream_ctx = sched_ctx;
																+}
															
 
																+
															
 
																+/* Return the stream context currently recorded on the worker identified by
																+ * stream_workerid (whatever was last stored in its stream_ctx field). */
																+struct _starpu_sched_ctx* _starpu_worker_get_ctx_stream(int stream_workerid)
																+{
																+	return _starpu_get_worker_struct(stream_workerid)->stream_ctx;
																+}
															
 
																+
															
 
																+/* Return the id of the stream context recorded on the worker identified by
																+ * stream_workerid, or STARPU_NMAX_SCHED_CTXS when that worker has no stream
																+ * context. */
																+unsigned starpu_worker_get_sched_ctx_id_stream(int stream_workerid)
																+{
																+	struct _starpu_sched_ctx *ctx = _starpu_get_worker_struct(stream_workerid)->stream_ctx;
																+
																+	if (ctx == NULL)
																+		return STARPU_NMAX_SCHED_CTXS;
																+
																+	return ctx->id;
																+}
															
 
																+
															
 
																+
															
--- a/src/core/workers.h
+++ b/src/core/workers.h
@@ -133,6 +133,8 @@ LIST_TYPE(_starpu_worker,
 
																 	/* bool to indicate if the worker is slave in a ctx */
															
 
																 	unsigned is_slave_somewhere;
															
 
																+	struct _starpu_sched_ctx *stream_ctx;
															
 
																+
															
 
																 #ifdef __GLIBC__
															
 
																 	cpu_set_t cpu_set;
															
 
																 #endif /* __GLIBC__ */
															
@@ -576,4 +578,8 @@ static inline unsigned __starpu_worker_get_id_check(const char *f, int l)
 
																 }
															
 
																 #define _starpu_worker_get_id_check(f,l) __starpu_worker_get_id_check(f,l)
															
 
																+void _starpu_worker_set_stream_ctx(int workerid, struct _starpu_sched_ctx *sched_ctx);
															
 
																+
															
 
																+struct _starpu_sched_ctx* _starpu_worker_get_ctx_stream(int stream_workerid);
															
 
																+
															
 
																 #endif // __WORKERS_H__
															
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -70,6 +70,9 @@ static starpu_pthread_mutex_t task_mutex[STARPU_NMAXWORKERS][STARPU_MAX_PIPELINE
 
																 static starpu_pthread_cond_t task_cond[STARPU_NMAXWORKERS][STARPU_MAX_PIPELINE];
															
 
																 #endif /* STARPU_SIMGRID */
															
 
																+static unsigned cuda_memnode_deinit[STARPU_MAXCUDADEVS];
															
 
																+static starpu_pthread_mutex_t cuda_deinit_mutex[STARPU_MAXCUDADEVS];
															
 
																+
															
 
																 void
															
 
																 _starpu_cuda_discover_devices (struct _starpu_machine_config *config)
															
 
																 {
															
@@ -676,11 +679,16 @@ int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 
																 	STARPU_PTHREAD_COND_SIGNAL(&worker0->ready_cond);
															
 
																 	STARPU_PTHREAD_MUTEX_UNLOCK(&worker0->mutex);
															
 
																-	/* tell the main thread that this one is ready */
															
 
																-	STARPU_PTHREAD_MUTEX_LOCK(&worker_set->mutex);
															
 
																-	worker_set->set_is_initialized = 1;
															
 
																-	STARPU_PTHREAD_COND_SIGNAL(&worker_set->ready_cond);
															
 
																-	STARPU_PTHREAD_MUTEX_UNLOCK(&worker_set->mutex);
															
 
																+	unsigned th_per_stream = starpu_get_env_number_default("STARPU_ONE_THREAD_PER_STREAM", 1);
															
 
																+
															
 
																+	if(th_per_stream == 0)
															
 
																+	{
															
 
																+		/* tell the main thread that this one is ready */
															
 
																+		STARPU_PTHREAD_MUTEX_LOCK(&worker_set->mutex);
															
 
																+		worker_set->set_is_initialized = 1;
															
 
																+		STARPU_PTHREAD_COND_SIGNAL(&worker_set->ready_cond);
															
 
																+		STARPU_PTHREAD_MUTEX_UNLOCK(&worker_set->mutex);
															
 
																+	}
															
 
																 	return 0;
															
 
																 }
															
@@ -852,18 +860,27 @@ int _starpu_cuda_driver_deinit(struct _starpu_worker_set *worker_set)
 
																 			continue;
															
 
																 		lastdevid = devid;
															
 
																-		_starpu_handle_all_pending_node_data_requests(memnode);
															
 
																-
															
 
																-		/* In case there remains some memory that was automatically
															
 
																-		 * allocated by StarPU, we release it now. Note that data
															
 
																-		 * coherency is not maintained anymore at that point ! */
															
 
																-		_starpu_free_all_automatically_allocated_buffers(memnode);
															
 
																+		STARPU_PTHREAD_MUTEX_LOCK(&cuda_deinit_mutex[memnode]);
															
 
																+		if(!cuda_memnode_deinit[devid])
															
 
																+                {
															
 
																-		_starpu_malloc_shutdown(memnode);
															
 
																+			_starpu_handle_all_pending_node_data_requests(memnode);
															
 
																+			
															
 
																+			/* In case there remains some memory that was automatically
															
 
																+			 * allocated by StarPU, we release it now. Note that data
															
 
																+			 * coherency is not maintained anymore at that point ! */
															
 
																+			_starpu_free_all_automatically_allocated_buffers(memnode);
															
 
																+			
															
 
																+			_starpu_malloc_shutdown(memnode);
															
 
																+			cuda_memnode_deinit[devid] = 1;
															
 
																 #ifndef STARPU_SIMGRID
															
 
																-		deinit_device_context(devid);
															
 
																+			deinit_device_context(devid);
															
 
																 #endif /* !STARPU_SIMGRID */
															
 
																+                }
															
 
																+
															
 
																+                STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_deinit_mutex[memnode]);
															
 
																+
															
 
																 	}
															
 
																 	for (i = 0; i < worker_set->nworkers; i++)
															
--- a/src/sched_policies/deque_modeling_policy_data_aware.c
+++ b/src/sched_policies/deque_modeling_policy_data_aware.c
@@ -196,6 +196,26 @@ static struct starpu_task *dmda_pop_ready_task(unsigned sched_ctx_id)
 
																 	task = _starpu_fifo_pop_first_ready_task(fifo, node, dt->num_priorities);
															
 
																 	if (task)
															
 
																 	{
															
 
																+		/* We now start the transfer, get rid of it in the completion
															
 
																+		 * prediction */
															
 
																+		double transfer_model = task->predicted_transfer;
															
 
																+		if(!isnan(transfer_model)) 
															
 
																+		{
															
 
																+			fifo->exp_len -= transfer_model;
															
 
																+			fifo->exp_start = starpu_timing_now() + transfer_model;
															
 
																+			fifo->exp_end = fifo->exp_start + fifo->exp_len;
															
 
																+			if(dt->num_priorities != -1)
															
 
																+			{
															
 
																+				int i;
															
 
																+				int task_prio = _normalize_prio(task->priority, dt->num_priorities, task->sched_ctx);
															
 
																+				for(i = 0; i <= task_prio; i++)
															
 
																+					fifo->exp_len_per_priority[i] -= transfer_model;
															
 
																+			}
															
 
																+
															
 
																+			fifo->pipeline_len += task->predicted + transfer_model;
															
 
																+			fifo->pipelined_tasks++;
															
 
																+		}
															
 
																+
															
 
																 		starpu_sched_ctx_list_task_counters_decrement(sched_ctx_id, workerid);
															
 
																 #ifdef STARPU_VERBOSE
															
@@ -230,8 +250,30 @@ static struct starpu_task *dmda_pop_task(unsigned sched_ctx_id)
 
																 	task = _starpu_fifo_pop_local_task(fifo);
															
 
																 	if (task)
															
 
																 	{
															
 
																-		starpu_sched_ctx_list_task_counters_decrement(sched_ctx_id, workerid);
															
 
																+		double transfer_model = task->predicted_transfer;
															
 
																+		/* We now start the transfer, get rid of it in the completion
															
 
																+		 * prediction */
															
 
																+
															
 
																+		if(!isnan(transfer_model)) 
															
 
																+		{
															
 
																+			double model = task->predicted;
															
 
																+			fifo->exp_len -= transfer_model;
															
 
																+			fifo->exp_start = starpu_timing_now() + transfer_model+model;
															
 
																+			fifo->exp_end = fifo->exp_start + fifo->exp_len;
															
 
																+			if(dt->num_priorities != -1)
															
 
																+			{
															
 
																+				int i;
															
 
																+				int task_prio = _normalize_prio(task->priority, dt->num_priorities, task->sched_ctx);
															
 
																+				for(i = 0; i <= task_prio; i++)
															
 
																+					fifo->exp_len_per_priority[i] -= transfer_model;
															
 
																+			}
															
 
																+			fifo->pipeline_len += task->predicted + transfer_model;
															
 
																+			fifo->pipelined_tasks++;
															
 
																+
															
 
																+		}
															
 
																+		starpu_sched_ctx_list_task_counters_decrement(sched_ctx_id, workerid);
															
 
																+		  
															
 
																 #ifdef STARPU_VERBOSE
															
 
																 		if (task->cl)
															
 
																 		{
															
@@ -268,6 +310,28 @@ static struct starpu_task *dmda_pop_every_task(unsigned sched_ctx_id)
 
																 	starpu_sched_ctx_list_task_counters_reset(sched_ctx_id, workerid);
															
 
																+	while (new_list)
															
 
																+	{
															
 
																+		double transfer_model = new_list->predicted_transfer;
															
 
																+		/* We now start the transfer, get rid of it in the completion
															
 
																+		 * prediction */
															
 
																+		if(!isnan(transfer_model)) 
															
 
																+		{
															
 
																+			fifo->exp_len -= transfer_model;
															
 
																+			fifo->exp_start = starpu_timing_now() + transfer_model;
															
 
																+			fifo->exp_end = fifo->exp_start + fifo->exp_len;
															
 
																+			if(dt->num_priorities != -1)
															
 
																+			{
															
 
																+				int i;
															
 
																+				for(i = 0; i < new_list->priority; i++)
															
 
																+					fifo->exp_len_per_priority[i] -= transfer_model;
															
 
																+			}
															
 
																+		
															
 
																+		}
															
 
																+
															
 
																+		new_list = new_list->next;
															
 
																+	}
															
 
																+
															
 
																 	return new_list;
															
 
																 }
															
@@ -282,7 +346,7 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
																         if(child_sched_ctx != STARPU_NMAX_SCHED_CTXS)
															
 
																         {
															
 
																-                starpu_sched_ctx_move_task_to_ctx(task, child_sched_ctx, 0);
															
 
																+                starpu_sched_ctx_move_task_to_ctx(task, child_sched_ctx, 0, 1);
															
 
																 		starpu_sched_ctx_revert_task_counters(sched_ctx_id, task->flops);
															
 
																                 return 0;
															
 
																         }
															
@@ -362,6 +426,13 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
																 	}
															
 
																 	STARPU_AYU_ADDTOTASKQUEUE(_starpu_get_job_associated_to_task(task)->job_id, best_workerid);
															
 
																+	unsigned stream_ctx_id = starpu_worker_get_sched_ctx_id_stream(best_workerid);
															
 
																+	if(stream_ctx_id != STARPU_NMAX_SCHED_CTXS)
															
 
																+	{
															
 
																+		starpu_sched_ctx_move_task_to_ctx(task, stream_ctx_id, 0, 0);
															
 
																+		starpu_sched_ctx_revert_task_counters(sched_ctx_id, task->flops);
															
 
																+	}
															
 
																+
															
 
																 	int ret = 0;
															
 
																 	if (prio)
															
 
																 	{
															
@@ -584,6 +655,8 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 
																 		/* Sometimes workers didn't take the tasks as early as we expected */
															
 
																 		double exp_start = isnan(fifo->exp_start) ? starpu_timing_now() : STARPU_MAX(fifo->exp_start, starpu_timing_now());
															
 
																+		exp_start += fifo->pipeline_len;
															
 
																+
															
 
																 		if (!starpu_worker_can_execute_task_impl(worker, task, &impl_mask))
															
 
																 			continue;
															
@@ -852,8 +925,6 @@ static double _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned
 
																 	}
															
 
																 	else
															
 
																 	{
															
 
																-//		double max_len = (max_exp_end - starpu_timing_now());
															
 
																-		/* printf("%d: dmda max_exp_end %lf best_exp_end %lf max_len %lf \n", sched_ctx_id, max_exp_end/1000000.0, best_exp_end/1000000.0, max_len/1000000.0);	 */
															
 
																 		return exp_end[best_in_ctx][selected_impl] ;
															
 
																 	}
															
 
																 }
															
@@ -1022,9 +1093,8 @@ static void deinitialize_dmda_policy(unsigned sched_ctx_id)
 
																 /* dmda_pre_exec_hook is called right after the data transfer is done and right
															
 
																  * before the computation to begin, it is useful to update more precisely the
															
 
																  * value of the expected start, end, length, etc... */
															
 
																-static void dmda_pre_exec_hook(struct starpu_task *task)
															
 
																+static void dmda_pre_exec_hook(struct starpu_task *task, unsigned sched_ctx_id)
															
 
																 {
															
 
																-	unsigned sched_ctx_id = starpu_sched_ctx_get_ctx_for_task(task);
															
 
																 	unsigned workerid = starpu_worker_get_id_check();
															
 
																 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
															
 
																 	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
															
@@ -1039,30 +1109,27 @@ static void dmda_pre_exec_hook(struct starpu_task *task)
 
																 	 * of work. */
															
 
																 	STARPU_PTHREAD_MUTEX_LOCK_SCHED(sched_mutex);
															
 
																-	/* Take the opportunity to update start time */
															
 
																-	fifo->exp_start = STARPU_MAX(starpu_timing_now(), fifo->exp_start);
															
 
																-
															
 
																-	if(!isnan(transfer_model))
															
 
																+	if(fifo->pipelined_tasks > 0)
															
 
																 	{
															
 
																-		/* The transfer is over, get rid of it in the completion
															
 
																-		 * prediction */
															
 
																-		fifo->exp_len -= transfer_model;
															
 
																-		if(dt->num_priorities != -1)
															
 
																-		{
															
 
																-			int i;
															
 
																-			int task_prio = _normalize_prio(task->priority, dt->num_priorities, task->sched_ctx);
															
 
																-			for(i = 0; i <= task_prio; i++)
															
 
																-				fifo->exp_len_per_priority[i] -= transfer_model;
															
 
																-		}
															
 
																-
															
 
																+		/* decrement here because we add the predicted exec time of the task to exp_start
															
 
																+		   we don't want to add it twice */
															
 
																+		if (!isnan(task->predicted))
															
 
																+			fifo->pipeline_len -= task->predicted;
															
 
																+		if(!isnan(task->predicted_transfer))
															
 
																+			fifo->pipeline_len -= task->predicted_transfer;
															
 
																+		fifo->pipelined_tasks--;
															
 
																 	}
															
 
																+	/* Take the opportunity to update start time */
															
 
																+	fifo->exp_start = STARPU_MAX(starpu_timing_now(), fifo->exp_start);
															
 
																+
															
 
																 	if(!isnan(model))
															
 
																 	{
															
 
																 		/* We now start the computation, get rid of it in the completion
															
 
																 		 * prediction */
															
 
																-		fifo->exp_len -= model;
															
 
																-		fifo->exp_start += model;
															
 
																+		fifo->exp_len-= model;
															
 
																+                fifo->exp_start = starpu_timing_now() + model;
															
 
																+                fifo->exp_end= fifo->exp_start + fifo->exp_len;
															
 
																 		if(dt->num_priorities != -1)
															
 
																 		{
															
 
																 			int i;
															
@@ -1072,7 +1139,6 @@ static void dmda_pre_exec_hook(struct starpu_task *task)
 
																 		}
															
 
																 	}
															
 
																-	fifo->exp_end = fifo->exp_start + fifo->exp_len;
															
 
																 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(sched_mutex);
															
 
																 }
															
@@ -1155,9 +1221,8 @@ static void dmda_push_task_notify(struct starpu_task *task, int workerid, int pe
 
																 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(sched_mutex);
															
 
																 }
															
 
																-static void dmda_post_exec_hook(struct starpu_task * task)
															
 
																+static void dmda_post_exec_hook(struct starpu_task * task, unsigned sched_ctx_id)
															
 
																 {
															
 
																-	unsigned sched_ctx_id = starpu_sched_ctx_get_ctx_for_task(task);
															
 
																 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
															
 
																 	unsigned workerid = starpu_worker_get_id_check();
															
 
																 	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
															
--- a/src/sched_policies/eager_central_policy.c
+++ b/src/sched_policies/eager_central_policy.c
@@ -182,7 +182,7 @@ static struct starpu_task *pop_task_eager_policy(unsigned sched_ctx_id)
 
																 		unsigned child_sched_ctx = starpu_sched_ctx_worker_is_master_for_child_ctx(workerid, sched_ctx_id);
															
 
																 		if(child_sched_ctx != STARPU_NMAX_SCHED_CTXS)
															
 
																 		{
															
 
																-			starpu_sched_ctx_move_task_to_ctx(chosen_task, child_sched_ctx, 1);
															
 
																+			starpu_sched_ctx_move_task_to_ctx(chosen_task, child_sched_ctx, 1, 1);
															
 
																 			starpu_sched_ctx_revert_task_counters(sched_ctx_id, chosen_task->flops);
															
 
																 			return NULL;
															
 
																 		}
															
--- a/src/sched_policies/eager_central_priority_policy.c
+++ b/src/sched_policies/eager_central_priority_policy.c
@@ -289,7 +289,7 @@ static struct starpu_task *_starpu_priority_pop_task(unsigned sched_ctx_id)
 
																                 unsigned child_sched_ctx = starpu_sched_ctx_worker_is_master_for_child_ctx(workerid, sched_ctx_id);
															
 
																 		if(child_sched_ctx != STARPU_NMAX_SCHED_CTXS)
															
 
																 		{
															
 
																-			starpu_sched_ctx_move_task_to_ctx(chosen_task, child_sched_ctx, 1);
															
 
																+			starpu_sched_ctx_move_task_to_ctx(chosen_task, child_sched_ctx, 1, 1);
															
 
																 			starpu_sched_ctx_revert_task_counters(sched_ctx_id, chosen_task->flops);
															
 
																 			return NULL;
															
 
																 		}
															
--- a/src/sched_policies/fifo_queues.c
+++ b/src/sched_policies/fifo_queues.c
@@ -56,6 +56,8 @@ struct _starpu_fifo_taskq *_starpu_create_fifo(void)
 
																 	fifo->exp_len = 0.0;
															
 
																 	fifo->exp_end = fifo->exp_start;
															
 
																 	fifo->exp_len_per_priority = NULL;
															
 
																+	fifo->pipeline_len = 0.0;
															
 
																+	fifo->pipelined_tasks = 0;
															
 
																 	return fifo;
															
 
																 }
															
--- a/src/sched_policies/fifo_queues.h
+++ b/src/sched_policies/fifo_queues.h
@@ -42,6 +42,8 @@ struct _starpu_fifo_taskq
 
																 	double exp_end; /* Expected end date of last task in the queue */
															
 
																 	double exp_len; /* Expected duration of the set of tasks in the queue */
															
 
																 	double *exp_len_per_priority; /* Expected duration of the set of tasks in the queue corresponding to each priority */
															
 
																+	double pipeline_len; /* the expected length of the pipelined tasks */
															
 
																+	int pipelined_tasks; /* the expected number of pipelined tasks */
															
 
																 };
															
 
																 struct _starpu_fifo_taskq*_starpu_create_fifo(void) STARPU_ATTRIBUTE_MALLOC;
															
--- a/src/sched_policies/heteroprio.c
+++ b/src/sched_policies/heteroprio.c
@@ -609,7 +609,7 @@ done:		;
 
																 		unsigned child_sched_ctx = starpu_sched_ctx_worker_is_master_for_child_ctx(workerid, sched_ctx_id);
															
 
																 		if(child_sched_ctx != STARPU_NMAX_SCHED_CTXS)
															
 
																 		{
															
 
																-			starpu_sched_ctx_move_task_to_ctx(task, child_sched_ctx, 1);
															
 
																+			starpu_sched_ctx_move_task_to_ctx(task, child_sched_ctx, 1, 1);
															
 
																 			starpu_sched_ctx_revert_task_counters(sched_ctx_id, task->flops);
															
 
																 			return NULL;
															
 
																 		}