13 年之前 · 2f4491e804
--- a/doc/chapters/advanced-api.texi
+++ b/doc/chapters/advanced-api.texi
@@ -167,7 +167,7 @@ Return the expected power consumption of the entire task bundle in J.
 
																 @section Task Lists
															
 
																 @deftp {Data Type} {struct starpu_task_list}
															
 
																-todo
															
 
																+Stores a doubly-linked list of tasks
															
 
																 @end deftp
															
 
																 @deftypefun void starpu_task_list_init ({struct starpu_task_list *}@var{list})
															
@@ -182,11 +182,11 @@ Push a task at the front of a list
 
																 Push a task at the back of a list
															
 
																 @end deftypefun
															
 
																-@deftypefun struct starpu_task *starpu_task_list_front ({struct starpu_task_list *}@var{list})
															
 
																+@deftypefun {struct starpu_task *}starpu_task_list_front ({struct starpu_task_list *}@var{list})
															
 
																 Get the front of the list (without removing it)
															
 
																 @end deftypefun
															
 
																-@deftypefun struct starpu_task *starpu_task_list_back ({struct starpu_task_list *}@var{list})
															
 
																+@deftypefun {struct starpu_task *}starpu_task_list_back ({struct starpu_task_list *}@var{list})
															
 
																 Get the back of the list (without removing it)
															
 
																 @end deftypefun
															
@@ -198,14 +198,26 @@ Test if a list is empty
 
																 Remove an element from the list
															
 
																 @end deftypefun
															
 
																-@deftypefun struct starpu_task *starpu_task_list_pop_front ({struct starpu_task_list *}@var{list})
															
 
																+@deftypefun {struct starpu_task *}starpu_task_list_pop_front ({struct starpu_task_list *}@var{list})
															
 
																 Remove the element at the front of the list
															
 
																 @end deftypefun
															
 
																-@deftypefun struct starpu_task *starpu_task_list_pop_back ({struct starpu_task_list *}@var{list})
															
 
																+@deftypefun {struct starpu_task *}starpu_task_list_pop_back ({struct starpu_task_list *}@var{list})
															
 
																 Remove the element at the back of the list
															
 
																 @end deftypefun
															
 
																+@deftypefun {struct starpu_task *}starpu_task_list_begin ({struct starpu_task_list *}@var{list})
															
 
																+Get the first task of the list.
															
 
																+@end deftypefun
															
 
																+
															
 
																+@deftypefun {struct starpu_task *}starpu_task_list_end ({struct starpu_task_list *}@var{list})
															
 
																+Get the end of the list.
															
 
																+@end deftypefun
															
 
																+
															
 
																+@deftypefun {struct starpu_task *}starpu_task_list_next ({struct starpu_task *}@var{task})
															
 
																+Get the next task of the list. This is not erase-safe.
															
 
																+@end deftypefun
															
 
																+
															
 
																 @node Defining a new scheduling policy
															
 
																 @section Defining a new scheduling policy
															
@@ -234,14 +246,13 @@ Initialize the scheduling policy.
 
																 Cleanup the scheduling policy.
															
 
																 @item @code{push_task}
															
 
																 Insert a task into the scheduler.
															
 
																-@item @code{push_prio_task}
															
 
																-Insert a priority task into the scheduler.
															
 
																-@item @code{push_prio_notify}
															
 
																-Notify the scheduler that a task was pushed on the worker. This method is
															
 
																-called when a task that was explicitely assigned to a worker is scheduled. This
															
 
																-method therefore permits to keep the state of of the scheduler coherent even
															
 
																-when StarPU bypasses the scheduling strategy.
															
 
																-@item @code{pop_task}
															
 
																+@item @code{push_task_notify}
															
 
																+Notify the scheduler that a task was pushed on a given worker. This method is
															
 
																+called when a task that was explicitly assigned to a worker becomes ready and
															
 
																+is about to be executed by the worker. This method therefore permits to keep
															
 
																+the state of the scheduler coherent even when StarPU bypasses the scheduling
															
 
																+strategy.
															
 
																+@item @code{pop_task} (optional)
															
 
																 Get a task from the scheduler. The mutex associated to the worker is already
															
 
																 taken when this method is called. If this method is defined as @code{NULL}, the
															
 
																 worker will only execute tasks from its local queue. In this case, the
															
@@ -250,13 +261,14 @@ assign tasks to the different workers.
 
																 @item @code{pop_every_task}
															
 
																 Remove all available tasks from the scheduler (tasks are chained by the means
															
 
																 of the prev and next fields of the starpu_task structure). The mutex associated
															
 
																-to the worker is already taken when this method is called.
															
 
																+to the worker is already taken when this method is called. This is currently
															
 
																+only used by the Gordon driver.
															
 
																 @item @code{post_exec_hook} (optional)
															
 
																 This method is called every time a task has been executed.
															
 
																 @item @code{policy_name}
															
 
																 Name of the policy (optional).
															
 
																 @item @code{policy_description}
															
 
																-Description of the policy (optionnal).
															
 
																+Description of the policy (optional).
															
 
																 @end table
															
 
																 @end deftp
															
@@ -307,12 +319,12 @@ where the worker will pop tasks first. Setting @var{back} to 0 therefore ensures
 
																 a FIFO ordering.
															
 
																 @end deftypefun
															
 
																-@deftypefun int starpu_worker_may_execute_task (unsigned @var{workerid}, {struct starpu_task *}@var{task}, unsigned {nimpl})
															
 
																-Check if the worker specified by workerid can execute the codelet.
															
 
																+@deftypefun int starpu_worker_can_execute_task (unsigned @var{workerid}, {struct starpu_task *}@var{task}, unsigned @var{nimpl})
															
 
																+Check if the worker specified by workerid can execute the codelet. Schedulers need to call it before assigning a task to a worker, otherwise the task may fail to execute.
															
 
																 @end deftypefun
															
 
																 @deftypefun double starpu_timing_now (void)
															
 
																-Return the current date
															
 
																+Return the current date in µs
															
 
																 @end deftypefun
															
 
																 @deftypefun double starpu_task_expected_length ({struct starpu_task *}@var{task}, {enum starpu_perf_archtype} @var{arch}, unsigned @var{nimpl})
															
--- a/doc/chapters/advanced-examples.texi
+++ b/doc/chapters/advanced-examples.texi
@@ -11,6 +11,7 @@
 
																 @menu
															
 
																 * Using multiple implementations of a codelet::
															
 
																+* Enabling implementation according to capabilities::
															
 
																 * Task and Worker Profiling::   
															
 
																 * Partitioning Data::           Partitioning Data
															
 
																 * Performance model example::   
															
@@ -68,6 +69,50 @@ struct starpu_codelet cl = @{
 
																 Scheduler which are multi-implementation aware (only @code{dmda}, @code{heft}
															
 
																 and @code{pheft} for now) will use the performance models of all the
															
 
																 implementations it was given, and pick the one that seems to be the fastest.
															
 
																+
															
 
																+@node Enabling implementation according to capabilities
															
 
																+@section Enabling implementation according to capabilities
															
 
																+
															
 
																+Some implementations may not run on some devices. For instance, some GPU
															
 
																+devices do not support double floating point precision, and thus the kernel
															
 
																+execution would just fail; or the GPU may not have enough shared memory for
															
 
																+the implementation being used. The @code{can_execute} field of the @code{struct
															
 
																+starpu_codelet} structure permits to express this. For instance:
															
 
																+
															
 
																+@cartouche
															
 
																+@smallexample
															
 
																+static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
															
 
																+@{
															
 
																+  const struct cudaDeviceProp *props;
															
 
																+  if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
															
 
																+    return 1;
															
 
																+  /* Cuda device */
															
 
																+  props = starpu_cuda_get_device_properties(workerid);
															
 
																+  if (props->major >= 2 || props->minor >= 3)
															
 
																+    /* At least compute capability 1.3, supports doubles */
															
 
																+    return 1;
															
 
																+  /* Old card, does not support doubles */
															
 
																+  return 0;
															
 
																+@}
															
 
																+
															
 
																+struct starpu_codelet cl = @{
															
 
																+    .where = STARPU_CPU|STARPU_CUDA,
															
 
																+    .can_execute = can_execute,
															
 
																+    .cpu_func = cpu_func,
															
 
																+    .cuda_func = gpu_func,
															
 
																+    .nbuffers = 1
															
 
																+@};
															
 
																+@end smallexample
															
 
																+@end cartouche
															
 
																+
															
 
																+This can be essential e.g. when running on a machine which mixes various models
															
 
																+of GPUs, to take benefit from the new models without crashing on old models.
															
 
																+
															
 
																+Note: the @code{can_execute} function is called by the scheduler each time it
															
 
																+tries to match a task with a worker, and should thus be very fast. The
															
 
																+@code{starpu_cuda_get_device_properties} function provides quick access to CUDA
															
 
																+properties of CUDA devices to achieve such efficiency.
															
 
																+
															
 
																 @node Task and Worker Profiling
															
 
																 @section Task and Worker Profiling
															
--- a/examples/reductions/dot_product.c
+++ b/examples/reductions/dot_product.c
@@ -20,6 +20,7 @@
 
																 #ifdef STARPU_USE_CUDA
															
 
																 #include <cuda.h>
															
 
																 #include <cublas.h>
															
 
																+#include <starpu_cuda.h>
															
 
																 #endif
															
 
																 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
															
@@ -37,6 +38,20 @@ static unsigned entries_per_block = 1024;
 
																 static DOT_TYPE dot = 0.0f;
															
 
																 static starpu_data_handle_t dot_handle;
															
 
																+static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
															
 
																+{
															
 
																+	const struct cudaDeviceProp *props;
															
 
																+	if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
															
 
																+		return 1;
															
 
																+	/* Cuda device */
															
 
																+	props = starpu_cuda_get_device_properties(workerid);
															
 
																+	if (props->major >= 2 || props->minor >= 3)
															
 
																+		/* At least compute capability 1.3, supports doubles */
															
 
																+		return 1;
															
 
																+	/* Old card, does not support doubles */
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																 /*
															
 
																  *	Codelet to create a neutral element
															
 
																  */
															
@@ -58,6 +73,7 @@ void init_cuda_func(void *descr[], void *cl_arg)
 
																 static struct starpu_codelet init_codelet = {
															
 
																 	.where = STARPU_CPU|STARPU_CUDA,
															
 
																+	.can_execute = can_execute,
															
 
																 	.cpu_func = init_cpu_func,
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 	.cuda_func = init_cuda_func,
															
@@ -83,6 +99,7 @@ extern void redux_cuda_func(void *descr[], void *_args);
 
																 static struct starpu_codelet redux_codelet = {
															
 
																 	.where = STARPU_CPU|STARPU_CUDA,
															
 
																+	.can_execute = can_execute,
															
 
																 	.cpu_func = redux_cpu_func,
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 	.cuda_func = redux_cuda_func,
															
@@ -144,6 +161,7 @@ void dot_cuda_func(void *descr[], void *cl_arg)
 
																 static struct starpu_codelet dot_codelet = {
															
 
																 	.where = STARPU_CPU|STARPU_CUDA,
															
 
																+	.can_execute = can_execute,
															
 
																 	.cpu_func = dot_cpu_func,
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 	.cuda_func = dot_cuda_func,
															
--- a/include/starpu_scheduler.h
+++ b/include/starpu_scheduler.h
@@ -119,7 +119,7 @@ void starpu_worker_set_sched_condition(int workerid, pthread_cond_t *sched_cond,
 
																 #endif
															
 
																 /* Check if the worker specified by workerid can execute the codelet. */
															
 
																-int starpu_worker_may_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl);
															
 
																+int starpu_worker_can_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl);
															
 
																 /* The scheduling policy may put tasks directly into a worker's local queue so
															
 
																  * that it is not always necessary to create its own queue when the local queue
															
@@ -154,8 +154,8 @@ void starpu_sched_set_max_priority(int max_prio);
 
																 int starpu_combined_worker_assign_workerid(int nworkers, int workerid_array[]);
															
 
																 /* Get the description of a combined worker */
															
 
																 int starpu_combined_worker_get_description(int workerid, int *worker_size, int **combined_workerid);
															
 
																-/* Variant of starpu_worker_may_execute_task compatible with combined workers */
															
 
																-int starpu_combined_worker_may_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl);
															
 
																+/* Variant of starpu_worker_can_execute_task compatible with combined workers */
															
 
																+int starpu_combined_worker_can_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl);
															
 
																 /*
															
 
																  *	Data prefetching
															
--- a/include/starpu_task.h
+++ b/include/starpu_task.h
@@ -74,9 +74,11 @@ typedef uint8_t starpu_gordon_func_t; /* Cell SPU */
 
																  * A codelet describes the various function 
															
 
																  * that may be called from a worker
															
 
																  */
															
 
																+struct starpu_task;
															
 
																 struct starpu_codelet {
															
 
																 	/* where can it be performed ? */
															
 
																 	uint32_t where;
															
 
																+	int (*can_execute)(unsigned workerid, struct starpu_task *task, unsigned nimpl);
															
 
																 	unsigned type;
															
 
																 	int max_parallelism;
															
--- a/src/core/task.c
+++ b/src/core/task.c
@@ -258,7 +258,7 @@ int starpu_task_submit(struct starpu_task *task)
 
																 		/* In case we require that a task should be explicitely
															
 
																 		 * executed on a specific worker, we make sure that the worker
															
 
																 		 * is able to execute this task.  */
															
 
																-		if (task->execute_on_a_specific_worker && !starpu_combined_worker_may_execute_task(task->workerid, task, 0)) {
															
 
																+		if (task->execute_on_a_specific_worker && !starpu_combined_worker_can_execute_task(task->workerid, task, 0)) {
															
 
																                         _STARPU_LOG_OUT_TAG("ENODEV");
															
 
																 			return -ENODEV;
															
 
																                 }
															
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -54,22 +54,22 @@ uint32_t _starpu_worker_exists(uint32_t task_mask)
 
																 	return (task_mask & config.worker_mask);
															
 
																 } 
															
 
																-uint32_t _starpu_may_submit_cuda_task(void)
															
 
																+uint32_t _starpu_can_submit_cuda_task(void)
															
 
																 {
															
 
																 	return (STARPU_CUDA & config.worker_mask);
															
 
																 }
															
 
																-uint32_t _starpu_may_submit_cpu_task(void)
															
 
																+uint32_t _starpu_can_submit_cpu_task(void)
															
 
																 {
															
 
																 	return (STARPU_CPU & config.worker_mask);
															
 
																 }
															
 
																-uint32_t _starpu_may_submit_opencl_task(void)
															
 
																+uint32_t _starpu_can_submit_opencl_task(void)
															
 
																 {
															
 
																 	return (STARPU_OPENCL & config.worker_mask);
															
 
																 }
															
 
																-static int _starpu_may_use_nth_implementation(enum starpu_archtype arch, struct starpu_codelet *cl, unsigned nimpl)
															
 
																+static int _starpu_can_use_nth_implementation(enum starpu_archtype arch, struct starpu_codelet *cl, unsigned nimpl)
															
 
																 {
															
 
																 	switch(arch) {
															
 
																 	case STARPU_CPU_WORKER:
															
@@ -94,18 +94,17 @@ static int _starpu_may_use_nth_implementation(enum starpu_archtype arch, struct
 
																 }
															
 
																-int starpu_worker_may_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl)
															
 
																+int starpu_worker_can_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl)
															
 
																 {
															
 
																 	/* TODO: check that the task operand sizes will fit on that device */
															
 
																-	/* TODO: call application-provided function for various cases like
															
 
																-	 * double support, shared memory size limit, etc. */
															
 
																-	return ((task->cl->where & config.workers[workerid].worker_mask) &&
															
 
																-		_starpu_may_use_nth_implementation(config.workers[workerid].arch, task->cl, nimpl));
															
 
																+	return (task->cl->where & config.workers[workerid].worker_mask) &&
															
 
																+		_starpu_can_use_nth_implementation(config.workers[workerid].arch, task->cl, nimpl) &&
															
 
																+		(!task->cl->can_execute || task->cl->can_execute(workerid, task, nimpl));
															
 
																 }
															
 
																-int starpu_combined_worker_may_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl)
															
 
																+int starpu_combined_worker_can_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl)
															
 
																 {
															
 
																 	/* TODO: check that the task operand sizes will fit on that device */
															
 
																 	/* TODO: call application-provided function for various cases like
															
@@ -118,7 +117,7 @@ int starpu_combined_worker_may_execute_task(unsigned workerid, struct starpu_tas
 
																 	if (workerid < nworkers)
															
 
																 	{
															
 
																 		return !!((task->cl->where & config.workers[workerid].worker_mask) &&
															
 
																-				_starpu_may_use_nth_implementation(config.workers[workerid].arch, task->cl, nimpl));
															
 
																+				_starpu_can_use_nth_implementation(config.workers[workerid].arch, task->cl, nimpl));
															
 
																 	}
															
 
																 	else {
															
 
																 		if ((cl->type == STARPU_SPMD) || (cl->type == STARPU_FORKJOIN))
															
@@ -128,7 +127,7 @@ int starpu_combined_worker_may_execute_task(unsigned workerid, struct starpu_tas
 
																 			/* Is the worker larger than requested ? */
															
 
																 			int worker_size = (int)config.combined_workers[workerid - nworkers].worker_size;
															
 
																 			return !!((worker_size <= task->cl->max_parallelism) &&
															
 
																-				_starpu_may_use_nth_implementation(config.workers[workerid].arch, task->cl, nimpl));
															
 
																+				_starpu_can_use_nth_implementation(config.workers[workerid].arch, task->cl, nimpl));
															
 
																 		}
															
 
																 		else
															
 
																 		{
															
--- a/src/core/workers.h
+++ b/src/core/workers.h
@@ -162,13 +162,13 @@ unsigned _starpu_machine_is_running(void);
 
																 uint32_t _starpu_worker_exists(uint32_t task_mask);
															
 
																 /* Is there a worker that can execute CUDA code ? */
															
 
																-uint32_t _starpu_may_submit_cuda_task(void);
															
 
																+uint32_t _starpu_can_submit_cuda_task(void);
															
 
																 /* Is there a worker that can execute CPU code ? */
															
 
																-uint32_t _starpu_may_submit_cpu_task(void);
															
 
																+uint32_t _starpu_can_submit_cpu_task(void);
															
 
																 /* Is there a worker that can execute OpenCL code ? */
															
 
																-uint32_t _starpu_may_submit_opencl_task(void);
															
 
																+uint32_t _starpu_can_submit_opencl_task(void);
															
 
																 /* Check whether there is anything that the worker should do instead of
															
 
																  * sleeping (waiting on something to happen). */
															
--- a/src/sched_policies/deque_modeling_policy_data_aware.c
+++ b/src/sched_policies/deque_modeling_policy_data_aware.c
@@ -297,6 +297,7 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
																 			&sched_mutex[best_workerid], &sched_cond[best_workerid], task);
															
 
																 }
															
 
																+/* TODO: factorize with dmda!! */
															
 
																 static int _dm_push_task(struct starpu_task *task, unsigned prio)
															
 
																 {
															
 
																 	/* find the queue */
															
@@ -316,10 +317,9 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio)
 
																 	unsigned best_impl = 0;
															
 
																 	unsigned nimpl;
															
 
																-	for (worker = 0; worker < nworkers; worker++)
															
 
																-	{
															
 
																-		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
															
 
																-		{
															
 
																+
															
 
																+	for (worker = 0; worker < nworkers; worker++) {
															
 
																+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++) {
															
 
																 			double exp_end;
															
 
																 			fifo = queue_array[worker];
															
@@ -328,7 +328,7 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio)
 
																 			fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
															
 
																 			fifo->exp_end = fifo->exp_start + fifo->exp_len;
															
 
																-			if (!starpu_worker_may_execute_task(worker, task, nimpl))
															
 
																+			if (!starpu_worker_can_execute_task(worker, task, nimpl))
															
 
																 			{
															
 
																 				/* no one on that queue may execute this task */
															
 
																 				continue;
															
@@ -347,6 +347,7 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio)
 
																 					) {
															
 
																 				ntasks_best_end = ntasks_end;
															
 
																 				ntasks_best = worker;
															
 
																+				best_impl = nimpl;
															
 
																 			}
															
 
																 			if (local_length == -1.0)
															
@@ -400,13 +401,13 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio)
 
																 	   there is no performance prediction available yet */
															
 
																 	int forced_best = -1;
															
 
																-	double local_task_length[nworkers];
															
 
																-	double local_data_penalty[nworkers];
															
 
																-	double local_power[nworkers];
															
 
																-	double exp_end[nworkers];
															
 
																+	double local_task_length[nworkers][STARPU_MAXIMPLEMENTATIONS];
															
 
																+	double local_data_penalty[nworkers][STARPU_MAXIMPLEMENTATIONS];
															
 
																+	double local_power[nworkers][STARPU_MAXIMPLEMENTATIONS];
															
 
																+	double exp_end[nworkers][STARPU_MAXIMPLEMENTATIONS];
															
 
																 	double max_exp_end = 0.0;
															
 
																-	double fitness[nworkers];
															
 
																+	double fitness[nworkers][STARPU_MAXIMPLEMENTATIONS];
															
 
																 	double best_exp_end = 10e240;
															
 
																 	double model_best = 0.0;
															
@@ -420,11 +421,10 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio)
 
																 	int unknown = 0;
															
 
																 	unsigned best_impl = 0;
															
 
																-	unsigned nimpl=0;
															
 
																-	for (worker = 0; worker < nworkers; worker++)
															
 
																-	{
															
 
																-		for(nimpl  = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
															
 
																-	 	{
															
 
																+	unsigned nimpl;
															
 
																+
															
 
																+	for (worker = 0; worker < nworkers; worker++) {
															
 
																+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++) {
															
 
																 			fifo = queue_array[worker];
															
 
																 			/* Sometimes workers didn't take the tasks as early as we expected */
															
@@ -433,39 +433,39 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio)
 
																 			if (fifo->exp_end > max_exp_end)
															
 
																 				max_exp_end = fifo->exp_end;
															
 
																-			if (!starpu_worker_may_execute_task(worker, task, nimpl))
															
 
																+			if (!starpu_worker_can_execute_task(worker, task, nimpl))
															
 
																 			{
															
 
																 				/* no one on that queue may execute this task */
															
 
																 				continue;
															
 
																 			}
															
 
																 			enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
															
 
																-			local_task_length[worker] = starpu_task_expected_length(task, perf_arch, nimpl);
															
 
																+			local_task_length[worker][nimpl] = starpu_task_expected_length(task, perf_arch, nimpl);
															
 
																-			//_STARPU_DEBUG("Scheduler dmda: task length (%lf) worker (%u) kernel (%u) \n", local_task_length[worker],worker,nimpl);
															
 
																+			//_STARPU_DEBUG("Scheduler dmda: task length (%lf) worker (%u) kernel (%u) \n", local_task_length[worker][nimpl],worker,nimpl);
															
 
																 			unsigned memory_node = starpu_worker_get_memory_node(worker);
															
 
																-			local_data_penalty[worker] = starpu_task_expected_data_transfer_time(memory_node, task);
															
 
																+			local_data_penalty[worker][nimpl] = starpu_task_expected_data_transfer_time(memory_node, task);
															
 
																 			double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
															
 
																 			if (ntasks_best == -1
															
 
																 					|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
															
 
																-					|| (!calibrating && local_task_length[worker] == -1.0) /* Not calibrating but this worker is being calibrated */
															
 
																-					|| (calibrating && local_task_length[worker] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
															
 
																+					|| (!calibrating && local_task_length[worker][nimpl] == -1.0) /* Not calibrating but this worker is being calibrated */
															
 
																+					|| (calibrating && local_task_length[worker][nimpl] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
															
 
																 					) {
															
 
																 				ntasks_best_end = ntasks_end;
															
 
																 				ntasks_best = worker;
															
 
																-
															
 
																+				best_impl = nimpl;
															
 
																 			}
															
 
																-			if (local_task_length[worker] == -1.0)
															
 
																+			if (local_task_length[worker][nimpl] == -1.0)
															
 
																 				/* we are calibrating, we want to speed-up calibration time
															
 
																 				 * so we privilege non-calibrated tasks (but still
															
 
																 				 * greedily distribute them to avoid dumb schedules) */
															
 
																 				calibrating = 1;
															
 
																-			if (local_task_length[worker] <= 0.0)
															
 
																+			if (local_task_length[worker][nimpl] <= 0.0)
															
 
																 				/* there is no prediction available for that task
															
 
																 				 * with that arch yet, so switch to a greedy strategy */
															
 
																 				unknown = 1;
															
@@ -473,22 +473,18 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio)
 
																 			if (unknown)
															
 
																 					continue;
															
 
																-			exp_end[worker] = fifo->exp_start + fifo->exp_len + local_task_length[worker];
															
 
																+			exp_end[worker][nimpl] = fifo->exp_start + fifo->exp_len + local_task_length[worker][nimpl];
															
 
																-			if (exp_end[worker] < best_exp_end)
															
 
																+			if (exp_end[worker][nimpl] < best_exp_end)
															
 
																 			{
															
 
																 				/* a better solution was found */
															
 
																-				best_exp_end = exp_end[worker];
															
 
																+				best_exp_end = exp_end[worker][nimpl];
															
 
																 				best_impl = nimpl;
															
 
																-
															
 
																 			}
															
 
																-
															
 
																-
															
 
																-			local_power[worker] = starpu_task_expected_power(task, perf_arch, nimpl);
															
 
																-			if (local_power[worker] == -1.0)
															
 
																-				local_power[worker] = 0.;
															
 
																-
															
 
																+			local_power[worker][nimpl] = starpu_task_expected_power(task, perf_arch, nimpl);
															
 
																+			if (local_power[worker][nimpl] == -1.0)
															
 
																+				local_power[worker][nimpl] = 0.;
															
 
																 		 }
															
 
																 	}
															
@@ -501,30 +497,33 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio)
 
																 	if (forced_best == -1)
															
 
																 	{
															
 
																 		for (worker = 0; worker < nworkers; worker++)
															
 
																+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
															
 
																 		{
															
 
																-			if (!starpu_worker_may_execute_task(worker, task, 0))
															
 
																+			if (!starpu_worker_can_execute_task(worker, task, nimpl))
															
 
																 			{
															
 
																 				/* no one on that queue may execute this task */
															
 
																 				continue;
															
 
																 			}
															
 
																-			fitness[worker] = alpha*(exp_end[worker] - best_exp_end) 
															
 
																-					+ beta*(local_data_penalty[worker])
															
 
																-					+ _gamma*(local_power[worker]);
															
 
																+			fitness[worker][nimpl] = alpha*(exp_end[worker][nimpl] - best_exp_end) 
															
 
																+				+ beta*(local_data_penalty[worker][nimpl])
															
 
																+				+ _gamma*(local_power[worker][nimpl]);
															
 
																-			if (exp_end[worker] > max_exp_end)
															
 
																+			if (exp_end[worker][nimpl] > max_exp_end) {
															
 
																 				/* This placement will make the computation
															
 
																 				 * longer, take into account the idle
															
 
																 				 * consumption of other cpus */
															
 
																-				fitness[worker] += _gamma * idle_power * (exp_end[worker] - max_exp_end) / 1000000.0;
															
 
																+				fitness[worker][nimpl] += _gamma * idle_power * (exp_end[worker][nimpl] - max_exp_end) / 1000000.0;
															
 
																+			}
															
 
																-			if (best == -1 || fitness[worker] < best_fitness)
															
 
																+			if (best == -1 || fitness[worker][nimpl] < best_fitness)
															
 
																 			{
															
 
																 				/* we found a better solution */
															
 
																-				best_fitness = fitness[worker];
															
 
																+				best_fitness = fitness[worker][nimpl];
															
 
																 				best = worker;
															
 
																+				best_impl = nimpl;
															
 
																-	//			_STARPU_DEBUG("best fitness (worker %d) %e = alpha*(%e) + beta(%e) +gamma(%e)\n", worker, best_fitness, exp_end[worker] - best_exp_end, local_data_penalty[worker], local_power[worker]);
															
 
																+				//			_STARPU_DEBUG("best fitness (worker %d) %e = alpha*(%e) + beta(%e) +gamma(%e)\n", worker, best_fitness, exp_end[worker][nimpl] - best_exp_end, local_data_penalty[worker][nimpl], local_power[worker][nimpl]);
															
 
																 			}
															
 
																 		}
															
 
																 	}
															
@@ -542,8 +541,8 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio)
 
																 	}
															
 
																 	else 
															
 
																 	{
															
 
																-		model_best = local_task_length[best];
															
 
																-		//penality_best = local_data_penalty[best];
															
 
																+		model_best = local_task_length[best][nimpl];
															
 
																+		//penality_best = local_data_penalty[best][nimpl];
															
 
																 	}
															
--- a/src/sched_policies/deque_queues.c
+++ b/src/sched_policies/deque_queues.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010-2011  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																  * Copyright (C) 2011  Télécom-SudParis
															
 
																  *
															
@@ -68,17 +68,22 @@ struct starpu_task *_starpu_deque_pop_task(struct _starpu_deque_jobq *deque_queu
 
																 	}
															
 
																 	/* TODO find a task that suits workerid */
															
 
																-	if (deque_queue->njobs > 0) 
															
 
																+	for (j  = starpu_job_list_begin(deque_queue->jobq);
															
 
																+	     j != starpu_job_list_end(deque_queue->jobq);
															
 
																+	     j  = starpu_job_list_next(j))
															
 
																 	{
															
 
																-		/* there is a task */
															
 
																-		j = starpu_job_list_pop_front(deque_queue->jobq);
															
 
																-	
															
 
																+		unsigned nimpl;
															
 
																 		STARPU_ASSERT(j);
															
 
																-		deque_queue->njobs--;
															
 
																-		
															
 
																-		_STARPU_TRACE_JOB_POP(j, 0);
															
 
																-		return j->task;
															
 
																+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
															
 
																+			if (starpu_worker_can_execute_task(workerid, j->task, nimpl))
															
 
																+			{
															
 
																+				j->nimpl = nimpl;
															
 
																+				j = starpu_job_list_pop_front(deque_queue->jobq);
															
 
																+				deque_queue->njobs--;
															
 
																+				_STARPU_TRACE_JOB_POP(j, 0);
															
 
																+				return j->task;
															
 
																+			}
															
 
																 	}
															
 
																 	return NULL;
															
@@ -110,19 +115,18 @@ struct starpu_job_list_s *_starpu_deque_pop_every_task(struct _starpu_deque_jobq
 
																 			i != starpu_job_list_end(old_list);
															
 
																 			i  = next_job)
															
 
																 		{
															
 
																+			unsigned nimpl;
															
 
																 			next_job = starpu_job_list_next(i);
															
 
																-			/* In case there are multiples implementations of the
															
 
																- 			 * codelet for a single device, We dont really care
															
 
																-			 * about the implementation used, so let's try the 
															
 
																-			 * first one. */
															
 
																-			if (starpu_worker_may_execute_task(workerid, i->task, 0))
															
 
																+			for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
															
 
																+			if (starpu_worker_can_execute_task(workerid, i->task, nimpl))
															
 
																 			{
															
 
																 				/* this elements can be moved into the new list */
															
 
																 				new_list_size++;
															
 
																 				starpu_job_list_erase(old_list, i);
															
 
																 				starpu_job_list_push_back(new_list, i);
															
 
																+				i->nimpl = nimpl;
															
 
																 			}
															
 
																 		}
															
--- a/src/sched_policies/eager_central_policy.c
+++ b/src/sched_policies/eager_central_policy.c
@@ -46,7 +46,7 @@ static void initialize_eager_center_policy(struct starpu_machine_topology *topol
 
																 static void deinitialize_eager_center_policy(__attribute__ ((unused)) struct starpu_machine_topology *topology, 
															
 
																 		   __attribute__ ((unused)) struct starpu_sched_policy *_policy) 
															
 
																 {
															
 
																-	/* TODO check that there is no task left in the queue */
															
 
																+	STARPU_ASSERT(!_starpu_fifo_pop_task(fifo, starpu_worker_get_id()));
															
 
																 	/* deallocate the job queue */
															
 
																 	_starpu_destroy_fifo(fifo);
															
--- a/src/sched_policies/eager_central_priority_policy.c
+++ b/src/sched_policies/eager_central_priority_policy.c
@@ -122,6 +122,7 @@ static int _starpu_priority_push_task(struct starpu_task *task)
 
																 static struct starpu_task *_starpu_priority_pop_task(void)
															
 
																 {
															
 
																+	/* XXX FIXME: should call starpu_worker_can_execute_task!! */
															
 
																 	struct starpu_task *task = NULL;
															
 
																 	/* block until some event happens */
															
@@ -150,6 +151,7 @@ static struct starpu_task *_starpu_priority_pop_task(void)
 
																 			}
															
 
																 		} while (!task && priolevel-- > 0);
															
 
																 	}
															
 
																+	STARPU_ASSERT(starpu_worker_can_execute_task(starpu_worker_get_id(), task, 0) || !"prio does not support \"can_execute\"");
															
 
																 	_STARPU_PTHREAD_MUTEX_UNLOCK(&global_sched_mutex);
															
--- a/src/sched_policies/fifo_queues.c
+++ b/src/sched_policies/fifo_queues.c
@@ -48,7 +48,6 @@ void _starpu_destroy_fifo(struct _starpu_fifo_taskq *fifo)
 
																 }
															
 
																 /* TODO: revert front/back? */
															
 
																-
															
 
																 int _starpu_fifo_push_task(struct _starpu_fifo_taskq *fifo_queue, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task)
															
 
																 {
															
 
																 	_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
															
@@ -65,26 +64,29 @@ int _starpu_fifo_push_task(struct _starpu_fifo_taskq *fifo_queue, pthread_mutex_
 
																 	return 0;
															
 
																 }
															
 
																-struct starpu_task *_starpu_fifo_pop_task(struct _starpu_fifo_taskq *fifo_queue, int workerid __attribute__ ((unused)))
															
 
																+struct starpu_task *_starpu_fifo_pop_task(struct _starpu_fifo_taskq *fifo_queue, int workerid)
															
 
																 {
															
 
																-	struct starpu_task *task = NULL;
															
 
																-
															
 
																-	if (fifo_queue->ntasks == 0)
															
 
																-		return NULL;
															
 
																+	struct starpu_task *task;
															
 
																-	/* TODO: find a task that suits workerid */
															
 
																-	if (fifo_queue->ntasks > 0) 
															
 
																+	for (task  = starpu_task_list_begin(&fifo_queue->taskq);
															
 
																+	     task != starpu_task_list_end(&fifo_queue->taskq);
															
 
																+	     task  = starpu_task_list_next(task))
															
 
																 	{
															
 
																-		/* there is a task */
															
 
																-		task = starpu_task_list_pop_back(&fifo_queue->taskq);
															
 
																-	
															
 
																+		unsigned nimpl;
															
 
																 		STARPU_ASSERT(task);
															
 
																-		fifo_queue->ntasks--;
															
 
																-		
															
 
																-		_STARPU_TRACE_JOB_POP(task, 0);
															
 
																+
															
 
																+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
															
 
																+			if (starpu_worker_can_execute_task(workerid, task, nimpl))
															
 
																+			{
															
 
																+				_starpu_get_job_associated_to_task(task)->nimpl = nimpl;
															
 
																+				starpu_task_list_erase(&fifo_queue->taskq, task);
															
 
																+				fifo_queue->ntasks--;
															
 
																+				_STARPU_TRACE_JOB_POP(task, 0);
															
 
																+				return task;
															
 
																+			}
															
 
																 	}
															
 
																-	return task;
															
 
																+	return NULL;
															
 
																 }
															
 
																 /* pop every task that can be executed on the calling driver */
															
@@ -110,9 +112,11 @@ struct starpu_task *_starpu_fifo_pop_every_task(struct _starpu_fifo_taskq *fifo_
 
																 		task = starpu_task_list_front(old_list);
															
 
																 		while (task)
															
 
																 		{
															
 
																+			unsigned nimpl;
															
 
																 			next_task = task->next;
															
 
																-			if (starpu_worker_may_execute_task(workerid, task, 0))
															
 
																+			for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
															
 
																+			if (starpu_worker_can_execute_task(workerid, task, nimpl))
															
 
																 			{
															
 
																 				/* this elements can be moved into the new list */
															
 
																 				new_list_size++;
															
@@ -132,6 +136,8 @@ struct starpu_task *_starpu_fifo_pop_every_task(struct _starpu_fifo_taskq *fifo_
 
																 					task->prev = NULL;
															
 
																 					task->next = NULL;
															
 
																 				}
															
 
																+				_starpu_get_job_associated_to_task(task)->nimpl = nimpl;
															
 
																+				break;
															
 
																 			}
															
 
																 			task = next_task;
															
--- a/src/sched_policies/heft.c
+++ b/src/sched_policies/heft.c
@@ -208,6 +208,7 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
																 	return starpu_push_local_task(best_workerid, task, prio);
															
 
																 }
															
 
																+/* TODO: factorize with dmda!! */
															
 
																 static void compute_all_performance_predictions(struct starpu_task *task,
															
 
																 					double local_task_length[STARPU_NMAXWORKERS][STARPU_MAXIMPLEMENTATIONS],
															
 
																 					double exp_end[STARPU_NMAXWORKERS][STARPU_MAXIMPLEMENTATIONS],
															
@@ -232,14 +233,14 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 
																 	unsigned nimpl;
															
 
																 	for (worker = 0; worker < nworkers; worker++) {
															
 
																-		for (nimpl = 0; nimpl <STARPU_MAXIMPLEMENTATIONS; nimpl++) {
															
 
																+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++) {
															
 
																 			/* Sometimes workers didn't take the tasks as early as we expected */
															
 
																 			exp_start[worker] = STARPU_MAX(exp_start[worker], starpu_timing_now());
															
 
																 			exp_end[worker][nimpl] = exp_start[worker] + exp_len[worker];
															
 
																 			if (exp_end[worker][nimpl] > max_exp_end)
															
 
																 				max_exp_end = exp_end[worker][nimpl];
															
 
																-			if (!starpu_worker_may_execute_task(worker, task, nimpl))
															
 
																+			if (!starpu_worker_can_execute_task(worker, task, nimpl))
															
 
																 			{
															
 
																 				/* no one on that queue may execute this task */
															
 
																 				continue;
															
@@ -298,8 +299,11 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 
																 			exp_end[worker][nimpl] = exp_start[worker] + exp_len[worker] + local_task_length[worker][nimpl];
															
 
																 			if (exp_end[worker][nimpl] < best_exp_end)
															
 
																+			{
															
 
																 				/* a better solution was found */
															
 
																 				best_exp_end = exp_end[worker][nimpl];
															
 
																+				nimpl_best = nimpl;
															
 
																+			}
															
 
																 			if (local_power[worker][nimpl] == -1.0)
															
 
																 				local_power[worker][nimpl] = 0.;
															
@@ -365,7 +369,7 @@ static int _heft_push_task(struct starpu_task *task, unsigned prio)
 
																 	for (worker = 0; worker < nworkers; worker++)
															
 
																 	{
															
 
																 		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++) {
															
 
																-			if (!starpu_worker_may_execute_task(worker, task, nimpl))
															
 
																+			if (!starpu_worker_can_execute_task(worker, task, nimpl))
															
 
																 			{
															
 
																 				/* no one on that queue may execute this task */
															
 
																 				continue;
															
--- a/src/sched_policies/parallel_greedy.c
+++ b/src/sched_policies/parallel_greedy.c
@@ -169,7 +169,7 @@ static struct starpu_task *pop_task_pgreedy_policy(void)
 
																 			if (possible_combinations_size[workerid][i] > best_size)
															
 
																 			{
															
 
																 				int combined_worker = possible_combinations[workerid][i];
															
 
																-				if (starpu_combined_worker_may_execute_task(combined_worker, task, 0))
															
 
																+				if (starpu_combined_worker_can_execute_task(combined_worker, task, 0))
															
 
																 				{
															
 
																 					best_size = possible_combinations_size[workerid][i];
															
 
																 					best_workerid = combined_worker;
															
--- a/src/sched_policies/parallel_heft.c
+++ b/src/sched_policies/parallel_heft.c
@@ -238,7 +238,7 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio)
 
																 	{
															
 
																 		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
															
 
																 		{
															
 
																-			if (!starpu_combined_worker_may_execute_task(worker, task, nimpl))
															
 
																+			if (!starpu_combined_worker_can_execute_task(worker, task, nimpl))
															
 
																 			{
															
 
																 				/* no one on that queue may execute this task */
															
 
																 				skip_worker[worker][nimpl] = 1;
															
--- a/src/util/malloc.c
+++ b/src/util/malloc.c
@@ -74,7 +74,7 @@ int starpu_malloc(void **A, size_t dim)
 
																 	STARPU_ASSERT(A);
															
 
																-	if (_starpu_may_submit_cuda_task())
															
 
																+	if (_starpu_can_submit_cuda_task())
															
 
																 	{
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 		int push_res;
															
@@ -98,7 +98,7 @@ int starpu_malloc(void **A, size_t dim)
 
																 		STARPU_ASSERT(push_res != -ENODEV);
															
 
																 #endif
															
 
																 	}
															
 
																-//	else if (_starpu_may_submit_opencl_task())
															
 
																+//	else if (_starpu_can_submit_opencl_task())
															
 
																 //	{
															
 
																 //#ifdef STARPU_USE_OPENCL
															
 
																 //		int push_res;
															
@@ -171,7 +171,7 @@ int starpu_free(void *A)
 
																 	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
															
 
																 		return -EDEADLK;
															
 
																-	if (_starpu_may_submit_cuda_task())
															
 
																+	if (_starpu_can_submit_cuda_task())
															
 
																 	{
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 		int push_res;
															
@@ -190,7 +190,7 @@ int starpu_free(void *A)
 
																 		STARPU_ASSERT(push_res != -ENODEV);
															
 
																 #endif
															
 
																 	}
															
 
																-//	else if (_starpu_may_submit_opencl_task())
															
 
																+//	else if (_starpu_can_submit_opencl_task())
															
 
																 //	{
															
 
																 //#ifdef STARPU_USE_OPENCL
															
 
																 //		int push_res;