瀏覽代碼

merge with sched_policies also

Andra Hugo 14 年之前
父節點
當前提交
bb184c7ae9

+ 0 - 29
simple_ex/Makefile

@@ -1,29 +0,0 @@
-PROG=exemple
-CUDA_SDK_ROOT=/usr/local/cuda/sdk/C
-CUDA_HOME=/usr/local/cuda
-
-.PHONY: all
-
-all: $(PROG)
-
-
-CC      := gcc
-NVCC    := /usr/local/cuda/bin/nvcc
-CFLAGS  := $$(pkg-config --cflags libstarpu) -g #-Wall
-LDFLAGS := $$(pkg-config --libs libstarpu)
-CUDADIR=$(CUDA_HOME)
-
-COMMONFLAGS += -I. -I$(CUDADIR)/include  -I$(CUDA_SDK_ROOT)/common/inc -DUNIX -g -Xcompiler
-NVCCFLAGS +=  -I$(CUDA_SDK_ROOT)/common/inc -I. -G
-
-%.o: %.cu
-	$(NVCC) $(CFLAGS) $(COMMONFLAGS) $(NVCCFLAGS) -o $@ -c $< 
-
-%.o: %.c
-	$(CC) $(CFLAGS) -o $@ -c $< 
-
-$(PROG): %: %.o %_kernel.o
-	$(CC) -o $@ $^ $(LDFLAGS) -L$(CUDADIR)/lib -lcudart
-
-clean:
-	rm -f $(PROG) *.o

+ 0 - 160
simple_ex/exemple.c

@@ -1,160 +0,0 @@
-#include <stdio.h>
-#include <malloc.h>
-#include <starpu.h>
-
-static inline void my_codelet_cpu(void *descr[], void *_args)
-{
-  unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
-  float *sub = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
-
-  unsigned i;
-
-  for(i = 0; i < nx; i++){
-    sub[i] *= 5;
-  }
-}
-
-extern void my_codelet_gpu(void *descr[], __attribute__ ((unused)) void *_args);
-
-
-static starpu_codelet cl = 
-  {
-    .where = STARPU_CPU|STARPU_CUDA,
-    .cpu_func = my_codelet_cpu,
-    .cuda_func = my_codelet_gpu,
-    .nbuffers = 1
-  };
-
-void print_vect(int *vect, int size){
-  unsigned i;
-  for(i = 0; i < size; i++)
-    printf("%d ", vect[i]);
-  printf("\n");
-  
-}
-
-int main(int argc, char **argv)
-{
-  srand(time(NULL));
-  int *mat;
-  unsigned size = 20, children = 5;
-  mat = (int *)malloc(size*sizeof(int));
-
-  unsigned i;
-  for(i = 0; i < size; i++)
-    {
-      mat[i] = random()% 10 + 1;
-    }
-
-  print_vect(mat, size);
-
-  //  struct starpu_conf conf;
-  //conf.sched_policy_name = "heft-tm";
-  //conf.ncpus = -1;
-  // printf("got here \n");
-  starpu_init(NULL);
-
-  for(i = 0; i < 12; i++)
-    printf("%d: arch is %d\n", starpu_worker_get_type(i));
-
-  starpu_data_handle dataA;
-  starpu_vector_data_register(&dataA, 0, (uintptr_t)mat, size, sizeof(mat[00]));
-
-  struct starpu_data_filter f =
-    {
-      .filter_func = starpu_block_filter_func_vector,
-      .nchildren = children,
-      .get_nchildren = NULL,
-      .get_child_ops = NULL
-    };
-  starpu_data_partition(dataA, &f);
-
-  struct starpu_sched_ctx sched_ctx;
-  int procs[] = {1, 2, 3};
-  starpu_create_sched_ctx(&sched_ctx, "heft", procs, 3);
-
-  unsigned j;
-  for(j = 0; j < children; j++){
-    struct starpu_task *task = starpu_task_create();
-    task->cl = &cl;
-    task->synchronous = 1;
-    task->callback_func = NULL;
-    task->buffers[0].handle = starpu_data_get_sub_data(dataA, 1, j);
-    task->buffers[0].mode = STARPU_RW;
-    task->name = "first 1 2 3";  
-    starpu_task_submit_to_ctx(task, &sched_ctx);
-  }
-
-  int procs_to_remove[]={1,3};
-  starpu_remove_workers_from_sched_ctx(procs_to_remove, 2, &sched_ctx);
-
-  printf("procs removed \n");
-
-  for(j = 0; j < children; j++){
-    struct starpu_task *task = starpu_task_create();
-    task->cl = &cl;
-    task->synchronous = 1;
-    task->callback_func = NULL;
-    task->buffers[0].handle = starpu_data_get_sub_data(dataA, 1, j);
-    task->buffers[0].mode = STARPU_RW;
-    task->name = "first 2";  
-    starpu_task_submit_to_ctx(task, &sched_ctx);
-  }
-
-  int procs_to_add[]={1, 4, 5};
-  starpu_add_workers_to_sched_ctx(procs_to_add, 2, &sched_ctx);
-
-  printf("procs add \n");
-
-  for(j = 0; j < children; j++){
-    struct starpu_task *task = starpu_task_create();
-    task->cl = &cl;
-    task->synchronous = 1;
-    task->callback_func = NULL;
-    task->buffers[0].handle = starpu_data_get_sub_data(dataA, 1, j);
-    task->buffers[0].mode = STARPU_RW;
-    task->name = "first 1 2 4 5";  
-    starpu_task_submit_to_ctx(task, &sched_ctx);
-  }
-
-
-  struct starpu_sched_ctx sched_ctx2;
-  int procs2[]={3, 4, 5, 6, 7};
-  starpu_create_sched_ctx(&sched_ctx2, "random", procs2, 5);
-
-  for(j = 0; j < children; j++){
-    struct starpu_task *task3 = starpu_task_create();
-    task3->cl = &cl;
-    task3->synchronous = 1;
-    task3->callback_func = NULL;
-    task3->buffers[0].handle = starpu_data_get_sub_data(dataA, 1, j);
-    task3->buffers[0].mode = STARPU_RW;
-    task3->name = "third 3 4 5 6 7";
-    starpu_task_submit_to_ctx(task3, &sched_ctx2);
-  }
-
-  for(j = 0; j < children; j++){
-    struct starpu_task *task2 = starpu_task_create();
-    task2->cl = &cl;
-    task2->synchronous = 1;
-    task2->callback_func = NULL;
-    task2->buffers[0].handle = starpu_data_get_sub_data(dataA, 1, j);
-    task2->buffers[0].mode = STARPU_RW;
-    task2->name = "anything";
-    starpu_task_submit(task2);
-  }
-  
-  printf("wait for all \n");
-  starpu_task_wait_for_all();
-  starpu_data_unpartition(dataA, 0);
-
-  printf("data unregister  \n");
-  starpu_data_unregister(dataA);
-  
-  printf("the end \n");
-  starpu_shutdown();
-
-  print_vect(mat, size);
-  
-  return 0;
-}

+ 0 - 22
simple_ex/exemple_kernel.cu

@@ -1,22 +0,0 @@
-#include <starpu.h>
-#include <starpu_cuda.h>
-
-static __global__ void myf(int *dMatA)
-{
-  int tidy = threadIdx.y;
-
-  dMatA[ tidy ] = dMatA[ tidy ]  * 5;
-}
-
-extern "C" void my_codelet_gpu(void *descr[], void *_args)
-{
-  unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
-  int *sub = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
-
-  dim3 dimGrid(1,1);
-  dim3 dimBlock(nx,nx);
-
-  myf<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>(sub);
- 
-  cudaStreamSynchronize(starpu_cuda_get_local_stream());
-}

+ 118 - 99
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -328,59 +329,67 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, struct starpu_
 	/* A priori, we know all estimations */
 	int unknown = 0;
 
+	unsigned best_impl = 0;
+	unsigned nimpl;
 	unsigned nworkers = sched_ctx->nworkers_in_ctx;
 	for (worker_in_ctx = 0; worker_in_ctx < nworkers; worker_in_ctx++)
 	{
-        worker = sched_ctx->workerid[worker_in_ctx];
-		double exp_end;
+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
+		{
+        	worker = sched_ctx->workerid[worker_in_ctx];
+			double exp_end;
 		
-		fifo = dt->queue_array[worker_in_ctx];
+			fifo = dt->queue_array[worker_in_ctx];
 
-		/* Sometimes workers didn't take the tasks as early as we expected */
-		fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
-		fifo->exp_end = fifo->exp_start + fifo->exp_len;
+			/* Sometimes workers didn't take the tasks as early as we expected */
+			fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
+			fifo->exp_end = fifo->exp_start + fifo->exp_len;
 
-		if (!starpu_worker_may_execute_task(worker, task))
-		{
-			/* no one on that queue may execute this task */
-			continue;
-		}
+			if (!starpu_worker_may_execute_task(worker, task, nimpl))
+			{
+				/* no one on that queue may execute this task */
+				continue;
+			}
 
-		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
-		double local_length = starpu_task_expected_length(task, perf_arch);
-		double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
-
-		if (ntasks_best == -1
-				|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
-				|| (!calibrating && local_length == -1.0) /* Not calibrating but this worker is being calibrated */
-				|| (calibrating && local_length == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
-				) {
-			ntasks_best_end = ntasks_end;
-			ntasks_best = worker;
-		}
+			enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
+			double local_length = starpu_task_expected_length(task, perf_arch, nimpl);
+			double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
 
-		if (local_length == -1.0)
-			/* we are calibrating, we want to speed-up calibration time
-			 * so we privilege non-calibrated tasks (but still
-			 * greedily distribute them to avoid dumb schedules) */
-			calibrating = 1;
+			//_STARPU_DEBUG("Scheduler dm: task length (%lf) worker (%u) kernel (%u) \n", local_length,worker,nimpl);
 
-		if (local_length <= 0.0)
-			/* there is no prediction available for that task
-			 * with that arch yet, so switch to a greedy strategy */
-			unknown = 1;
+			if (ntasks_best == -1
+					|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
+					|| (!calibrating && local_length == -1.0) /* Not calibrating but this worker is being calibrated */
+					|| (calibrating && local_length == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
+					) {
+				ntasks_best_end = ntasks_end;
+				ntasks_best = worker;
+			}
 
-		if (unknown)
-			continue;
+			if (local_length == -1.0)
+				/* we are calibrating, we want to speed-up calibration time
+				 * so we privilege non-calibrated tasks (but still
+				 * greedily distribute them to avoid dumb schedules) */
+				calibrating = 1;
 
-		exp_end = fifo->exp_start + fifo->exp_len + local_length;
+			if (local_length <= 0.0)
+				/* there is no prediction available for that task
+				 * with that arch yet, so switch to a greedy strategy */
+				unknown = 1;
 
-		if (best == -1 || exp_end < best_exp_end)
-		{
-			/* a better solution was found */
-			best_exp_end = exp_end;
-			best = worker;
-			model_best = local_length;
+			if (unknown)
+				continue;
+
+			exp_end = fifo->exp_start + fifo->exp_len + local_length;
+
+			if (best == -1 || exp_end < best_exp_end)
+			{
+				/* a better solution was found */
+				best_exp_end = exp_end;
+				best = worker;
+				model_best = local_length;
+				best_impl = nimpl;
+			}
 		}
 	}
 
@@ -390,6 +399,11 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, struct starpu_
 	}
 	
 	_starpu_increment_nsubmitted_tasks_of_worker(best);
+
+	//_STARPU_DEBUG("Scheduler dm: kernel (%u)\n", best_impl);
+
+	 _starpu_get_job_associated_to_task(task)->nimpl = 0;//best_impl;
+
 	/* we should now have the best worker in variable "best" */
 	return push_task_on_best_worker(task, best, model_best, prio, sched_ctx);
 }
@@ -426,82 +440,89 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, struct starp
 	/* A priori, we know all estimations */
 	int unknown = 0;
 
+	unsigned best_impl = 0;
+	unsigned nimpl=0;
 	for (worker_in_ctx = 0; worker_in_ctx < nworkers_in_ctx; worker_in_ctx++)
 	{
         worker = sched_ctx->workerid[worker_in_ctx];
+		for(nimpl  = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
+	 	{
+			fifo = dt->queue_array[worker_in_ctx];
 
-		fifo = dt->queue_array[worker_in_ctx];
+			/* Sometimes workers didn't take the tasks as early as we expected */
+			fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
+			fifo->exp_end = fifo->exp_start + fifo->exp_len;
+			if (fifo->exp_end > max_exp_end)
+				max_exp_end = fifo->exp_end;
 
-		/* Sometimes workers didn't take the tasks as early as we expected */
-		fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
-		fifo->exp_end = fifo->exp_start + fifo->exp_len;
-		if (fifo->exp_end > max_exp_end)
-			max_exp_end = fifo->exp_end;
+			if (!starpu_worker_may_execute_task(worker, task, nimpl))
+			{
+				/* no one on that queue may execute this task */
+				continue;
+			}
 
-		if (!starpu_worker_may_execute_task(worker, task))
-		{
-			/* no one on that queue may execute this task */
-			continue;
-		}
+			enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
+			local_task_length[worker_in_ctx] = starpu_task_expected_length(task, perf_arch);
 
-		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
-		local_task_length[worker_in_ctx] = starpu_task_expected_length(task, perf_arch);
+			//_STARPU_DEBUG("Scheduler dmda: task length (%lf) worker (%u) kernel (%u) \n", local_task_length[worker],worker,nimpl);
 
-		unsigned memory_node = starpu_worker_get_memory_node(worker);
-		local_data_penalty[worker_in_ctx] = starpu_task_expected_data_transfer_time(memory_node, task);
+			unsigned memory_node = starpu_worker_get_memory_node(worker);
+			local_data_penalty[worker_in_ctx] = starpu_task_expected_data_transfer_time(memory_node, task);
 
-		double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
+			double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
 
-		if (ntasks_best == -1
-				|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
-				|| (!calibrating && local_task_length[worker] == -1.0) /* Not calibrating but this worker is being calibrated */
-				|| (calibrating && local_task_length[worker] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
-				) {
-			ntasks_best_end = ntasks_end;
-			ntasks_best = worker;
-		}
+			if (ntasks_best == -1
+					|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
+					|| (!calibrating && local_task_length[worker] == -1.0) /* Not calibrating but this worker is being calibrated */
+					|| (calibrating && local_task_length[worker] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
+					) {
+				ntasks_best_end = ntasks_end;
+				ntasks_best = worker;
+			}
 
-		if (local_task_length[worker_in_ctx] == -1.0)
-			/* we are calibrating, we want to speed-up calibration time
-			 * so we privilege non-calibrated tasks (but still
-			 * greedily distribute them to avoid dumb schedules) */
-			calibrating = 1;
+			if (local_task_length[worker_in_ctx] == -1.0)
+				/* we are calibrating, we want to speed-up calibration time
+			 	* so we privilege non-calibrated tasks (but still
+			 	* greedily distribute them to avoid dumb schedules) */
+				calibrating = 1;
 
-		if (local_task_length[worker_in_ctx] <= 0.0)
-			/* there is no prediction available for that task
-			 * with that arch yet, so switch to a greedy strategy */
-			unknown = 1;
+			if (local_task_length[worker_in_ctx] <= 0.0)
+				/* there is no prediction available for that task
+			 	* with that arch yet, so switch to a greedy strategy */
+				unknown = 1;
 
-		if (unknown)
-			continue;
+			if (unknown)
+				continue;
 
-		exp_end[worker_in_ctx] = fifo->exp_start + fifo->exp_len + local_task_length[worker_in_ctx];
+			exp_end[worker_in_ctx] = fifo->exp_start + fifo->exp_len + local_task_length[worker_in_ctx];
 
-		if (exp_end[worker_in_ctx] < best_exp_end)
-		{
-			/* a better solution was found */
-			best_exp_end = exp_end[worker_in_ctx];
-		}
+			if (exp_end[worker_in_ctx] < best_exp_end)
+			{
+				/* a better solution was found */
+				best_exp_end = exp_end[worker_in_ctx];
+				best_impl = nimpl;
+			}
 
-		local_power[worker_in_ctx] = starpu_task_expected_power(task, perf_arch);
-		if (local_power[worker_in_ctx] == -1.0)
-			local_power[worker_in_ctx] = 0.;
-	}
+			local_power[worker_in_ctx] = starpu_task_expected_power(task, perf_arch, nimpl);
+			if (local_power[worker_in_ctx] == -1.0)
+				local_power[worker_in_ctx] = 0.;
+			}	
+		}
 
-	if (unknown)
-		forced_best = ntasks_best;
+		if (unknown)
+			forced_best = ntasks_best;
 
-	double best_fitness = -1;
+		double best_fitness = -1;
 	
-	if (forced_best == -1)
-	{
+		if (forced_best == -1)
+		{
 	        for (worker_in_ctx = 0; worker_in_ctx < nworkers_in_ctx; worker_in_ctx++)
 	        {
 		        worker = sched_ctx->workerid[worker_in_ctx];
 
-			fifo = dt->queue_array[worker_in_ctx];
+				fifo = dt->queue_array[worker_in_ctx];
 	
-			if (!starpu_worker_may_execute_task(worker, task))
+			if (!starpu_worker_may_execute_task(worker, task, 0))
 			{
 				/* no one on that queue may execute this task */
 				continue;
@@ -546,6 +567,10 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, struct starp
 		//penality_best = local_data_penalty[best];
 	}
 
+
+	//_STARPU_DEBUG("Scheduler dmda: kernel (%u)\n", best_impl);
+	 _starpu_get_job_associated_to_task(task)->nimpl = best_impl;
+
 	/* we should now have the best worker in variable "best" */
 	return push_task_on_best_worker(task, best, model_best, prio, sched_ctx);
 }
@@ -559,18 +584,12 @@ static int dmda_push_sorted_task(struct starpu_task *task, unsigned sched_ctx_id
 static int dm_push_task(struct starpu_task *task, unsigned sched_ctx_id)
 {
 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx(sched_ctx_id);
-	if (task->priority > 0)
-		return _dm_push_task(task, 1, sched_ctx);
-
 	return _dm_push_task(task, 0, sched_ctx);
 }
 
 static int dmda_push_task(struct starpu_task *task, unsigned sched_ctx_id)
 {
 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx(sched_ctx_id);
-	if (task->priority > 0)
-		return _dmda_push_task(task, 1, sched_ctx);
-
 	return _dmda_push_task(task, 0, sched_ctx);
 }
 

+ 7 - 2
src/sched_policies/deque_queues.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2010  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -27,7 +28,7 @@
 struct starpu_deque_jobq_s *_starpu_create_deque(void)
 {
 	struct starpu_deque_jobq_s *deque;
-	deque = malloc(sizeof(struct starpu_deque_jobq_s));
+	deque = (struct starpu_deque_jobq_s *) malloc(sizeof(struct starpu_deque_jobq_s));
 
 	/* note that not all mechanisms (eg. the semaphore) have to be used */
 	deque->jobq = starpu_job_list_new();
@@ -109,7 +110,11 @@ struct starpu_job_list_s *_starpu_deque_pop_every_task(struct starpu_deque_jobq_
 		{
 			next_job = starpu_job_list_next(i);
 
-			if (starpu_worker_may_execute_task(workerid, i->task))
+			/* In case there are multiple implementations of the
+ 			 * codelet for a single device, we don't really care
+			 * about the implementation used, so let's try the
+			 * first one. */
+			if (starpu_worker_may_execute_task(workerid, i->task, 0))
 			{
 				/* this elements can be moved into the new list */
 				new_list_size++;

+ 8 - 9
src/sched_policies/detect_combined_workers.c

@@ -62,7 +62,7 @@ static int find_combinations_with_hwloc_rec(hwloc_obj_t obj, int *worker_array,
 
 	int worker_array_rec[STARPU_NMAXWORKERS];
 	int worker_cnt_rec = 0;
-	memset(worker_array_rec, 0, sizeof(worker_array_rec));
+	memset(worker_array_rec, 0, sizeof(int)*STARPU_NMAXWORKERS);
 
 	unsigned i;
 	for (i = 0; i < obj->arity; i++)
@@ -169,16 +169,15 @@ static void combine_all_cpu_workers(struct starpu_machine_topology_s *topology)
 
 void _starpu_sched_find_worker_combinations(struct starpu_machine_topology_s *topology)
 {
-	struct starpu_machine_config_s *config = _starpu_get_machine_config();
+    struct starpu_machine_config_s *config = _starpu_get_machine_config();
 
-	if (config->user_conf && config->user_conf->single_combined_worker)
-		combine_all_cpu_workers(topology);
-	else {
+    if (config->user_conf && config->user_conf->single_combined_worker > 0 || starpu_get_env_number("STARPU_SINGLE_COMBINED_WORKER") > 0)
+	combine_all_cpu_workers(topology);
+    else {
 #ifdef STARPU_HAVE_HWLOC
-		find_combinations_with_hwloc(topology);
-		//find_combinations_without_hwloc(topology);
+	find_combinations_with_hwloc(topology);
 #else
-		find_combinations_without_hwloc(topology);
+	find_combinations_without_hwloc(topology);
 #endif
-	}
+    }
 }

+ 1 - 1
src/sched_policies/eager_central_priority_policy.c

@@ -48,7 +48,7 @@ static struct starpu_priority_taskq_s *_starpu_create_priority_taskq(void)
 {
 	struct starpu_priority_taskq_s *central_queue;
 	
-	central_queue = malloc(sizeof(struct starpu_priority_taskq_s));
+	central_queue = (struct starpu_priority_taskq_s *) malloc(sizeof(struct starpu_priority_taskq_s));
 	central_queue->total_ntasks = 0;
 
 	unsigned prio;

+ 3 - 2
src/sched_policies/fifo_queues.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -27,7 +28,7 @@
 struct starpu_fifo_taskq_s *_starpu_create_fifo(void)
 {
 	struct starpu_fifo_taskq_s *fifo;
-	fifo = malloc(sizeof(struct starpu_fifo_taskq_s));
+	fifo = (struct starpu_fifo_taskq_s *) malloc(sizeof(struct starpu_fifo_taskq_s));
 
 	/* note that not all mechanisms (eg. the semaphore) have to be used */
 	starpu_task_list_init(&fifo->taskq);
@@ -111,7 +112,7 @@ struct starpu_task *_starpu_fifo_pop_every_task(struct starpu_fifo_taskq_s *fifo
 		{
 			next_task = task->next;
 
-			if (starpu_worker_may_execute_task(workerid, task))
+			if (starpu_worker_may_execute_task(workerid, task, 0))
 			{
 				/* this elements can be moved into the new list */
 				new_list_size++;

+ 83 - 67
src/sched_policies/heft.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -170,7 +171,9 @@ static void heft_push_task_notify(struct starpu_task *task, int workerid, unsign
 	struct starpu_worker_s *worker = _starpu_get_worker_struct(workerid);
 	/* Compute the expected penality */
 	enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(workerid);
-	double predicted = starpu_task_expected_length(task, perf_arch);
+
+	double predicted = starpu_task_expected_length(task, perf_arch,
+			_starpu_get_job_associated_to_task(task)->nimpl);
 
 	/* Update the predictions */
 	PTHREAD_MUTEX_LOCK(worker->sched_mutex);
@@ -242,74 +245,86 @@ static void compute_all_performance_predictions(struct starpu_task *task,
   unsigned worker, worker_in_ctx;
   for (worker_in_ctx = 0; worker_in_ctx < nworkers; worker_in_ctx++)
     {
-      worker = sched_ctx->workerid[worker_in_ctx];
-      /* Sometimes workers didn't take the tasks as early as we expected */
-      exp_start[worker] = STARPU_MAX(exp_start[worker], starpu_timing_now());
-      exp_end[worker_in_ctx] = exp_start[worker] + exp_len[worker];
-      if (exp_end[worker_in_ctx] > max_exp_end)
- 		max_exp_end = exp_end[worker_in_ctx];
-
-      if (!starpu_worker_may_execute_task(worker, task))
-	{
-	  /* no one on that queue may execute this task */
-	  continue;
+		worker = sched_ctx->workerid[worker_in_ctx];
+		for (nimpl = 0; nimpl <STARPU_MAXIMPLEMENTATIONS; nimpl++) 
+		{
+      		/* Sometimes workers didn't take the tasks as early as we expected */
+      		exp_start[worker] = STARPU_MAX(exp_start[worker], starpu_timing_now());
+      		exp_end[worker_in_ctx] = exp_start[worker] + exp_len[worker];
+      		if (exp_end[worker_in_ctx] > max_exp_end)
+ 				max_exp_end = exp_end[worker_in_ctx];
+
+			if (!starpu_worker_may_execute_task(worker, task, nimpl))
+			{
+				/* no one on that queue may execute this task */
+				continue;
+			}
+
+			enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
+			unsigned memory_node = starpu_worker_get_memory_node(worker);
+
+      		if (bundle)
+      		{
+      			local_task_length[worker_in_ctx] = starpu_task_bundle_expected_length(bundle, perf_arch, nimpl);
+      	  		local_data_penalty[worker_in_ctx] = starpu_task_bundle_expected_data_transfer_time(bundle, memory_node);
+      	  		local_power[worker_in_ctx] = starpu_task_bundle_expected_power(bundle, perf_arch, nimpl);
+				//_STARPU_DEBUG("Scheduler heft bundle: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker_in_ctx],local_power[worker_in_ctx],worker,nimpl);
+      		}
+      		else 
+			{
+				local_task_length[worker_in_ctx] = starpu_task_expected_length(task, perf_arch, nimpl);
+				local_data_penalty[worker_in_ctx] = starpu_task_expected_data_transfer_time(memory_node, task);
+				local_power[worker_in_ctx] = starpu_task_expected_power(task, perf_arch, nimpl);
+				//_STARPU_DEBUG("Scheduler heft: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker_in_ctx],local_power[worker_in_ctx],worker,nimpl);
+      		}
+
+      		double ntasks_end = ntasks[worker] / starpu_worker_get_relative_speedup(perf_arch);
+
+      		if (ntasks_best == -1
+	  			|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
+	  			|| (!calibrating && local_task_length[worker_in_ctx] == -1.0) /* Not calibrating but this worker is being calibrated */
+	  			|| (calibrating && local_task_length[worker_in_ctx] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
+	  		) {
+				ntasks_best_end = ntasks_end;
+				ntasks_best = worker;
+      		}
+
+     		if (local_task_length[worker_in_ctx] == -1.0)
+				/* we are calibrating, we want to speed-up calibration time
+	 			* so we privilege non-calibrated tasks (but still
+	 			* greedily distribute them to avoid dumb schedules) */
+				calibrating = 1;
+
+      		if (local_task_length[worker_in_ctx] <= 0.0)
+				/* there is no prediction available for that task
+	 			* with that arch yet, so switch to a greedy strategy */
+				unknown = 1;
+
+			if (unknown)
+				continue;
+
+      		exp_end[worker_in_ctx] = exp_start[worker] + exp_len[worker] + local_task_length[worker_in_ctx];
+
+      		if (exp_end[worker_in_ctx] < best_exp_end)
+			{
+	  			/* a better solution was found */
+	  			best_exp_end = exp_end[worker_in_ctx];
+				best_impl = nimpl;
+			}
+
+      		if (local_power[worker_in_ctx] == -1.0)
+				local_power[worker_in_ctx] = 0.;
+    	}
 	}
 
-      enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
-      unsigned memory_node = starpu_worker_get_memory_node(worker);
-
-      if (bundle)
-      	{
-      	  local_task_length[worker_in_ctx] = starpu_task_bundle_expected_length(bundle, perf_arch);
-      	  local_data_penalty[worker_in_ctx] = starpu_task_bundle_expected_data_transfer_time(bundle, memory_node);
-      	  local_power[worker_in_ctx] = starpu_task_bundle_expected_power(bundle, perf_arch);
-      	}
-      else {
-	local_task_length[worker_in_ctx] = starpu_task_expected_length(task, perf_arch);
-	local_data_penalty[worker_in_ctx] = starpu_task_expected_data_transfer_time(memory_node, task);
-	local_power[worker_in_ctx] = starpu_task_expected_power(task, perf_arch);
-      }
-
-      double ntasks_end = ntasks[worker] / starpu_worker_get_relative_speedup(perf_arch);
-
-      if (ntasks_best == -1
-	  || (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
-	  || (!calibrating && local_task_length[worker_in_ctx] == -1.0) /* Not calibrating but this worker is being calibrated */
-	  || (calibrating && local_task_length[worker_in_ctx] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
-	  ) {
-	ntasks_best_end = ntasks_end;
-	ntasks_best = worker;
-      }
-
-      if (local_task_length[worker_in_ctx] == -1.0)
-	/* we are calibrating, we want to speed-up calibration time
-	 * so we privilege non-calibrated tasks (but still
-	 * greedily distribute them to avoid dumb schedules) */
-	calibrating = 1;
-
-      if (local_task_length[worker_in_ctx] <= 0.0)
-	/* there is no prediction available for that task
-	 * with that arch yet, so switch to a greedy strategy */
-	unknown = 1;
-
-      if (unknown)
-	continue;
-
-      exp_end[worker_in_ctx] = exp_start[worker] + exp_len[worker] + local_task_length[worker_in_ctx];
-
-      if (exp_end[worker_in_ctx] < best_exp_end)
-	{
-	  /* a better solution was found */
-	  best_exp_end = exp_end[worker_in_ctx];
-	}
+	*forced_best = unknown?ntasks_best:-1;
 
-      if (local_power[worker_in_ctx] == -1.0)
-	local_power[worker_in_ctx] = 0.;
-    }
+	*best_exp_endp = best_exp_end;
+	*max_exp_endp = max_exp_end;
 
-  *forced_best = unknown?ntasks_best:-1;
-  *best_exp_endp = best_exp_end;
-  *max_exp_endp = max_exp_end;
+	/* save the best implementation */
+	//_STARPU_DEBUG("Scheduler heft: kernel (%u)\n", best_impl);
+	_starpu_get_job_associated_to_task(task)->nimpl = best_impl;
 }
 
 static int _heft_push_task(struct starpu_task *task, unsigned prio, unsigned sched_ctx_id)
@@ -364,7 +379,7 @@ static int _heft_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 	{
 		worker = sched_ctx->workerid[worker_in_ctx];
 
-		if (!starpu_worker_may_execute_task(worker, task))
+		if (!starpu_worker_may_execute_task(worker, task, 0))
 		{
 			/* no one on that queue may execute this task */
 			continue;
@@ -400,7 +415,8 @@ static int _heft_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 		/* If we have a task bundle, we have computed the expected
 		 * length for the entire bundle, but not for the task alone. */
 		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(best);
-		model_best = starpu_task_expected_length(task, perf_arch);
+		model_best = starpu_task_expected_length(task, perf_arch,
+				_starpu_get_job_associated_to_task(task)->nimpl);
 
 		/* Remove the task from the bundle since we have made a
 		 * decision for it, and that other tasks should not consider it

+ 8 - 7
src/sched_policies/parallel_greedy.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -98,15 +99,15 @@ static void initialize_pgreedy_policy(unsigned sched_ctx_id)
 	PTHREAD_COND_INIT(&sched_cond, NULL);
 
 	for (workerid_ctx = 0; workerid_ctx < nworkers_in_ctx; workerid_ctx++)
-	  {
-                workerid = sched_ctx->workerid[workerid_ctx];
+	{
+      	workerid = sched_ctx->workerid[workerid_ctx];
 
 		PTHREAD_MUTEX_INIT(&master_sched_mutex[workerid], NULL);
 		PTHREAD_COND_INIT(&master_sched_cond[workerid], NULL);
 	}
 	for (workerid_ctx = 0; workerid_ctx < nworkers_in_ctx; workerid_ctx++)
-          {
-	        workerid = sched_ctx->workerid[workerid_ctx];
+    {
+		workerid = sched_ctx->workerid[workerid_ctx];
 
 		/* slaves pick up tasks from their local queue, their master
 		 * will put tasks directly in that local list when a parallel
@@ -130,8 +131,8 @@ static void initialize_pgreedy_policy(unsigned sched_ctx_id)
 
 #if 0
 	for (workerid_ctx = 0; workerid_ctx < nworkers_in_ctx; workerid_ctx++)
-          {
-                workerid = sched_ctx->workerid[workerid_ctx];
+	{
+        workerid = sched_ctx->workerid[workerid_ctx];
 
 		fprintf(stderr, "MASTER of %d = %d\n", workerid, master_id[workerid]);
 	}
@@ -178,7 +179,7 @@ static struct starpu_task *pop_task_pgreedy_policy(void)
 			if (possible_combinations_size[workerid][i] > best_size)
 			{
 				int combined_worker = possible_combinations[workerid][i];
-				if (starpu_combined_worker_may_execute_task(combined_worker, task))
+				if (starpu_combined_worker_may_execute_task(combined_worker, task, 0))
 				{
 					best_size = possible_combinations_size[workerid][i];
 					best_workerid = combined_worker;

+ 57 - 42
src/sched_policies/parallel_heft.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -225,62 +226,73 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio)
 			max_exp_end = worker_exp_end[worker];
 	}
 
+	unsigned nimpl;
+	unsigned best_impl = 0;
 	for (worker = 0; worker < (nworkers+ncombinedworkers); worker++)
 	{
-		if (!starpu_combined_worker_may_execute_task(worker, task))
+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 		{
-			/* no one on that queue may execute this task */
-			skip_worker[worker] = 1;
-			continue;
-		}
-		else {
-			skip_worker[worker] = 0;
-		}
+			if (!starpu_combined_worker_may_execute_task(worker, task, nimpl))
+			{
+				/* no one on that queue may execute this task */
+				skip_worker[worker] = 1;
+				continue;
+			}
+			else {
+				skip_worker[worker] = 0;
+			}
 
-		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
-		local_task_length[worker] = starpu_task_expected_length(task, perf_arch);
+			enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
 
-		unsigned memory_node = starpu_worker_get_memory_node(worker);
-		local_data_penalty[worker] = starpu_task_expected_data_transfer_time(memory_node, task);
+			local_task_length[worker] = starpu_task_expected_length(task, perf_arch,nimpl);
 
-		double ntasks_end = compute_ntasks_end(worker);
+			unsigned memory_node = starpu_worker_get_memory_node(worker);
+			local_data_penalty[worker] = starpu_task_expected_data_transfer_time(memory_node, task);
 
-		if (ntasks_best == -1
-				|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
-				|| (!calibrating && local_task_length[worker] == -1.0) /* Not calibrating but this worker is being calibrated */
-				|| (calibrating && local_task_length[worker] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
-				) {
-			ntasks_best_end = ntasks_end;
-			ntasks_best = worker;
-		}
+			double ntasks_end = compute_ntasks_end(worker);
 
-		if (local_task_length[worker] == -1.0)
-			/* we are calibrating, we want to speed-up calibration time
-			 * so we privilege non-calibrated tasks (but still
-			 * greedily distribute them to avoid dumb schedules) */
-			calibrating = 1;
+			if (ntasks_best == -1
+					|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
+					|| (!calibrating && local_task_length[worker] == -1.0) /* Not calibrating but this worker is being calibrated */
+					|| (calibrating && local_task_length[worker] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
+					) {
+				ntasks_best_end = ntasks_end;
+				ntasks_best = worker;
+			}
 
-		if (local_task_length[worker] <= 0.0)
-			/* there is no prediction available for that task
-			 * with that arch yet, so switch to a greedy strategy */
-			unknown = 1;
+			if (local_task_length[worker] == -1.0)
+				/* we are calibrating, we want to speed-up calibration time
+				 * so we privilege non-calibrated tasks (but still
+				 * greedily distribute them to avoid dumb schedules) */
+				calibrating = 1;
 
-		if (unknown)
-			continue;
+			if (local_task_length[worker] <= 0.0)
+				/* there is no prediction available for that task
+				 * with that arch yet, so switch to a greedy strategy */
+				unknown = 1;
 
-		local_exp_end[worker] = compute_expected_end(worker, local_task_length[worker]);
+			if (unknown)
+				continue;
 
-		//fprintf(stderr, "WORKER %d -> length %e end %e\n", worker, local_task_length[worker], local_exp_end[worker]);
+			local_exp_end[worker] = compute_expected_end(worker, local_task_length[worker]);
 
-		if (local_exp_end[worker] < best_exp_end)
-		{
-			/* a better solution was found */
-			best_exp_end = local_exp_end[worker];
-		}
+			//fprintf(stderr, "WORKER %d -> length %e end %e\n", worker, local_task_length[worker], local_exp_end[worker]);
+
+			if (local_exp_end[worker] < best_exp_end)
+			{
+				/* a better solution was found */
+				best_exp_end = local_exp_end[worker];
+				best_impl = nimpl;
+			}
 
-		local_power[worker] = starpu_task_expected_power(task, perf_arch);
-		if (local_power[worker] == -1.0)
-			local_power[worker] = 0.;
+
+			local_power[worker] = starpu_task_expected_power(task, perf_arch,nimpl);
+			//_STARPU_DEBUG("Scheduler parallel heft: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker],local_power[worker],worker,nimpl);
+
+			if (local_power[worker] == -1.0)
+				local_power[worker] = 0.;
+
+		} //end for
 	}
 
 	if (unknown)
@@ -338,6 +350,9 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio)
 		best_exp_end = local_exp_end[best];
 	}
 
+
+	//_STARPU_DEBUG("Scheduler parallel heft: kernel (%u)\n", best_impl);
+	_starpu_get_job_associated_to_task(task)->nimpl = best_impl;
 	/* we should now have the best worker in variable "best" */
 	return push_task_on_best_worker(task, best, best_exp_end, prio);
 }

+ 1 - 1
src/sched_policies/random_policy.c

@@ -45,7 +45,7 @@ static int _random_push_task(struct starpu_task *task, unsigned prio, struct sta
 	double alpha = 0.0;
 	for (worker_in_ctx = 0; worker_in_ctx < nworkers; worker_in_ctx++)
 	{
-                worker = sched_ctx->workerid[worker_in_ctx];
+        worker = sched_ctx->workerid[worker_in_ctx];
 
 		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
 		double worker_alpha = starpu_worker_get_relative_speedup(perf_arch);

+ 1 - 1
src/sched_policies/stack_queues.c

@@ -34,7 +34,7 @@ void _starpu_init_stack_queues_mechanisms(void)
 struct starpu_stack_jobq_s *_starpu_create_stack(void)
 {
 	struct starpu_stack_jobq_s *stack;
-	stack = malloc(sizeof(struct starpu_stack_jobq_s));
+	stack = (struct starpu_stack_jobq_s *) malloc(sizeof(struct starpu_stack_jobq_s));
 
 	stack->jobq = starpu_job_list_new();
 	stack->njobs = 0;