14 years ago · bb184c7ae9
--- a/simple_ex/Makefile
+++ b/simple_ex/Makefile
@@ -1,29 +0,0 @@
 
																-PROG=exemple
															
 
																-CUDA_SDK_ROOT=/usr/local/cuda/sdk/C
															
 
																-CUDA_HOME=/usr/local/cuda
															
 
																-
															
 
																-.PHONY: all
															
 
																-
															
 
																-all: $(PROG)
															
 
																-
															
 
																-
															
 
																-CC      := gcc
															
 
																-NVCC    := /usr/local/cuda/bin/nvcc
															
 
																-CFLAGS  := $$(pkg-config --cflags libstarpu) -g #-Wall
															
 
																-LDFLAGS := $$(pkg-config --libs libstarpu)
															
 
																-CUDADIR=$(CUDA_HOME)
															
 
																-
															
 
																-COMMONFLAGS += -I. -I$(CUDADIR)/include  -I$(CUDA_SDK_ROOT)/common/inc -DUNIX -g -Xcompiler
															
 
																-NVCCFLAGS +=  -I$(CUDA_SDK_ROOT)/common/inc -I. -G
															
 
																-
															
 
																-%.o: %.cu
															
 
																-	$(NVCC) $(CFLAGS) $(COMMONFLAGS) $(NVCCFLAGS) -o $@ -c $< 
															
 
																-
															
 
																-%.o: %.c
															
 
																-	$(CC) $(CFLAGS) -o $@ -c $< 
															
 
																-
															
 
																-$(PROG): %: %.o %_kernel.o
															
 
																-	$(CC) -o $@ $^ $(LDFLAGS) -L$(CUDADIR)/lib -lcudart
															
 
																-
															
 
																-clean:
															
 
																-	rm -f $(PROG) *.o
															
--- a/simple_ex/exemple.c
+++ b/simple_ex/exemple.c
@@ -1,160 +0,0 @@
 
																-#include <stdio.h>
															
 
																-#include <malloc.h>
															
 
																-#include <starpu.h>
															
 
																-
															
 
																-static inline void my_codelet_cpu(void *descr[], void *_args)
															
 
																-{
															
 
																-  unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
															
 
																-  float *sub = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
															
 
																-
															
 
																-  unsigned i;
															
 
																-
															
 
																-  for(i = 0; i < nx; i++){
															
 
																-    sub[i] *= 5;
															
 
																-  }
															
 
																-}
															
 
																-
															
 
																-extern void my_codelet_gpu(void *descr[], __attribute__ ((unused)) void *_args);
															
 
																-
															
 
																-
															
 
																-static starpu_codelet cl = 
															
 
																-  {
															
 
																-    .where = STARPU_CPU|STARPU_CUDA,
															
 
																-    .cpu_func = my_codelet_cpu,
															
 
																-    .cuda_func = my_codelet_gpu,
															
 
																-    .nbuffers = 1
															
 
																-  };
															
 
																-
															
 
																-void print_vect(int *vect, int size){
															
 
																-  unsigned i;
															
 
																-  for(i = 0; i < size; i++)
															
 
																-    printf("%d ", vect[i]);
															
 
																-  printf("\n");
															
 
																-  
															
 
																-}
															
 
																-
															
 
																-int main(int argc, char **argv)
															
 
																-{
															
 
																-  srand(time(NULL));
															
 
																-  int *mat;
															
 
																-  unsigned size = 20, children = 5;
															
 
																-  mat = (int *)malloc(size*sizeof(int));
															
 
																-
															
 
																-  unsigned i;
															
 
																-  for(i = 0; i < size; i++)
															
 
																-    {
															
 
																-      mat[i] = random()% 10 + 1;
															
 
																-    }
															
 
																-
															
 
																-  print_vect(mat, size);
															
 
																-
															
 
																-  //  struct starpu_conf conf;
															
 
																-  //conf.sched_policy_name = "heft-tm";
															
 
																-  //conf.ncpus = -1;
															
 
																-  // printf("got here \n");
															
 
																-  starpu_init(NULL);
															
 
																-
															
 
																-  for(i = 0; i < 12; i++)
															
 
																-    printf("%d: arch is %d\n", starpu_worker_get_type(i));
															
 
																-
															
 
																-  starpu_data_handle dataA;
															
 
																-  starpu_vector_data_register(&dataA, 0, (uintptr_t)mat, size, sizeof(mat[00]));
															
 
																-
															
 
																-  struct starpu_data_filter f =
															
 
																-    {
															
 
																-      .filter_func = starpu_block_filter_func_vector,
															
 
																-      .nchildren = children,
															
 
																-      .get_nchildren = NULL,
															
 
																-      .get_child_ops = NULL
															
 
																-    };
															
 
																-  starpu_data_partition(dataA, &f);
															
 
																-
															
 
																-  struct starpu_sched_ctx sched_ctx;
															
 
																-  int procs[] = {1, 2, 3};
															
 
																-  starpu_create_sched_ctx(&sched_ctx, "heft", procs, 3);
															
 
																-
															
 
																-  unsigned j;
															
 
																-  for(j = 0; j < children; j++){
															
 
																-    struct starpu_task *task = starpu_task_create();
															
 
																-    task->cl = &cl;
															
 
																-    task->synchronous = 1;
															
 
																-    task->callback_func = NULL;
															
 
																-    task->buffers[0].handle = starpu_data_get_sub_data(dataA, 1, j);
															
 
																-    task->buffers[0].mode = STARPU_RW;
															
 
																-    task->name = "first 1 2 3";  
															
 
																-    starpu_task_submit_to_ctx(task, &sched_ctx);
															
 
																-  }
															
 
																-
															
 
																-  int procs_to_remove[]={1,3};
															
 
																-  starpu_remove_workers_from_sched_ctx(procs_to_remove, 2, &sched_ctx);
															
 
																-
															
 
																-  printf("procs removed \n");
															
 
																-
															
 
																-  for(j = 0; j < children; j++){
															
 
																-    struct starpu_task *task = starpu_task_create();
															
 
																-    task->cl = &cl;
															
 
																-    task->synchronous = 1;
															
 
																-    task->callback_func = NULL;
															
 
																-    task->buffers[0].handle = starpu_data_get_sub_data(dataA, 1, j);
															
 
																-    task->buffers[0].mode = STARPU_RW;
															
 
																-    task->name = "first 2";  
															
 
																-    starpu_task_submit_to_ctx(task, &sched_ctx);
															
 
																-  }
															
 
																-
															
 
																-  int procs_to_add[]={1, 4, 5};
															
 
																-  starpu_add_workers_to_sched_ctx(procs_to_add, 2, &sched_ctx);
															
 
																-
															
 
																-  printf("procs add \n");
															
 
																-
															
 
																-  for(j = 0; j < children; j++){
															
 
																-    struct starpu_task *task = starpu_task_create();
															
 
																-    task->cl = &cl;
															
 
																-    task->synchronous = 1;
															
 
																-    task->callback_func = NULL;
															
 
																-    task->buffers[0].handle = starpu_data_get_sub_data(dataA, 1, j);
															
 
																-    task->buffers[0].mode = STARPU_RW;
															
 
																-    task->name = "first 1 2 4 5";  
															
 
																-    starpu_task_submit_to_ctx(task, &sched_ctx);
															
 
																-  }
															
 
																-
															
 
																-
															
 
																-  struct starpu_sched_ctx sched_ctx2;
															
 
																-  int procs2[]={3, 4, 5, 6, 7};
															
 
																-  starpu_create_sched_ctx(&sched_ctx2, "random", procs2, 5);
															
 
																-
															
 
																-  for(j = 0; j < children; j++){
															
 
																-    struct starpu_task *task3 = starpu_task_create();
															
 
																-    task3->cl = &cl;
															
 
																-    task3->synchronous = 1;
															
 
																-    task3->callback_func = NULL;
															
 
																-    task3->buffers[0].handle = starpu_data_get_sub_data(dataA, 1, j);
															
 
																-    task3->buffers[0].mode = STARPU_RW;
															
 
																-    task3->name = "third 3 4 5 6 7";
															
 
																-    starpu_task_submit_to_ctx(task3, &sched_ctx2);
															
 
																-  }
															
 
																-
															
 
																-  for(j = 0; j < children; j++){
															
 
																-    struct starpu_task *task2 = starpu_task_create();
															
 
																-    task2->cl = &cl;
															
 
																-    task2->synchronous = 1;
															
 
																-    task2->callback_func = NULL;
															
 
																-    task2->buffers[0].handle = starpu_data_get_sub_data(dataA, 1, j);
															
 
																-    task2->buffers[0].mode = STARPU_RW;
															
 
																-    task2->name = "anything";
															
 
																-    starpu_task_submit(task2);
															
 
																-  }
															
 
																-  
															
 
																-  printf("wait for all \n");
															
 
																-  starpu_task_wait_for_all();
															
 
																-  starpu_data_unpartition(dataA, 0);
															
 
																-
															
 
																-  printf("data unregister  \n");
															
 
																-  starpu_data_unregister(dataA);
															
 
																-  
															
 
																-  printf("the end \n");
															
 
																-  starpu_shutdown();
															
 
																-
															
 
																-  print_vect(mat, size);
															
 
																-  
															
 
																-  return 0;
															
 
																-}
															
--- a/simple_ex/exemple_kernel.cu
+++ b/simple_ex/exemple_kernel.cu
@@ -1,22 +0,0 @@
 
																-#include <starpu.h>
															
 
																-#include <starpu_cuda.h>
															
 
																-
															
 
																-static __global__ void myf(int *dMatA)
															
 
																-{
															
 
																-  int tidy = threadIdx.y;
															
 
																-
															
 
																-  dMatA[ tidy ] = dMatA[ tidy ]  * 5;
															
 
																-}
															
 
																-
															
 
																-extern "C" void my_codelet_gpu(void *descr[], void *_args)
															
 
																-{
															
 
																-  unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
															
 
																-  int *sub = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
															
 
																-
															
 
																-  dim3 dimGrid(1,1);
															
 
																-  dim3 dimBlock(nx,nx);
															
 
																-
															
 
																-  myf<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>(sub);
															
 
																- 
															
 
																-  cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																-}
															
--- a/src/sched_policies/deque_modeling_policy_data_aware.c
+++ b/src/sched_policies/deque_modeling_policy_data_aware.c
@@ -2,6 +2,7 @@
 
																  *
															
 
																  * Copyright (C) 2010, 2011  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2011  Télécom-SudParis
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -328,59 +329,67 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, struct starpu_
 
																 	/* A priori, we know all estimations */
															
 
																 	int unknown = 0;
															
 
																+	unsigned best_impl = 0;
															
 
																+	unsigned nimpl;
															
 
																 	unsigned nworkers = sched_ctx->nworkers_in_ctx;
															
 
																 	for (worker_in_ctx = 0; worker_in_ctx < nworkers; worker_in_ctx++)
															
 
																 	{
															
 
																-        worker = sched_ctx->workerid[worker_in_ctx];
															
 
																-		double exp_end;
															
 
																+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
															
 
																+		{
															
 
																+        	worker = sched_ctx->workerid[worker_in_ctx];
															
 
																+			double exp_end;
															
 
																-		fifo = dt->queue_array[worker_in_ctx];
															
 
																+			fifo = dt->queue_array[worker_in_ctx];
															
 
																-		/* Sometimes workers didn't take the tasks as early as we expected */
															
 
																-		fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
															
 
																-		fifo->exp_end = fifo->exp_start + fifo->exp_len;
															
 
																+			/* Sometimes workers didn't take the tasks as early as we expected */
															
 
																+			fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
															
 
																+			fifo->exp_end = fifo->exp_start + fifo->exp_len;
															
 
																-		if (!starpu_worker_may_execute_task(worker, task))
															
 
																-		{
															
 
																-			/* no one on that queue may execute this task */
															
 
																-			continue;
															
 
																-		}
															
 
																+			if (!starpu_worker_may_execute_task(worker, task, nimpl))
															
 
																+			{
															
 
																+				/* no one on that queue may execute this task */
															
 
																+				continue;
															
 
																+			}
															
 
																-		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
															
 
																-		double local_length = starpu_task_expected_length(task, perf_arch);
															
 
																-		double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
															
 
																-
															
 
																-		if (ntasks_best == -1
															
 
																-				|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
															
 
																-				|| (!calibrating && local_length == -1.0) /* Not calibrating but this worker is being calibrated */
															
 
																-				|| (calibrating && local_length == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
															
 
																-				) {
															
 
																-			ntasks_best_end = ntasks_end;
															
 
																-			ntasks_best = worker;
															
 
																-		}
															
 
																+			enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
															
 
																+			double local_length = starpu_task_expected_length(task, perf_arch, nimpl);
															
 
																+			double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
															
 
																-		if (local_length == -1.0)
															
 
																-			/* we are calibrating, we want to speed-up calibration time
															
 
																-			 * so we privilege non-calibrated tasks (but still
															
 
																-			 * greedily distribute them to avoid dumb schedules) */
															
 
																-			calibrating = 1;
															
 
																+			//_STARPU_DEBUG("Scheduler dm: task length (%lf) worker (%u) kernel (%u) \n", local_length,worker,nimpl);
															
 
																-		if (local_length <= 0.0)
															
 
																-			/* there is no prediction available for that task
															
 
																-			 * with that arch yet, so switch to a greedy strategy */
															
 
																-			unknown = 1;
															
 
																+			if (ntasks_best == -1
															
 
																+					|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
															
 
																+					|| (!calibrating && local_length == -1.0) /* Not calibrating but this worker is being calibrated */
															
 
																+					|| (calibrating && local_length == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
															
 
																+					) {
															
 
																+				ntasks_best_end = ntasks_end;
															
 
																+				ntasks_best = worker;
															
 
																+			}
															
 
																-		if (unknown)
															
 
																-			continue;
															
 
																+			if (local_length == -1.0)
															
 
																+				/* we are calibrating, we want to speed-up calibration time
															
 
																+				 * so we privilege non-calibrated tasks (but still
															
 
																+				 * greedily distribute them to avoid dumb schedules) */
															
 
																+				calibrating = 1;
															
 
																-		exp_end = fifo->exp_start + fifo->exp_len + local_length;
															
 
																+			if (local_length <= 0.0)
															
 
																+				/* there is no prediction available for that task
															
 
																+				 * with that arch yet, so switch to a greedy strategy */
															
 
																+				unknown = 1;
															
 
																-		if (best == -1 || exp_end < best_exp_end)
															
 
																-		{
															
 
																-			/* a better solution was found */
															
 
																-			best_exp_end = exp_end;
															
 
																-			best = worker;
															
 
																-			model_best = local_length;
															
 
																+			if (unknown)
															
 
																+				continue;
															
 
																+
															
 
																+			exp_end = fifo->exp_start + fifo->exp_len + local_length;
															
 
																+
															
 
																+			if (best == -1 || exp_end < best_exp_end)
															
 
																+			{
															
 
																+				/* a better solution was found */
															
 
																+				best_exp_end = exp_end;
															
 
																+				best = worker;
															
 
																+				model_best = local_length;
															
 
																+				best_impl = nimpl;
															
 
																+			}
															
 
																 		}
															
 
																 	}
															
@@ -390,6 +399,11 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, struct starpu_
 
																 	}
															
 
																 	_starpu_increment_nsubmitted_tasks_of_worker(best);
															
 
																+
															
 
																+	//_STARPU_DEBUG("Scheduler dm: kernel (%u)\n", best_impl);
															
 
																+
															
 
																+	 _starpu_get_job_associated_to_task(task)->nimpl = 0;//best_impl;
															
 
																+
															
 
																 	/* we should now have the best worker in variable "best" */
															
 
																 	return push_task_on_best_worker(task, best, model_best, prio, sched_ctx);
															
 
																 }
															
@@ -426,82 +440,89 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, struct starp
 
																 	/* A priori, we know all estimations */
															
 
																 	int unknown = 0;
															
 
																+	unsigned best_impl = 0;
															
 
																+	unsigned nimpl=0;
															
 
																 	for (worker_in_ctx = 0; worker_in_ctx < nworkers_in_ctx; worker_in_ctx++)
															
 
																 	{
															
 
																         worker = sched_ctx->workerid[worker_in_ctx];
															
 
																+		for(nimpl  = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
															
 
																+	 	{
															
 
																+			fifo = dt->queue_array[worker_in_ctx];
															
 
																-		fifo = dt->queue_array[worker_in_ctx];
															
 
																+			/* Sometimes workers didn't take the tasks as early as we expected */
															
 
																+			fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
															
 
																+			fifo->exp_end = fifo->exp_start + fifo->exp_len;
															
 
																+			if (fifo->exp_end > max_exp_end)
															
 
																+				max_exp_end = fifo->exp_end;
															
 
																-		/* Sometimes workers didn't take the tasks as early as we expected */
															
 
																-		fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
															
 
																-		fifo->exp_end = fifo->exp_start + fifo->exp_len;
															
 
																-		if (fifo->exp_end > max_exp_end)
															
 
																-			max_exp_end = fifo->exp_end;
															
 
																+			if (!starpu_worker_may_execute_task(worker, task, nimpl))
															
 
																+			{
															
 
																+				/* no one on that queue may execute this task */
															
 
																+				continue;
															
 
																+			}
															
 
																-		if (!starpu_worker_may_execute_task(worker, task))
															
 
																-		{
															
 
																-			/* no one on that queue may execute this task */
															
 
																-			continue;
															
 
																-		}
															
 
																+			enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
															
 
																+			local_task_length[worker_in_ctx] = starpu_task_expected_length(task, perf_arch);
															
 
																-		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
															
 
																-		local_task_length[worker_in_ctx] = starpu_task_expected_length(task, perf_arch);
															
 
																+			//_STARPU_DEBUG("Scheduler dmda: task length (%lf) worker (%u) kernel (%u) \n", local_task_length[worker],worker,nimpl);
															
 
																-		unsigned memory_node = starpu_worker_get_memory_node(worker);
															
 
																-		local_data_penalty[worker_in_ctx] = starpu_task_expected_data_transfer_time(memory_node, task);
															
 
																+			unsigned memory_node = starpu_worker_get_memory_node(worker);
															
 
																+			local_data_penalty[worker_in_ctx] = starpu_task_expected_data_transfer_time(memory_node, task);
															
 
																-		double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
															
 
																+			double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
															
 
																-		if (ntasks_best == -1
															
 
																-				|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
															
 
																-				|| (!calibrating && local_task_length[worker] == -1.0) /* Not calibrating but this worker is being calibrated */
															
 
																-				|| (calibrating && local_task_length[worker] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
															
 
																-				) {
															
 
																-			ntasks_best_end = ntasks_end;
															
 
																-			ntasks_best = worker;
															
 
																-		}
															
 
																+			if (ntasks_best == -1
															
 
																+					|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
															
 
																+					|| (!calibrating && local_task_length[worker] == -1.0) /* Not calibrating but this worker is being calibrated */
															
 
																+					|| (calibrating && local_task_length[worker] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
															
 
																+					) {
															
 
																+				ntasks_best_end = ntasks_end;
															
 
																+				ntasks_best = worker;
															
 
																+			}
															
 
																-		if (local_task_length[worker_in_ctx] == -1.0)
															
 
																-			/* we are calibrating, we want to speed-up calibration time
															
 
																-			 * so we privilege non-calibrated tasks (but still
															
 
																-			 * greedily distribute them to avoid dumb schedules) */
															
 
																-			calibrating = 1;
															
 
																+			if (local_task_length[worker_in_ctx] == -1.0)
															
 
																+				/* we are calibrating, we want to speed-up calibration time
															
 
																+			 	* so we privilege non-calibrated tasks (but still
															
 
																+			 	* greedily distribute them to avoid dumb schedules) */
															
 
																+				calibrating = 1;
															
 
																-		if (local_task_length[worker_in_ctx] <= 0.0)
															
 
																-			/* there is no prediction available for that task
															
 
																-			 * with that arch yet, so switch to a greedy strategy */
															
 
																-			unknown = 1;
															
 
																+			if (local_task_length[worker_in_ctx] <= 0.0)
															
 
																+				/* there is no prediction available for that task
															
 
																+			 	* with that arch yet, so switch to a greedy strategy */
															
 
																+				unknown = 1;
															
 
																-		if (unknown)
															
 
																-			continue;
															
 
																+			if (unknown)
															
 
																+				continue;
															
 
																-		exp_end[worker_in_ctx] = fifo->exp_start + fifo->exp_len + local_task_length[worker_in_ctx];
															
 
																+			exp_end[worker_in_ctx] = fifo->exp_start + fifo->exp_len + local_task_length[worker_in_ctx];
															
 
																-		if (exp_end[worker_in_ctx] < best_exp_end)
															
 
																-		{
															
 
																-			/* a better solution was found */
															
 
																-			best_exp_end = exp_end[worker_in_ctx];
															
 
																-		}
															
 
																+			if (exp_end[worker_in_ctx] < best_exp_end)
															
 
																+			{
															
 
																+				/* a better solution was found */
															
 
																+				best_exp_end = exp_end[worker_in_ctx];
															
 
																+				best_impl = nimpl;
															
 
																+			}
															
 
																-		local_power[worker_in_ctx] = starpu_task_expected_power(task, perf_arch);
															
 
																-		if (local_power[worker_in_ctx] == -1.0)
															
 
																-			local_power[worker_in_ctx] = 0.;
															
 
																-	}
															
 
																+			local_power[worker_in_ctx] = starpu_task_expected_power(task, perf_arch, nimpl);
															
 
																+			if (local_power[worker_in_ctx] == -1.0)
															
 
																+				local_power[worker_in_ctx] = 0.;
															
 
																+			}	
															
 
																+		}
															
 
																-	if (unknown)
															
 
																-		forced_best = ntasks_best;
															
 
																+		if (unknown)
															
 
																+			forced_best = ntasks_best;
															
 
																-	double best_fitness = -1;
															
 
																+		double best_fitness = -1;
															
 
																-	if (forced_best == -1)
															
 
																-	{
															
 
																+		if (forced_best == -1)
															
 
																+		{
															
 
																 	        for (worker_in_ctx = 0; worker_in_ctx < nworkers_in_ctx; worker_in_ctx++)
															
 
																 	        {
															
 
																 		        worker = sched_ctx->workerid[worker_in_ctx];
															
 
																-			fifo = dt->queue_array[worker_in_ctx];
															
 
																+				fifo = dt->queue_array[worker_in_ctx];
															
 
																-			if (!starpu_worker_may_execute_task(worker, task))
															
 
																+			if (!starpu_worker_may_execute_task(worker, task, 0))
															
 
																 			{
															
 
																 				/* no one on that queue may execute this task */
															
 
																 				continue;
															
@@ -546,6 +567,10 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, struct starp
 
																 		//penality_best = local_data_penalty[best];
															
 
																 	}
															
 
																+
															
 
																+	//_STARPU_DEBUG("Scheduler dmda: kernel (%u)\n", best_impl);
															
 
																+	 _starpu_get_job_associated_to_task(task)->nimpl = best_impl;
															
 
																+
															
 
																 	/* we should now have the best worker in variable "best" */
															
 
																 	return push_task_on_best_worker(task, best, model_best, prio, sched_ctx);
															
 
																 }
															
@@ -559,18 +584,12 @@ static int dmda_push_sorted_task(struct starpu_task *task, unsigned sched_ctx_id
 
																 static int dm_push_task(struct starpu_task *task, unsigned sched_ctx_id)
															
 
																 {
															
 
																 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx(sched_ctx_id);
															
 
																-	if (task->priority > 0)
															
 
																-		return _dm_push_task(task, 1, sched_ctx);
															
 
																-
															
 
																 	return _dm_push_task(task, 0, sched_ctx);
															
 
																 }
															
 
																 static int dmda_push_task(struct starpu_task *task, unsigned sched_ctx_id)
															
 
																 {
															
 
																 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx(sched_ctx_id);
															
 
																-	if (task->priority > 0)
															
 
																-		return _dmda_push_task(task, 1, sched_ctx);
															
 
																-
															
 
																 	return _dmda_push_task(task, 0, sched_ctx);
															
 
																 }
															
--- a/src/sched_policies/deque_queues.c
+++ b/src/sched_policies/deque_queues.c
@@ -2,6 +2,7 @@
 
																  *
															
 
																  * Copyright (C) 2010  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2011  Télécom-SudParis
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -27,7 +28,7 @@
 
																 struct starpu_deque_jobq_s *_starpu_create_deque(void)
															
 
																 {
															
 
																 	struct starpu_deque_jobq_s *deque;
															
 
																-	deque = malloc(sizeof(struct starpu_deque_jobq_s));
															
 
																+	deque = (struct starpu_deque_jobq_s *) malloc(sizeof(struct starpu_deque_jobq_s));
															
 
																 	/* note that not all mechanisms (eg. the semaphore) have to be used */
															
 
																 	deque->jobq = starpu_job_list_new();
															
@@ -109,7 +110,11 @@ struct starpu_job_list_s *_starpu_deque_pop_every_task(struct starpu_deque_jobq_
 
																 		{
															
 
																 			next_job = starpu_job_list_next(i);
															
 
																-			if (starpu_worker_may_execute_task(workerid, i->task))
															
 
																+			/* In case there are multiples implementations of the
															
 
																+ 			 * codelet for a single device, We dont really care
															
 
																+			 * about the implementation used, so let's try the 
															
 
																+			 * first one. */
															
 
																+			if (starpu_worker_may_execute_task(workerid, i->task, 0))
															
 
																 			{
															
 
																 				/* this elements can be moved into the new list */
															
 
																 				new_list_size++;
															
--- a/src/sched_policies/detect_combined_workers.c
+++ b/src/sched_policies/detect_combined_workers.c
@@ -62,7 +62,7 @@ static int find_combinations_with_hwloc_rec(hwloc_obj_t obj, int *worker_array,
 
																 	int worker_array_rec[STARPU_NMAXWORKERS];
															
 
																 	int worker_cnt_rec = 0;
															
 
																-	memset(worker_array_rec, 0, sizeof(worker_array_rec));
															
 
																+	memset(worker_array_rec, 0, sizeof(int)*STARPU_NMAXWORKERS);
															
 
																 	unsigned i;
															
 
																 	for (i = 0; i < obj->arity; i++)
															
@@ -169,16 +169,15 @@ static void combine_all_cpu_workers(struct starpu_machine_topology_s *topology)
 
																 void _starpu_sched_find_worker_combinations(struct starpu_machine_topology_s *topology)
															
 
																 {
															
 
																-	struct starpu_machine_config_s *config = _starpu_get_machine_config();
															
 
																+    struct starpu_machine_config_s *config = _starpu_get_machine_config();
															
 
																-	if (config->user_conf && config->user_conf->single_combined_worker)
															
 
																-		combine_all_cpu_workers(topology);
															
 
																-	else {
															
 
																+    if (config->user_conf && config->user_conf->single_combined_worker > 0 || starpu_get_env_number("STARPU_SINGLE_COMBINED_WORKER") > 0)
															
 
																+	combine_all_cpu_workers(topology);
															
 
																+    else {
															
 
																 #ifdef STARPU_HAVE_HWLOC
															
 
																-		find_combinations_with_hwloc(topology);
															
 
																-		//find_combinations_without_hwloc(topology);
															
 
																+	find_combinations_with_hwloc(topology);
															
 
																 #else
															
 
																-		find_combinations_without_hwloc(topology);
															
 
																+	find_combinations_without_hwloc(topology);
															
 
																 #endif
															
 
																-	}
															
 
																+    }
															
 
																 }
															
--- a/src/sched_policies/eager_central_priority_policy.c
+++ b/src/sched_policies/eager_central_priority_policy.c
@@ -48,7 +48,7 @@ static struct starpu_priority_taskq_s *_starpu_create_priority_taskq(void)
 
																 {
															
 
																 	struct starpu_priority_taskq_s *central_queue;
															
 
																-	central_queue = malloc(sizeof(struct starpu_priority_taskq_s));
															
 
																+	central_queue = (struct starpu_priority_taskq_s *) malloc(sizeof(struct starpu_priority_taskq_s));
															
 
																 	central_queue->total_ntasks = 0;
															
 
																 	unsigned prio;
															
--- a/src/sched_policies/fifo_queues.c
+++ b/src/sched_policies/fifo_queues.c
@@ -2,6 +2,7 @@
 
																  *
															
 
																  * Copyright (C) 2010-2011  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2011  Télécom-SudParis
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -27,7 +28,7 @@
 
																 struct starpu_fifo_taskq_s *_starpu_create_fifo(void)
															
 
																 {
															
 
																 	struct starpu_fifo_taskq_s *fifo;
															
 
																-	fifo = malloc(sizeof(struct starpu_fifo_taskq_s));
															
 
																+	fifo = (struct starpu_fifo_taskq_s *) malloc(sizeof(struct starpu_fifo_taskq_s));
															
 
																 	/* note that not all mechanisms (eg. the semaphore) have to be used */
															
 
																 	starpu_task_list_init(&fifo->taskq);
															
@@ -111,7 +112,7 @@ struct starpu_task *_starpu_fifo_pop_every_task(struct starpu_fifo_taskq_s *fifo
 
																 		{
															
 
																 			next_task = task->next;
															
 
																-			if (starpu_worker_may_execute_task(workerid, task))
															
 
																+			if (starpu_worker_may_execute_task(workerid, task, 0))
															
 
																 			{
															
 
																 				/* this elements can be moved into the new list */
															
 
																 				new_list_size++;
															
--- a/src/sched_policies/heft.c
+++ b/src/sched_policies/heft.c
@@ -2,6 +2,7 @@
 
																  *
															
 
																  * Copyright (C) 2010, 2011  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2011  Télécom-SudParis
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -170,7 +171,9 @@ static void heft_push_task_notify(struct starpu_task *task, int workerid, unsign
 
																 	struct starpu_worker_s *worker = _starpu_get_worker_struct(workerid);
															
 
																 	/* Compute the expected penality */
															
 
																 	enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(workerid);
															
 
																-	double predicted = starpu_task_expected_length(task, perf_arch);
															
 
																+
															
 
																+	double predicted = starpu_task_expected_length(task, perf_arch,
															
 
																+			_starpu_get_job_associated_to_task(task)->nimpl);
															
 
																 	/* Update the predictions */
															
 
																 	PTHREAD_MUTEX_LOCK(worker->sched_mutex);
															
@@ -242,74 +245,86 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 
																   unsigned worker, worker_in_ctx;
															
 
																   for (worker_in_ctx = 0; worker_in_ctx < nworkers; worker_in_ctx++)
															
 
																     {
															
 
																-      worker = sched_ctx->workerid[worker_in_ctx];
															
 
																-      /* Sometimes workers didn't take the tasks as early as we expected */
															
 
																-      exp_start[worker] = STARPU_MAX(exp_start[worker], starpu_timing_now());
															
 
																-      exp_end[worker_in_ctx] = exp_start[worker] + exp_len[worker];
															
 
																-      if (exp_end[worker_in_ctx] > max_exp_end)
															
 
																- 		max_exp_end = exp_end[worker_in_ctx];
															
 
																-
															
 
																-      if (!starpu_worker_may_execute_task(worker, task))
															
 
																-	{
															
 
																-	  /* no one on that queue may execute this task */
															
 
																-	  continue;
															
 
																+		worker = sched_ctx->workerid[worker_in_ctx];
															
 
																+		for (nimpl = 0; nimpl <STARPU_MAXIMPLEMENTATIONS; nimpl++) 
															
 
																+		{
															
 
																+      		/* Sometimes workers didn't take the tasks as early as we expected */
															
 
																+      		exp_start[worker] = STARPU_MAX(exp_start[worker], starpu_timing_now());
															
 
																+      		exp_end[worker_in_ctx] = exp_start[worker] + exp_len[worker];
															
 
																+      		if (exp_end[worker_in_ctx] > max_exp_end)
															
 
																+ 				max_exp_end = exp_end[worker_in_ctx];
															
 
																+
															
 
																+			if (!starpu_worker_may_execute_task(worker, task, nimpl))
															
 
																+			{
															
 
																+				/* no one on that queue may execute this task */
															
 
																+				continue;
															
 
																+			}
															
 
																+
															
 
																+			enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
															
 
																+			unsigned memory_node = starpu_worker_get_memory_node(worker);
															
 
																+
															
 
																+      		if (bundle)
															
 
																+      		{
															
 
																+      			local_task_length[worker_in_ctx] = starpu_task_bundle_expected_length(bundle, perf_arch, nimpl);
															
 
																+      	  		local_data_penalty[worker_in_ctx] = starpu_task_bundle_expected_data_transfer_time(bundle, memory_node);
															
 
																+      	  		local_power[worker_in_ctx] = starpu_task_bundle_expected_power(bundle, perf_arch, nimpl);
															
 
																+				//_STARPU_DEBUG("Scheduler heft bundle: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker_in_ctx],local_power[worker_in_ctx],worker,nimpl);
															
 
																+      		}
															
 
																+      		else 
															
 
																+			{
															
 
																+				local_task_length[worker_in_ctx] = starpu_task_expected_length(task, perf_arch, nimpl);
															
 
																+				local_data_penalty[worker_in_ctx] = starpu_task_expected_data_transfer_time(memory_node, task);
															
 
																+				local_power[worker_in_ctx] = starpu_task_expected_power(task, perf_arch, nimpl);
															
 
																+				//_STARPU_DEBUG("Scheduler heft: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker_in_ctx],local_power[worker_in_ctx],worker,nimpl);
															
 
																+      		}
															
 
																+
															
 
																+      		double ntasks_end = ntasks[worker] / starpu_worker_get_relative_speedup(perf_arch);
															
 
																+
															
 
																+      		if (ntasks_best == -1
															
 
																+	  			|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
															
 
																+	  			|| (!calibrating && local_task_length[worker_in_ctx] == -1.0) /* Not calibrating but this worker is being calibrated */
															
 
																+	  			|| (calibrating && local_task_length[worker_in_ctx] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
															
 
																+	  		) {
															
 
																+				ntasks_best_end = ntasks_end;
															
 
																+				ntasks_best = worker;
															
 
																+      		}
															
 
																+
															
 
																+     		if (local_task_length[worker_in_ctx] == -1.0)
															
 
																+				/* we are calibrating, we want to speed-up calibration time
															
 
																+	 			* so we privilege non-calibrated tasks (but still
															
 
																+	 			* greedily distribute them to avoid dumb schedules) */
															
 
																+				calibrating = 1;
															
 
																+
															
 
																+      		if (local_task_length[worker_in_ctx] <= 0.0)
															
 
																+				/* there is no prediction available for that task
															
 
																+	 			* with that arch yet, so switch to a greedy strategy */
															
 
																+				unknown = 1;
															
 
																+
															
 
																+			if (unknown)
															
 
																+				continue;
															
 
																+
															
 
																+      		exp_end[worker_in_ctx] = exp_start[worker] + exp_len[worker] + local_task_length[worker_in_ctx];
															
 
																+
															
 
																+      		if (exp_end[worker_in_ctx] < best_exp_end)
															
 
																+			{
															
 
																+	  			/* a better solution was found */
															
 
																+	  			best_exp_end = exp_end[worker_in_ctx];
															
 
																+				best_impl = nimpl;
															
 
																+			}
															
 
																+
															
 
																+      		if (local_power[worker_in_ctx] == -1.0)
															
 
																+				local_power[worker_in_ctx] = 0.;
															
 
																+    	}
															
 
																 	}
															
 
																-      enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
															
 
																-      unsigned memory_node = starpu_worker_get_memory_node(worker);
															
 
																-
															
 
																-      if (bundle)
															
 
																-      	{
															
 
																-      	  local_task_length[worker_in_ctx] = starpu_task_bundle_expected_length(bundle, perf_arch);
															
 
																-      	  local_data_penalty[worker_in_ctx] = starpu_task_bundle_expected_data_transfer_time(bundle, memory_node);
															
 
																-      	  local_power[worker_in_ctx] = starpu_task_bundle_expected_power(bundle, perf_arch);
															
 
																-      	}
															
 
																-      else {
															
 
																-	local_task_length[worker_in_ctx] = starpu_task_expected_length(task, perf_arch);
															
 
																-	local_data_penalty[worker_in_ctx] = starpu_task_expected_data_transfer_time(memory_node, task);
															
 
																-	local_power[worker_in_ctx] = starpu_task_expected_power(task, perf_arch);
															
 
																-      }
															
 
																-
															
 
																-      double ntasks_end = ntasks[worker] / starpu_worker_get_relative_speedup(perf_arch);
															
 
																-
															
 
																-      if (ntasks_best == -1
															
 
																-	  || (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
															
 
																-	  || (!calibrating && local_task_length[worker_in_ctx] == -1.0) /* Not calibrating but this worker is being calibrated */
															
 
																-	  || (calibrating && local_task_length[worker_in_ctx] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
															
 
																-	  ) {
															
 
																-	ntasks_best_end = ntasks_end;
															
 
																-	ntasks_best = worker;
															
 
																-      }
															
 
																-
															
 
																-      if (local_task_length[worker_in_ctx] == -1.0)
															
 
																-	/* we are calibrating, we want to speed-up calibration time
															
 
																-	 * so we privilege non-calibrated tasks (but still
															
 
																-	 * greedily distribute them to avoid dumb schedules) */
															
 
																-	calibrating = 1;
															
 
																-
															
 
																-      if (local_task_length[worker_in_ctx] <= 0.0)
															
 
																-	/* there is no prediction available for that task
															
 
																-	 * with that arch yet, so switch to a greedy strategy */
															
 
																-	unknown = 1;
															
 
																-
															
 
																-      if (unknown)
															
 
																-	continue;
															
 
																-
															
 
																-      exp_end[worker_in_ctx] = exp_start[worker] + exp_len[worker] + local_task_length[worker_in_ctx];
															
 
																-
															
 
																-      if (exp_end[worker_in_ctx] < best_exp_end)
															
 
																-	{
															
 
																-	  /* a better solution was found */
															
 
																-	  best_exp_end = exp_end[worker_in_ctx];
															
 
																-	}
															
 
																+	*forced_best = unknown?ntasks_best:-1;
															
 
																-      if (local_power[worker_in_ctx] == -1.0)
															
 
																-	local_power[worker_in_ctx] = 0.;
															
 
																-    }
															
 
																+	*best_exp_endp = best_exp_end;
															
 
																+	*max_exp_endp = max_exp_end;
															
 
																-  *forced_best = unknown?ntasks_best:-1;
															
 
																-  *best_exp_endp = best_exp_end;
															
 
																-  *max_exp_endp = max_exp_end;
															
 
																+	/* save the best implementation */
															
 
																+	//_STARPU_DEBUG("Scheduler heft: kernel (%u)\n", best_impl);
															
 
																+	_starpu_get_job_associated_to_task(task)->nimpl = best_impl;
															
 
																 }
															
 
																 static int _heft_push_task(struct starpu_task *task, unsigned prio, unsigned sched_ctx_id)
															
@@ -364,7 +379,7 @@ static int _heft_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 
																 	{
															
 
																 		worker = sched_ctx->workerid[worker_in_ctx];
															
 
																-		if (!starpu_worker_may_execute_task(worker, task))
															
 
																+		if (!starpu_worker_may_execute_task(worker, task, 0))
															
 
																 		{
															
 
																 			/* no one on that queue may execute this task */
															
 
																 			continue;
															
@@ -400,7 +415,8 @@ static int _heft_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 
																 		/* If we have a task bundle, we have computed the expected
															
 
																 		 * length for the entire bundle, but not for the task alone. */
															
 
																 		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(best);
															
 
																-		model_best = starpu_task_expected_length(task, perf_arch);
															
 
																+		model_best = starpu_task_expected_length(task, perf_arch,
															
 
																+				_starpu_get_job_associated_to_task(task)->nimpl);
															
 
																 		/* Remove the task from the bundle since we have made a
															
 
																 		 * decision for it, and that other tasks should not consider it
															
--- a/src/sched_policies/parallel_greedy.c
+++ b/src/sched_policies/parallel_greedy.c
@@ -1,6 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2011  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2011  Télécom-SudParis
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -98,15 +99,15 @@ static void initialize_pgreedy_policy(unsigned sched_ctx_id)
 
																 	PTHREAD_COND_INIT(&sched_cond, NULL);
															
 
																 	for (workerid_ctx = 0; workerid_ctx < nworkers_in_ctx; workerid_ctx++)
															
 
																-	  {
															
 
																-                workerid = sched_ctx->workerid[workerid_ctx];
															
 
																+	{
															
 
																+      	workerid = sched_ctx->workerid[workerid_ctx];
															
 
																 		PTHREAD_MUTEX_INIT(&master_sched_mutex[workerid], NULL);
															
 
																 		PTHREAD_COND_INIT(&master_sched_cond[workerid], NULL);
															
 
																 	}
															
 
																 	for (workerid_ctx = 0; workerid_ctx < nworkers_in_ctx; workerid_ctx++)
															
 
																-          {
															
 
																-	        workerid = sched_ctx->workerid[workerid_ctx];
															
 
																+    {
															
 
																+		workerid = sched_ctx->workerid[workerid_ctx];
															
 
																 		/* slaves pick up tasks from their local queue, their master
															
 
																 		 * will put tasks directly in that local list when a parallel
															
@@ -130,8 +131,8 @@ static void initialize_pgreedy_policy(unsigned sched_ctx_id)
 
																 #if 0
															
 
																 	for (workerid_ctx = 0; workerid_ctx < nworkers_in_ctx; workerid_ctx++)
															
 
																-          {
															
 
																-                workerid = sched_ctx->workerid[workerid_ctx];
															
 
																+	{
															
 
																+        workerid = sched_ctx->workerid[workerid_ctx];
															
 
																 		fprintf(stderr, "MASTER of %d = %d\n", workerid, master_id[workerid]);
															
 
																 	}
															
@@ -178,7 +179,7 @@ static struct starpu_task *pop_task_pgreedy_policy(void)
 
																 			if (possible_combinations_size[workerid][i] > best_size)
															
 
																 			{
															
 
																 				int combined_worker = possible_combinations[workerid][i];
															
 
																-				if (starpu_combined_worker_may_execute_task(combined_worker, task))
															
 
																+				if (starpu_combined_worker_may_execute_task(combined_worker, task, 0))
															
 
																 				{
															
 
																 					best_size = possible_combinations_size[workerid][i];
															
 
																 					best_workerid = combined_worker;
															
--- a/src/sched_policies/parallel_heft.c
+++ b/src/sched_policies/parallel_heft.c
@@ -1,6 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010-2011  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2011  Télécom-SudParis
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -225,62 +226,73 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio)
 
																 			max_exp_end = worker_exp_end[worker];
															
 
																 	}
															
 
																+	unsigned nimpl;
															
 
																+	unsigned best_impl = 0;
															
 
																 	for (worker = 0; worker < (nworkers+ncombinedworkers); worker++)
															
 
																 	{
															
 
																-		if (!starpu_combined_worker_may_execute_task(worker, task))
															
 
																+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
															
 
																 		{
															
 
																-			/* no one on that queue may execute this task */
															
 
																-			skip_worker[worker] = 1;
															
 
																-			continue;
															
 
																-		}
															
 
																-		else {
															
 
																-			skip_worker[worker] = 0;
															
 
																-		}
															
 
																+			if (!starpu_combined_worker_may_execute_task(worker, task, nimpl))
															
 
																+			{
															
 
																+				/* no one on that queue may execute this task */
															
 
																+				skip_worker[worker] = 1;
															
 
																+				continue;
															
 
																+			}
															
 
																+			else {
															
 
																+				skip_worker[worker] = 0;
															
 
																+			}
															
 
																-		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
															
 
																-		local_task_length[worker] = starpu_task_expected_length(task, perf_arch);
															
 
																+			enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
															
 
																-		unsigned memory_node = starpu_worker_get_memory_node(worker);
															
 
																-		local_data_penalty[worker] = starpu_task_expected_data_transfer_time(memory_node, task);
															
 
																+			local_task_length[worker] = starpu_task_expected_length(task, perf_arch,nimpl);
															
 
																-		double ntasks_end = compute_ntasks_end(worker);
															
 
																+			unsigned memory_node = starpu_worker_get_memory_node(worker);
															
 
																+			local_data_penalty[worker] = starpu_task_expected_data_transfer_time(memory_node, task);
															
 
																-		if (ntasks_best == -1
															
 
																-				|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
															
 
																-				|| (!calibrating && local_task_length[worker] == -1.0) /* Not calibrating but this worker is being calibrated */
															
 
																-				|| (calibrating && local_task_length[worker] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
															
 
																-				) {
															
 
																-			ntasks_best_end = ntasks_end;
															
 
																-			ntasks_best = worker;
															
 
																-		}
															
 
																+			double ntasks_end = compute_ntasks_end(worker);
															
 
																-		if (local_task_length[worker] == -1.0)
															
 
																-			/* we are calibrating, we want to speed-up calibration time
															
 
																-			 * so we privilege non-calibrated tasks (but still
															
 
																-			 * greedily distribute them to avoid dumb schedules) */
															
 
																-			calibrating = 1;
															
 
																+			if (ntasks_best == -1
															
 
																+					|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
															
 
																+					|| (!calibrating && local_task_length[worker] == -1.0) /* Not calibrating but this worker is being calibrated */
															
 
																+					|| (calibrating && local_task_length[worker] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
															
 
																+					) {
															
 
																+				ntasks_best_end = ntasks_end;
															
 
																+				ntasks_best = worker;
															
 
																+			}
															
 
																-		if (local_task_length[worker] <= 0.0)
															
 
																-			/* there is no prediction available for that task
															
 
																-			 * with that arch yet, so switch to a greedy strategy */
															
 
																-			unknown = 1;
															
 
																+			if (local_task_length[worker] == -1.0)
															
 
																+				/* we are calibrating, we want to speed-up calibration time
															
 
																+				 * so we privilege non-calibrated tasks (but still
															
 
																+				 * greedily distribute them to avoid dumb schedules) */
															
 
																+				calibrating = 1;
															
 
																-		if (unknown)
															
 
																-			continue;
															
 
																+			if (local_task_length[worker] <= 0.0)
															
 
																+				/* there is no prediction available for that task
															
 
																+				 * with that arch yet, so switch to a greedy strategy */
															
 
																+				unknown = 1;
															
 
																-		local_exp_end[worker] = compute_expected_end(worker, local_task_length[worker]);
															
 
																+			if (unknown)
															
 
																+				continue;
															
 
																-		//fprintf(stderr, "WORKER %d -> length %e end %e\n", worker, local_task_length[worker], local_exp_end[worker]);
															
 
																+			local_exp_end[worker] = compute_expected_end(worker, local_task_length[worker]);
															
 
																-		if (local_exp_end[worker] < best_exp_end)
															
 
																-		{
															
 
																-			/* a better solution was found */
															
 
																-			best_exp_end = local_exp_end[worker];
															
 
																-		}
															
 
																+			//fprintf(stderr, "WORKER %d -> length %e end %e\n", worker, local_task_length[worker], local_exp_end[worker]);
															
 
																+
															
 
																+			if (local_exp_end[worker] < best_exp_end)
															
 
																+			{
															
 
																+				/* a better solution was found */
															
 
																+				best_exp_end = local_exp_end[worker];
															
 
																+				best_impl = nimpl;
															
 
																+			}
															
 
																-		local_power[worker] = starpu_task_expected_power(task, perf_arch);
															
 
																-		if (local_power[worker] == -1.0)
															
 
																-			local_power[worker] = 0.;
															
 
																+
															
 
																+			local_power[worker] = starpu_task_expected_power(task, perf_arch,nimpl);
															
 
																+			//_STARPU_DEBUG("Scheduler parallel heft: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker],local_power[worker],worker,nimpl);
															
 
																+
															
 
																+			if (local_power[worker] == -1.0)
															
 
																+				local_power[worker] = 0.;
															
 
																+
															
 
																+		} //end for
															
 
																 	}
															
 
																 	if (unknown)
															
@@ -338,6 +350,9 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio)
 
																 		best_exp_end = local_exp_end[best];
															
 
																 	}
															
 
																+
															
 
																+	//_STARPU_DEBUG("Scheduler parallel heft: kernel (%u)\n", best_impl);
															
 
																+	_starpu_get_job_associated_to_task(task)->nimpl = best_impl;
															
 
																 	/* we should now have the best worker in variable "best" */
															
 
																 	return push_task_on_best_worker(task, best, best_exp_end, prio);
															
 
																 }
															
--- a/src/sched_policies/random_policy.c
+++ b/src/sched_policies/random_policy.c
@@ -45,7 +45,7 @@ static int _random_push_task(struct starpu_task *task, unsigned prio, struct sta
 
																 	double alpha = 0.0;
															
 
																 	for (worker_in_ctx = 0; worker_in_ctx < nworkers; worker_in_ctx++)
															
 
																 	{
															
 
																-                worker = sched_ctx->workerid[worker_in_ctx];
															
 
																+        worker = sched_ctx->workerid[worker_in_ctx];
															
 
																 		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
															
 
																 		double worker_alpha = starpu_worker_get_relative_speedup(perf_arch);
															
--- a/src/sched_policies/stack_queues.c
+++ b/src/sched_policies/stack_queues.c
@@ -34,7 +34,7 @@ void _starpu_init_stack_queues_mechanisms(void)
 
																 struct starpu_stack_jobq_s *_starpu_create_stack(void)
															
 
																 {
															
 
																 	struct starpu_stack_jobq_s *stack;
															
 
																-	stack = malloc(sizeof(struct starpu_stack_jobq_s));
															
 
																+	stack = (struct starpu_stack_jobq_s *) malloc(sizeof(struct starpu_stack_jobq_s));
															
 
																 	stack->jobq = starpu_job_list_new();
															
 
																 	stack->njobs = 0;