瀏覽代碼

merge with sched_policies also

Andra Hugo 14 年之前
父節點
當前提交
bb184c7ae9

+ 0 - 29
simple_ex/Makefile

@@ -1,29 +0,0 @@
-PROG=exemple
-CUDA_SDK_ROOT=/usr/local/cuda/sdk/C
-CUDA_HOME=/usr/local/cuda
-
-.PHONY: all
-
-all: $(PROG)
-
-
-CC      := gcc
-NVCC    := /usr/local/cuda/bin/nvcc
-CFLAGS  := $$(pkg-config --cflags libstarpu) -g #-Wall
-LDFLAGS := $$(pkg-config --libs libstarpu)
-CUDADIR=$(CUDA_HOME)
-
-COMMONFLAGS += -I. -I$(CUDADIR)/include  -I$(CUDA_SDK_ROOT)/common/inc -DUNIX -g -Xcompiler
-NVCCFLAGS +=  -I$(CUDA_SDK_ROOT)/common/inc -I. -G
-
-%.o: %.cu
-	$(NVCC) $(CFLAGS) $(COMMONFLAGS) $(NVCCFLAGS) -o $@ -c $< 
-
-%.o: %.c
-	$(CC) $(CFLAGS) -o $@ -c $< 
-
-$(PROG): %: %.o %_kernel.o
-	$(CC) -o $@ $^ $(LDFLAGS) -L$(CUDADIR)/lib -lcudart
-
-clean:
-	rm -f $(PROG) *.o

+ 0 - 160
simple_ex/exemple.c

@@ -1,160 +0,0 @@
-#include <stdio.h>
-#include <malloc.h>
-#include <starpu.h>
-
-static inline void my_codelet_cpu(void *descr[], void *_args)
-{
-  unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
-  float *sub = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
-
-  unsigned i;
-
-  for(i = 0; i < nx; i++){
-    sub[i] *= 5;
-  }
-}
-
-extern void my_codelet_gpu(void *descr[], __attribute__ ((unused)) void *_args);
-
-
-static starpu_codelet cl = 
-  {
-    .where = STARPU_CPU|STARPU_CUDA,
-    .cpu_func = my_codelet_cpu,
-    .cuda_func = my_codelet_gpu,
-    .nbuffers = 1
-  };
-
-void print_vect(int *vect, int size){
-  unsigned i;
-  for(i = 0; i < size; i++)
-    printf("%d ", vect[i]);
-  printf("\n");
-  
-}
-
-int main(int argc, char **argv)
-{
-  srand(time(NULL));
-  int *mat;
-  unsigned size = 20, children = 5;
-  mat = (int *)malloc(size*sizeof(int));
-
-  unsigned i;
-  for(i = 0; i < size; i++)
-    {
-      mat[i] = random()% 10 + 1;
-    }
-
-  print_vect(mat, size);
-
-  //  struct starpu_conf conf;
-  //conf.sched_policy_name = "heft-tm";
-  //conf.ncpus = -1;
-  // printf("got here \n");
-  starpu_init(NULL);
-
-  for(i = 0; i < 12; i++)
-    printf("%d: arch is %d\n", starpu_worker_get_type(i));
-
-  starpu_data_handle dataA;
-  starpu_vector_data_register(&dataA, 0, (uintptr_t)mat, size, sizeof(mat[00]));
-
-  struct starpu_data_filter f =
-    {
-      .filter_func = starpu_block_filter_func_vector,
-      .nchildren = children,
-      .get_nchildren = NULL,
-      .get_child_ops = NULL
-    };
-  starpu_data_partition(dataA, &f);
-
-  struct starpu_sched_ctx sched_ctx;
-  int procs[] = {1, 2, 3};
-  starpu_create_sched_ctx(&sched_ctx, "heft", procs, 3);
-
-  unsigned j;
-  for(j = 0; j < children; j++){
-    struct starpu_task *task = starpu_task_create();
-    task->cl = &cl;
-    task->synchronous = 1;
-    task->callback_func = NULL;
-    task->buffers[0].handle = starpu_data_get_sub_data(dataA, 1, j);
-    task->buffers[0].mode = STARPU_RW;
-    task->name = "first 1 2 3";  
-    starpu_task_submit_to_ctx(task, &sched_ctx);
-  }
-
-  int procs_to_remove[]={1,3};
-  starpu_remove_workers_from_sched_ctx(procs_to_remove, 2, &sched_ctx);
-
-  printf("procs removed \n");
-
-  for(j = 0; j < children; j++){
-    struct starpu_task *task = starpu_task_create();
-    task->cl = &cl;
-    task->synchronous = 1;
-    task->callback_func = NULL;
-    task->buffers[0].handle = starpu_data_get_sub_data(dataA, 1, j);
-    task->buffers[0].mode = STARPU_RW;
-    task->name = "first 2";  
-    starpu_task_submit_to_ctx(task, &sched_ctx);
-  }
-
-  int procs_to_add[]={1, 4, 5};
-  starpu_add_workers_to_sched_ctx(procs_to_add, 2, &sched_ctx);
-
-  printf("procs add \n");
-
-  for(j = 0; j < children; j++){
-    struct starpu_task *task = starpu_task_create();
-    task->cl = &cl;
-    task->synchronous = 1;
-    task->callback_func = NULL;
-    task->buffers[0].handle = starpu_data_get_sub_data(dataA, 1, j);
-    task->buffers[0].mode = STARPU_RW;
-    task->name = "first 1 2 4 5";  
-    starpu_task_submit_to_ctx(task, &sched_ctx);
-  }
-
-
-  struct starpu_sched_ctx sched_ctx2;
-  int procs2[]={3, 4, 5, 6, 7};
-  starpu_create_sched_ctx(&sched_ctx2, "random", procs2, 5);
-
-  for(j = 0; j < children; j++){
-    struct starpu_task *task3 = starpu_task_create();
-    task3->cl = &cl;
-    task3->synchronous = 1;
-    task3->callback_func = NULL;
-    task3->buffers[0].handle = starpu_data_get_sub_data(dataA, 1, j);
-    task3->buffers[0].mode = STARPU_RW;
-    task3->name = "third 3 4 5 6 7";
-    starpu_task_submit_to_ctx(task3, &sched_ctx2);
-  }
-
-  for(j = 0; j < children; j++){
-    struct starpu_task *task2 = starpu_task_create();
-    task2->cl = &cl;
-    task2->synchronous = 1;
-    task2->callback_func = NULL;
-    task2->buffers[0].handle = starpu_data_get_sub_data(dataA, 1, j);
-    task2->buffers[0].mode = STARPU_RW;
-    task2->name = "anything";
-    starpu_task_submit(task2);
-  }
-  
-  printf("wait for all \n");
-  starpu_task_wait_for_all();
-  starpu_data_unpartition(dataA, 0);
-
-  printf("data unregister  \n");
-  starpu_data_unregister(dataA);
-  
-  printf("the end \n");
-  starpu_shutdown();
-
-  print_vect(mat, size);
-  
-  return 0;
-}

+ 0 - 22
simple_ex/exemple_kernel.cu

@@ -1,22 +0,0 @@
-#include <starpu.h>
-#include <starpu_cuda.h>
-
-static __global__ void myf(int *dMatA)
-{
-  int tidy = threadIdx.y;
-
-  dMatA[ tidy ] = dMatA[ tidy ]  * 5;
-}
-
-extern "C" void my_codelet_gpu(void *descr[], void *_args)
-{
-  unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
-  int *sub = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
-
-  dim3 dimGrid(1,1);
-  dim3 dimBlock(nx,nx);
-
-  myf<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>(sub);
- 
-  cudaStreamSynchronize(starpu_cuda_get_local_stream());
-}

+ 118 - 99
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -328,59 +329,67 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, struct starpu_
 	/* A priori, we know all estimations */
 	int unknown = 0;
 
+	unsigned best_impl = 0;
+	unsigned nimpl;
 	unsigned nworkers = sched_ctx->nworkers_in_ctx;
 	for (worker_in_ctx = 0; worker_in_ctx < nworkers; worker_in_ctx++)
 	{
-        worker = sched_ctx->workerid[worker_in_ctx];
-		double exp_end;
+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
+		{
+        	worker = sched_ctx->workerid[worker_in_ctx];
+			double exp_end;
 		
-		fifo = dt->queue_array[worker_in_ctx];
+			fifo = dt->queue_array[worker_in_ctx];
 
-		/* Sometimes workers didn't take the tasks as early as we expected */
-		fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
-		fifo->exp_end = fifo->exp_start + fifo->exp_len;
+			/* Sometimes workers didn't take the tasks as early as we expected */
+			fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
+			fifo->exp_end = fifo->exp_start + fifo->exp_len;
 
-		if (!starpu_worker_may_execute_task(worker, task))
-		{
-			/* no one on that queue may execute this task */
-			continue;
-		}
+			if (!starpu_worker_may_execute_task(worker, task, nimpl))
+			{
+				/* no one on that queue may execute this task */
+				continue;
+			}
 
-		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
-		double local_length = starpu_task_expected_length(task, perf_arch);
-		double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
-
-		if (ntasks_best == -1
-				|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
-				|| (!calibrating && local_length == -1.0) /* Not calibrating but this worker is being calibrated */
-				|| (calibrating && local_length == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
-				) {
-			ntasks_best_end = ntasks_end;
-			ntasks_best = worker;
-		}
+			enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
+			double local_length = starpu_task_expected_length(task, perf_arch, nimpl);
+			double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
 
-		if (local_length == -1.0)
-			/* we are calibrating, we want to speed-up calibration time
-			 * so we privilege non-calibrated tasks (but still
-			 * greedily distribute them to avoid dumb schedules) */
-			calibrating = 1;
+			//_STARPU_DEBUG("Scheduler dm: task length (%lf) worker (%u) kernel (%u) \n", local_length,worker,nimpl);
 
-		if (local_length <= 0.0)
-			/* there is no prediction available for that task
-			 * with that arch yet, so switch to a greedy strategy */
-			unknown = 1;
+			if (ntasks_best == -1
+					|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
+					|| (!calibrating && local_length == -1.0) /* Not calibrating but this worker is being calibrated */
+					|| (calibrating && local_length == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
+					) {
+				ntasks_best_end = ntasks_end;
+				ntasks_best = worker;
+			}
 
-		if (unknown)
-			continue;
+			if (local_length == -1.0)
+				/* we are calibrating, we want to speed-up calibration time
+				 * so we privilege non-calibrated tasks (but still
+				 * greedily distribute them to avoid dumb schedules) */
+				calibrating = 1;
 
-		exp_end = fifo->exp_start + fifo->exp_len + local_length;
+			if (local_length <= 0.0)
+				/* there is no prediction available for that task
+				 * with that arch yet, so switch to a greedy strategy */
+				unknown = 1;
 
-		if (best == -1 || exp_end < best_exp_end)
-		{
-			/* a better solution was found */
-			best_exp_end = exp_end;
-			best = worker;
-			model_best = local_length;
+			if (unknown)
+				continue;
+
+			exp_end = fifo->exp_start + fifo->exp_len + local_length;
+
+			if (best == -1 || exp_end < best_exp_end)
+			{
+				/* a better solution was found */
+				best_exp_end = exp_end;
+				best = worker;
+				model_best = local_length;
+				best_impl = nimpl;
+			}
 		}
 	}
 
@@ -390,6 +399,11 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, struct starpu_
 	}
 	
 	_starpu_increment_nsubmitted_tasks_of_worker(best);
+
+	//_STARPU_DEBUG("Scheduler dm: kernel (%u)\n", best_impl);
+
+	 _starpu_get_job_associated_to_task(task)->nimpl = 0;//best_impl;
+
 	/* we should now have the best worker in variable "best" */
 	return push_task_on_best_worker(task, best, model_best, prio, sched_ctx);
 }
@@ -426,82 +440,89 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, struct starp
 	/* A priori, we know all estimations */
 	int unknown = 0;
 
+	unsigned best_impl = 0;
+	unsigned nimpl=0;
 	for (worker_in_ctx = 0; worker_in_ctx < nworkers_in_ctx; worker_in_ctx++)
 	{
         worker = sched_ctx->workerid[worker_in_ctx];
+		for(nimpl  = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
+	 	{
+			fifo = dt->queue_array[worker_in_ctx];
 
-		fifo = dt->queue_array[worker_in_ctx];
+			/* Sometimes workers didn't take the tasks as early as we expected */
+			fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
+			fifo->exp_end = fifo->exp_start + fifo->exp_len;
+			if (fifo->exp_end > max_exp_end)
+				max_exp_end = fifo->exp_end;
 
-		/* Sometimes workers didn't take the tasks as early as we expected */
-		fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
-		fifo->exp_end = fifo->exp_start + fifo->exp_len;
-		if (fifo->exp_end > max_exp_end)
-			max_exp_end = fifo->exp_end;
+			if (!starpu_worker_may_execute_task(worker, task, nimpl))
+			{
+				/* no one on that queue may execute this task */
+				continue;
+			}
 
-		if (!starpu_worker_may_execute_task(worker, task))
-		{
-			/* no one on that queue may execute this task */
-			continue;
-		}
+			enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
+			local_task_length[worker_in_ctx] = starpu_task_expected_length(task, perf_arch);
 
-		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
-		local_task_length[worker_in_ctx] = starpu_task_expected_length(task, perf_arch);
+			//_STARPU_DEBUG("Scheduler dmda: task length (%lf) worker (%u) kernel (%u) \n", local_task_length[worker],worker,nimpl);
 
-		unsigned memory_node = starpu_worker_get_memory_node(worker);
-		local_data_penalty[worker_in_ctx] = starpu_task_expected_data_transfer_time(memory_node, task);
+			unsigned memory_node = starpu_worker_get_memory_node(worker);
+			local_data_penalty[worker_in_ctx] = starpu_task_expected_data_transfer_time(memory_node, task);
 
-		double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
+			double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
 
-		if (ntasks_best == -1
-				|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
-				|| (!calibrating && local_task_length[worker] == -1.0) /* Not calibrating but this worker is being calibrated */
-				|| (calibrating && local_task_length[worker] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
-				) {
-			ntasks_best_end = ntasks_end;
-			ntasks_best = worker;
-		}
+			if (ntasks_best == -1
+					|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
+					|| (!calibrating && local_task_length[worker] == -1.0) /* Not calibrating but this worker is being calibrated */
+					|| (calibrating && local_task_length[worker] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
+					) {
+				ntasks_best_end = ntasks_end;
+				ntasks_best = worker;
+			}
 
-		if (local_task_length[worker_in_ctx] == -1.0)
-			/* we are calibrating, we want to speed-up calibration time
-			 * so we privilege non-calibrated tasks (but still
-			 * greedily distribute them to avoid dumb schedules) */
-			calibrating = 1;
+			if (local_task_length[worker_in_ctx] == -1.0)
+				/* we are calibrating, we want to speed-up calibration time
+			 	* so we privilege non-calibrated tasks (but still
+			 	* greedily distribute them to avoid dumb schedules) */
+				calibrating = 1;
 
-		if (local_task_length[worker_in_ctx] <= 0.0)
-			/* there is no prediction available for that task
-			 * with that arch yet, so switch to a greedy strategy */
-			unknown = 1;
+			if (local_task_length[worker_in_ctx] <= 0.0)
+				/* there is no prediction available for that task
+			 	* with that arch yet, so switch to a greedy strategy */
+				unknown = 1;
 
-		if (unknown)
-			continue;
+			if (unknown)
+				continue;
 
-		exp_end[worker_in_ctx] = fifo->exp_start + fifo->exp_len + local_task_length[worker_in_ctx];
+			exp_end[worker_in_ctx] = fifo->exp_start + fifo->exp_len + local_task_length[worker_in_ctx];
 
-		if (exp_end[worker_in_ctx] < best_exp_end)
-		{
-			/* a better solution was found */
-			best_exp_end = exp_end[worker_in_ctx];
-		}
+			if (exp_end[worker_in_ctx] < best_exp_end)
+			{
+				/* a better solution was found */
+				best_exp_end = exp_end[worker_in_ctx];
+				best_impl = nimpl;
+			}
 
-		local_power[worker_in_ctx] = starpu_task_expected_power(task, perf_arch);
-		if (local_power[worker_in_ctx] == -1.0)
-			local_power[worker_in_ctx] = 0.;
-	}
+			local_power[worker_in_ctx] = starpu_task_expected_power(task, perf_arch, nimpl);
+			if (local_power[worker_in_ctx] == -1.0)
+				local_power[worker_in_ctx] = 0.;
+			}	
+		}
 
-	if (unknown)
-		forced_best = ntasks_best;
+		if (unknown)
+			forced_best = ntasks_best;
 
-	double best_fitness = -1;
+		double best_fitness = -1;
 	
-	if (forced_best == -1)
-	{
+		if (forced_best == -1)
+		{
 	        for (worker_in_ctx = 0; worker_in_ctx < nworkers_in_ctx; worker_in_ctx++)
 	        {
 		        worker = sched_ctx->workerid[worker_in_ctx];
 
-			fifo = dt->queue_array[worker_in_ctx];
+				fifo = dt->queue_array[worker_in_ctx];
 	
-			if (!starpu_worker_may_execute_task(worker, task))
+			if (!starpu_worker_may_execute_task(worker, task, 0))
 			{
 				/* no one on that queue may execute this task */
 				continue;
@@ -546,6 +567,10 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, struct starp
 		//penality_best = local_data_penalty[best];
 	}
 
+
+	//_STARPU_DEBUG("Scheduler dmda: kernel (%u)\n", best_impl);
+	 _starpu_get_job_associated_to_task(task)->nimpl = best_impl;
+
 	/* we should now have the best worker in variable "best" */
 	return push_task_on_best_worker(task, best, model_best, prio, sched_ctx);
 }
@@ -559,18 +584,12 @@ static int dmda_push_sorted_task(struct starpu_task *task, unsigned sched_ctx_id
 static int dm_push_task(struct starpu_task *task, unsigned sched_ctx_id)
 {
 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx(sched_ctx_id);
-	if (task->priority > 0)
-		return _dm_push_task(task, 1, sched_ctx);
-
 	return _dm_push_task(task, 0, sched_ctx);
 }
 
 static int dmda_push_task(struct starpu_task *task, unsigned sched_ctx_id)
 {
 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx(sched_ctx_id);
-	if (task->priority > 0)
-		return _dmda_push_task(task, 1, sched_ctx);
-
 	return _dmda_push_task(task, 0, sched_ctx);
 }
 

+ 7 - 2
src/sched_policies/deque_queues.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2010  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -27,7 +28,7 @@
 struct starpu_deque_jobq_s *_starpu_create_deque(void)
 {
 	struct starpu_deque_jobq_s *deque;
-	deque = malloc(sizeof(struct starpu_deque_jobq_s));
+	deque = (struct starpu_deque_jobq_s *) malloc(sizeof(struct starpu_deque_jobq_s));
 
 	/* note that not all mechanisms (eg. the semaphore) have to be used */
 	deque->jobq = starpu_job_list_new();
@@ -109,7 +110,11 @@ struct starpu_job_list_s *_starpu_deque_pop_every_task(struct starpu_deque_jobq_
 		{
 			next_job = starpu_job_list_next(i);
 
-			if (starpu_worker_may_execute_task(workerid, i->task))
+			/* In case there are multiple implementations of the
+ 			 * codelet for a single device, we don't really care
+			 * about the implementation used, so let's try the
+			 * first one. */
+			if (starpu_worker_may_execute_task(workerid, i->task, 0))
 			{
 				/* this elements can be moved into the new list */
 				new_list_size++;

+ 8 - 9
src/sched_policies/detect_combined_workers.c

@@ -62,7 +62,7 @@ static int find_combinations_with_hwloc_rec(hwloc_obj_t obj, int *worker_array,
 
 	int worker_array_rec[STARPU_NMAXWORKERS];
 	int worker_cnt_rec = 0;
-	memset(worker_array_rec, 0, sizeof(worker_array_rec));
+	memset(worker_array_rec, 0, sizeof(int)*STARPU_NMAXWORKERS);
 
 	unsigned i;
 	for (i = 0; i < obj->arity; i++)
@@ -169,16 +169,15 @@ static void combine_all_cpu_workers(struct starpu_machine_topology_s *topology)
 
 void _starpu_sched_find_worker_combinations(struct starpu_machine_topology_s *topology)
 {
-	struct starpu_machine_config_s *config = _starpu_get_machine_config();
+    struct starpu_machine_config_s *config = _starpu_get_machine_config();
 
-	if (config->user_conf && config->user_conf->single_combined_worker)
-		combine_all_cpu_workers(topology);
-	else {
+    if (config->user_conf && config->user_conf->single_combined_worker > 0 || starpu_get_env_number("STARPU_SINGLE_COMBINED_WORKER") > 0)
+	combine_all_cpu_workers(topology);
+    else {
 #ifdef STARPU_HAVE_HWLOC
-		find_combinations_with_hwloc(topology);
-		//find_combinations_without_hwloc(topology);
+	find_combinations_with_hwloc(topology);
 #else
-		find_combinations_without_hwloc(topology);
+	find_combinations_without_hwloc(topology);
 #endif
-	}
+    }
 }

+ 1 - 1
src/sched_policies/eager_central_priority_policy.c

@@ -48,7 +48,7 @@ static struct starpu_priority_taskq_s *_starpu_create_priority_taskq(void)
 {
 	struct starpu_priority_taskq_s *central_queue;
 	
-	central_queue = malloc(sizeof(struct starpu_priority_taskq_s));
+	central_queue = (struct starpu_priority_taskq_s *) malloc(sizeof(struct starpu_priority_taskq_s));
 	central_queue->total_ntasks = 0;
 
 	unsigned prio;

+ 3 - 2
src/sched_policies/fifo_queues.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -27,7 +28,7 @@
 struct starpu_fifo_taskq_s *_starpu_create_fifo(void)
 {
 	struct starpu_fifo_taskq_s *fifo;
-	fifo = malloc(sizeof(struct starpu_fifo_taskq_s));
+	fifo = (struct starpu_fifo_taskq_s *) malloc(sizeof(struct starpu_fifo_taskq_s));
 
 	/* note that not all mechanisms (eg. the semaphore) have to be used */
 	starpu_task_list_init(&fifo->taskq);
@@ -111,7 +112,7 @@ struct starpu_task *_starpu_fifo_pop_every_task(struct starpu_fifo_taskq_s *fifo
 		{
 			next_task = task->next;
 
-			if (starpu_worker_may_execute_task(workerid, task))
+			if (starpu_worker_may_execute_task(workerid, task, 0))
 			{
 				/* this elements can be moved into the new list */
 				new_list_size++;

+ 83 - 67
src/sched_policies/heft.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -170,7 +171,9 @@ static void heft_push_task_notify(struct starpu_task *task, int workerid, unsign
 	struct starpu_worker_s *worker = _starpu_get_worker_struct(workerid);
 	/* Compute the expected penality */
 	enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(workerid);
-	double predicted = starpu_task_expected_length(task, perf_arch);
+
+	double predicted = starpu_task_expected_length(task, perf_arch,
+			_starpu_get_job_associated_to_task(task)->nimpl);
 
 	/* Update the predictions */
 	PTHREAD_MUTEX_LOCK(worker->sched_mutex);
@@ -242,74 +245,86 @@ static void compute_all_performance_predictions(struct starpu_task *task,
   unsigned worker, worker_in_ctx;
   for (worker_in_ctx = 0; worker_in_ctx < nworkers; worker_in_ctx++)
     {
-      worker = sched_ctx->workerid[worker_in_ctx];
-      /* Sometimes workers didn't take the tasks as early as we expected */
-      exp_start[worker] = STARPU_MAX(exp_start[worker], starpu_timing_now());
-      exp_end[worker_in_ctx] = exp_start[worker] + exp_len[worker];
-      if (exp_end[worker_in_ctx] > max_exp_end)
- 		max_exp_end = exp_end[worker_in_ctx];
-
-      if (!starpu_worker_may_execute_task(worker, task))
-	{
-	  /* no one on that queue may execute this task */
-	  continue;
+		worker = sched_ctx->workerid[worker_in_ctx];
+		for (nimpl = 0; nimpl <STARPU_MAXIMPLEMENTATIONS; nimpl++) 
+		{
+      		/* Sometimes workers didn't take the tasks as early as we expected */
+      		exp_start[worker] = STARPU_MAX(exp_start[worker], starpu_timing_now());
+      		exp_end[worker_in_ctx] = exp_start[worker] + exp_len[worker];
+      		if (exp_end[worker_in_ctx] > max_exp_end)
+ 				max_exp_end = exp_end[worker_in_ctx];
+
+			if (!starpu_worker_may_execute_task(worker, task, nimpl))
+			{
+				/* no one on that queue may execute this task */
+				continue;
+			}
+
+			enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
+			unsigned memory_node = starpu_worker_get_memory_node(worker);
+
+      		if (bundle)
+      		{
+      			local_task_length[worker_in_ctx] = starpu_task_bundle_expected_length(bundle, perf_arch, nimpl);
+      	  		local_data_penalty[worker_in_ctx] = starpu_task_bundle_expected_data_transfer_time(bundle, memory_node);
+      	  		local_power[worker_in_ctx] = starpu_task_bundle_expected_power(bundle, perf_arch, nimpl);
+				//_STARPU_DEBUG("Scheduler heft bundle: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker_in_ctx],local_power[worker_in_ctx],worker,nimpl);
+      		}
+      		else 
+			{
+				local_task_length[worker_in_ctx] = starpu_task_expected_length(task, perf_arch, nimpl);
+				local_data_penalty[worker_in_ctx] = starpu_task_expected_data_transfer_time(memory_node, task);
+				local_power[worker_in_ctx] = starpu_task_expected_power(task, perf_arch, nimpl);
+				//_STARPU_DEBUG("Scheduler heft: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker_in_ctx],local_power[worker_in_ctx],worker,nimpl);
+      		}
+
+      		double ntasks_end = ntasks[worker] / starpu_worker_get_relative_speedup(perf_arch);
+
+      		if (ntasks_best == -1
+	  			|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
+	  			|| (!calibrating && local_task_length[worker_in_ctx] == -1.0) /* Not calibrating but this worker is being calibrated */
+	  			|| (calibrating && local_task_length[worker_in_ctx] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
+	  		) {
+				ntasks_best_end = ntasks_end;
+				ntasks_best = worker;
+      		}
+
+     		if (local_task_length[worker_in_ctx] == -1.0)
+				/* we are calibrating, we want to speed-up calibration time
+	 			* so we privilege non-calibrated tasks (but still
+	 			* greedily distribute them to avoid dumb schedules) */
+				calibrating = 1;
+
+      		if (local_task_length[worker_in_ctx] <= 0.0)
+				/* there is no prediction available for that task
+	 			* with that arch yet, so switch to a greedy strategy */
+				unknown = 1;
+
+			if (unknown)
+				continue;
+
+      		exp_end[worker_in_ctx] = exp_start[worker] + exp_len[worker] + local_task_length[worker_in_ctx];
+
+      		if (exp_end[worker_in_ctx] < best_exp_end)
+			{
+	  			/* a better solution was found */
+	  			best_exp_end = exp_end[worker_in_ctx];
+				best_impl = nimpl;
+			}
+
+      		if (local_power[worker_in_ctx] == -1.0)
+				local_power[worker_in_ctx] = 0.;
+    	}
 	}
 
-      enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
-      unsigned memory_node = starpu_worker_get_memory_node(worker);
-
-      if (bundle)
-      	{
-      	  local_task_length[worker_in_ctx] = starpu_task_bundle_expected_length(bundle, perf_arch);
-      	  local_data_penalty[worker_in_ctx] = starpu_task_bundle_expected_data_transfer_time(bundle, memory_node);
-      	  local_power[worker_in_ctx] = starpu_task_bundle_expected_power(bundle, perf_arch);
-      	}
-      else {
-	local_task_length[worker_in_ctx] = starpu_task_expected_length(task, perf_arch);
-	local_data_penalty[worker_in_ctx] = starpu_task_expected_data_transfer_time(memory_node, task);
-	local_power[worker_in_ctx] = starpu_task_expected_power(task, perf_arch);
-      }
-
-      double ntasks_end = ntasks[worker] / starpu_worker_get_relative_speedup(perf_arch);
-
-      if (ntasks_best == -1
-	  || (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
-	  || (!calibrating && local_task_length[worker_in_ctx] == -1.0) /* Not calibrating but this worker is being calibrated */
-	  || (calibrating && local_task_length[worker_in_ctx] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
-	  ) {
-	ntasks_best_end = ntasks_end;
-	ntasks_best = worker;
-      }
-
-      if (local_task_length[worker_in_ctx] == -1.0)
-	/* we are calibrating, we want to speed-up calibration time
-	 * so we privilege non-calibrated tasks (but still
-	 * greedily distribute them to avoid dumb schedules) */
-	calibrating = 1;
-
-      if (local_task_length[worker_in_ctx] <= 0.0)
-	/* there is no prediction available for that task
-	 * with that arch yet, so switch to a greedy strategy */
-	unknown = 1;
-
-      if (unknown)
-	continue;
-
-      exp_end[worker_in_ctx] = exp_start[worker] + exp_len[worker] + local_task_length[worker_in_ctx];
-
-      if (exp_end[worker_in_ctx] < best_exp_end)
-	{
-	  /* a better solution was found */
-	  best_exp_end = exp_end[worker_in_ctx];
-	}
+	*forced_best = unknown?ntasks_best:-1;
 
-      if (local_power[worker_in_ctx] == -1.0)
-	local_power[worker_in_ctx] = 0.;
-    }
+	*best_exp_endp = best_exp_end;
+	*max_exp_endp = max_exp_end;
 
-  *forced_best = unknown?ntasks_best:-1;
-  *best_exp_endp = best_exp_end;
-  *max_exp_endp = max_exp_end;
+	/* save the best implementation */
+	//_STARPU_DEBUG("Scheduler heft: kernel (%u)\n", best_impl);
+	_starpu_get_job_associated_to_task(task)->nimpl = best_impl;
 }
 
 static int _heft_push_task(struct starpu_task *task, unsigned prio, unsigned sched_ctx_id)
@@ -364,7 +379,7 @@ static int _heft_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 	{
 		worker = sched_ctx->workerid[worker_in_ctx];
 
-		if (!starpu_worker_may_execute_task(worker, task))
+		if (!starpu_worker_may_execute_task(worker, task, 0))
 		{
 			/* no one on that queue may execute this task */
 			continue;
@@ -400,7 +415,8 @@ static int _heft_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 		/* If we have a task bundle, we have computed the expected
 		 * length for the entire bundle, but not for the task alone. */
 		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(best);
-		model_best = starpu_task_expected_length(task, perf_arch);
+		model_best = starpu_task_expected_length(task, perf_arch,
+				_starpu_get_job_associated_to_task(task)->nimpl);
 
 		/* Remove the task from the bundle since we have made a
 		 * decision for it, and that other tasks should not consider it

+ 8 - 7
src/sched_policies/parallel_greedy.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -98,15 +99,15 @@ static void initialize_pgreedy_policy(unsigned sched_ctx_id)
 	PTHREAD_COND_INIT(&sched_cond, NULL);
 
 	for (workerid_ctx = 0; workerid_ctx < nworkers_in_ctx; workerid_ctx++)
-	  {
-                workerid = sched_ctx->workerid[workerid_ctx];
+	{
+      	workerid = sched_ctx->workerid[workerid_ctx];
 
 		PTHREAD_MUTEX_INIT(&master_sched_mutex[workerid], NULL);
 		PTHREAD_COND_INIT(&master_sched_cond[workerid], NULL);
 	}
 	for (workerid_ctx = 0; workerid_ctx < nworkers_in_ctx; workerid_ctx++)
-          {
-	        workerid = sched_ctx->workerid[workerid_ctx];
+    {
+		workerid = sched_ctx->workerid[workerid_ctx];
 
 		/* slaves pick up tasks from their local queue, their master
 		 * will put tasks directly in that local list when a parallel
@@ -130,8 +131,8 @@ static void initialize_pgreedy_policy(unsigned sched_ctx_id)
 
 #if 0
 	for (workerid_ctx = 0; workerid_ctx < nworkers_in_ctx; workerid_ctx++)
-          {
-                workerid = sched_ctx->workerid[workerid_ctx];
+	{
+        workerid = sched_ctx->workerid[workerid_ctx];
 
 		fprintf(stderr, "MASTER of %d = %d\n", workerid, master_id[workerid]);
 	}
@@ -178,7 +179,7 @@ static struct starpu_task *pop_task_pgreedy_policy(void)
 			if (possible_combinations_size[workerid][i] > best_size)
 			{
 				int combined_worker = possible_combinations[workerid][i];
-				if (starpu_combined_worker_may_execute_task(combined_worker, task))
+				if (starpu_combined_worker_may_execute_task(combined_worker, task, 0))
 				{
 					best_size = possible_combinations_size[workerid][i];
 					best_workerid = combined_worker;

+ 57 - 42
src/sched_policies/parallel_heft.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -225,62 +226,73 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio)
 			max_exp_end = worker_exp_end[worker];
 	}
 
+	unsigned nimpl;
+	unsigned best_impl = 0;
 	for (worker = 0; worker < (nworkers+ncombinedworkers); worker++)
 	{
-		if (!starpu_combined_worker_may_execute_task(worker, task))
+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 		{
-			/* no one on that queue may execute this task */
-			skip_worker[worker] = 1;
-			continue;
-		}
-		else {
-			skip_worker[worker] = 0;
-		}
+			if (!starpu_combined_worker_may_execute_task(worker, task, nimpl))
+			{
+				/* no one on that queue may execute this task */
+				skip_worker[worker] = 1;
+				continue;
+			}
+			else {
+				skip_worker[worker] = 0;
+			}
 
-		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
-		local_task_length[worker] = starpu_task_expected_length(task, perf_arch);
+			enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
 
-		unsigned memory_node = starpu_worker_get_memory_node(worker);
-		local_data_penalty[worker] = starpu_task_expected_data_transfer_time(memory_node, task);
+			local_task_length[worker] = starpu_task_expected_length(task, perf_arch,nimpl);
 
-		double ntasks_end = compute_ntasks_end(worker);
+			unsigned memory_node = starpu_worker_get_memory_node(worker);
+			local_data_penalty[worker] = starpu_task_expected_data_transfer_time(memory_node, task);
 
-		if (ntasks_best == -1
-				|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
-				|| (!calibrating && local_task_length[worker] == -1.0) /* Not calibrating but this worker is being calibrated */
-				|| (calibrating && local_task_length[worker] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
-				) {
-			ntasks_best_end = ntasks_end;
-			ntasks_best = worker;
-		}
+			double ntasks_end = compute_ntasks_end(worker);
 
-		if (local_task_length[worker] == -1.0)
-			/* we are calibrating, we want to speed-up calibration time
-			 * so we privilege non-calibrated tasks (but still
-			 * greedily distribute them to avoid dumb schedules) */
-			calibrating = 1;
+			if (ntasks_best == -1
+					|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
+					|| (!calibrating && local_task_length[worker] == -1.0) /* Not calibrating but this worker is being calibrated */
+					|| (calibrating && local_task_length[worker] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
+					) {
+				ntasks_best_end = ntasks_end;
+				ntasks_best = worker;
+			}
 
-		if (local_task_length[worker] <= 0.0)
-			/* there is no prediction available for that task
-			 * with that arch yet, so switch to a greedy strategy */
-			unknown = 1;
+			if (local_task_length[worker] == -1.0)
+				/* we are calibrating, we want to speed-up calibration time
+				 * so we privilege non-calibrated tasks (but still
+				 * greedily distribute them to avoid dumb schedules) */
+				calibrating = 1;
 
-		if (unknown)
-			continue;
+			if (local_task_length[worker] <= 0.0)
+				/* there is no prediction available for that task
+				 * with that arch yet, so switch to a greedy strategy */
+				unknown = 1;
 
-		local_exp_end[worker] = compute_expected_end(worker, local_task_length[worker]);
+			if (unknown)
+				continue;
 
-		//fprintf(stderr, "WORKER %d -> length %e end %e\n", worker, local_task_length[worker], local_exp_end[worker]);
+			local_exp_end[worker] = compute_expected_end(worker, local_task_length[worker]);
 
-		if (local_exp_end[worker] < best_exp_end)
-		{
-			/* a better solution was found */
-			best_exp_end = local_exp_end[worker];
-		}
+			//fprintf(stderr, "WORKER %d -> length %e end %e\n", worker, local_task_length[worker], local_exp_end[worker]);
+
+			if (local_exp_end[worker] < best_exp_end)
+			{
+				/* a better solution was found */
+				best_exp_end = local_exp_end[worker];
+				best_impl = nimpl;
+			}
 
-		local_power[worker] = starpu_task_expected_power(task, perf_arch);
-		if (local_power[worker] == -1.0)
-			local_power[worker] = 0.;
+
+			local_power[worker] = starpu_task_expected_power(task, perf_arch,nimpl);
+			//_STARPU_DEBUG("Scheduler parallel heft: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker],local_power[worker],worker,nimpl);
+
+			if (local_power[worker] == -1.0)
+				local_power[worker] = 0.;
+
+		} //end for
 	}
 
 	if (unknown)
@@ -338,6 +350,9 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio)
 		best_exp_end = local_exp_end[best];
 	}
 
+
+	//_STARPU_DEBUG("Scheduler parallel heft: kernel (%u)\n", best_impl);
+	_starpu_get_job_associated_to_task(task)->nimpl = best_impl;
 	/* we should now have the best worker in variable "best" */
 	return push_task_on_best_worker(task, best, best_exp_end, prio);
 }

+ 1 - 1
src/sched_policies/random_policy.c

@@ -45,7 +45,7 @@ static int _random_push_task(struct starpu_task *task, unsigned prio, struct sta
 	double alpha = 0.0;
 	for (worker_in_ctx = 0; worker_in_ctx < nworkers; worker_in_ctx++)
 	{
-                worker = sched_ctx->workerid[worker_in_ctx];
+        worker = sched_ctx->workerid[worker_in_ctx];
 
 		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
 		double worker_alpha = starpu_worker_get_relative_speedup(perf_arch);

+ 1 - 1
src/sched_policies/stack_queues.c

@@ -34,7 +34,7 @@ void _starpu_init_stack_queues_mechanisms(void)
 struct starpu_stack_jobq_s *_starpu_create_stack(void)
 {
 	struct starpu_stack_jobq_s *stack;
-	stack = malloc(sizeof(struct starpu_stack_jobq_s));
+	stack = (struct starpu_stack_jobq_s *) malloc(sizeof(struct starpu_stack_jobq_s));
 
 	stack->jobq = starpu_job_list_new();
 	stack->njobs = 0;