@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011 Université de Bordeaux 1
- * Copyright (C) 2010, 2011 Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2012 Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012 Centre National de la Recherche Scientifique
  * Copyright (C) 2011 Télécom-SudParis
  * Copyright (C) 2011 INRIA
  *
@@ -23,10 +23,25 @@
 
 #include <core/workers.h>
 #include <core/perfmodel/perfmodel.h>
+#include <core/task_bundle.h>
 #include <starpu_parameters.h>
 #include <starpu_task_bundle.h>
 #include <starpu_top.h>
 
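+/* DBL_MIN/DBL_MAX normally come from float.h; if they are missing we fall
+ * back to the GCC-style __DBL_MIN__/__DBL_MAX__ builtins. */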
+#ifndef DBL_MIN
+#define DBL_MIN __DBL_MIN__
+#endif
+
+#ifndef DBL_MAX
+#define DBL_MAX __DBL_MAX__
+#endif
+
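+/* Per-worker queue state, indexed by worker id; these are only read and
+ * written while holding the corresponding worker's sched_mutex. */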
+static double exp_start[STARPU_NMAXWORKERS]; /* of the first queued task */
+static double exp_end[STARPU_NMAXWORKERS]; /* of the last queued task */
+static double exp_len[STARPU_NMAXWORKERS]; /* of the set of queued tasks */
+static double ntasks[STARPU_NMAXWORKERS];
+
 typedef struct {
 	double alpha;
 	double beta;
@@ -34,12 +49,6 @@ typedef struct {
 	double idle_power;
 } heft_data;
 
-static double exp_start[STARPU_NMAXWORKERS]; /* of the first queued task */
-static double exp_end[STARPU_NMAXWORKERS]; /* of the set of queued tasks */
-static double exp_len[STARPU_NMAXWORKERS]; /* of the last queued task */
-static double ntasks[STARPU_NMAXWORKERS];
-
-
 const float alpha_minimum=0;
 const float alpha_maximum=10.0;
 const float beta_minimum=0;
@@ -49,7 +58,8 @@ const float gamma_maximum=10000.0;
 const float idle_power_minimum=0;
 const float idle_power_maximum=10000.0;
 
-void param_modified(struct starputop_param_t* d){
+static void param_modified(struct starpu_top_param* d)
+{
 	//just to show parameter modification
 	fprintf(stderr,"%s has been modified : %f !\n", d->name, d->value);
 }
@@ -125,13 +135,16 @@ static void heft_init(unsigned sched_ctx_id)
 	starputop_register_parameter_float("HEFT_IDLE_POWER", &hd->idle_power, idle_power_minimum,idle_power_maximum,param_modified);
 }
 
-static void heft_post_exec_hook(struct starpu_task *task)
+
+/* heft_pre_exec_hook is called right after the data transfer is done and
+ * right before the computation begins; this is the place to refine the
+ * expected start, end, length, etc. of the worker's queue. */
+static void heft_pre_exec_hook(struct starpu_task *task)
 {
 	unsigned sched_ctx_id = task->sched_ctx;
 	int workerid = starpu_worker_get_id();
-	STARPU_ASSERT(workerid >= 0);
-
 	double model = task->predicted;
+	double transfer_model = task->predicted_transfer;
 
 	pthread_mutex_t *sched_mutex;
 	pthread_cond_t *sched_cond;
@@ -144,18 +157,21 @@ static void heft_post_exec_hook(struct starpu_task *task)
 		sched_cond = &workerarg->sched_cond;
 		starpu_worker_set_sched_condition(sched_ctx_id, workerid, sched_mutex, sched_cond);
 	}
-#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
-	starpu_call_poped_task_cb(workerid, sched_ctx_id, task->flops);
-#endif //STARPU_USE_SCHED_CTX_HYPERVISOR
-
-	/* Once we have executed the task, we can update the predicted amount
+	/* Once the task is executing, we can update the predicted amount
 	 * of work. */
-	PTHREAD_MUTEX_LOCK(sched_mutex);
-	exp_len[workerid] -= model;
+	_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
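+	/* The task is now running: its length and transfer no longer count in
+	 * the queue length, and the next queued task can start once this
+	 * computation (expected to last "model") is over. */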
+	exp_len[workerid] -= model + transfer_model;
 	exp_start[workerid] = starpu_timing_now() + model;
 	exp_end[workerid] = exp_start[workerid] + exp_len[workerid];
 	ntasks[workerid]--;
-	PTHREAD_MUTEX_UNLOCK(sched_mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
+}
+
+static void heft_post_exec_hook(struct starpu_task *task)
+{
+#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
+	unsigned sched_ctx_id = task->sched_ctx;
+	int workerid = starpu_worker_get_id();
+
+	starpu_call_poped_task_cb(workerid, sched_ctx_id, task->flops);
+#endif //STARPU_USE_SCHED_CTX_HYPERVISOR
 }
 
 static void heft_push_task_notify(struct starpu_task *task, int workerid)
@@ -163,10 +179,12 @@ static void heft_push_task_notify(struct starpu_task *task, int workerid)
 	unsigned sched_ctx_id = task->sched_ctx;
 	/* Compute the expected penalty */
 	enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(workerid);
+	unsigned memory_node = starpu_worker_get_memory_node(workerid);
 
 	double predicted = starpu_task_expected_length(task, perf_arch,
 			_starpu_get_job_associated_to_task(task)->nimpl);
 
+	double predicted_transfer = starpu_task_expected_data_transfer_time(memory_node, task);
 	pthread_mutex_t *sched_mutex;
 	pthread_cond_t *sched_cond;
 	starpu_worker_get_sched_condition(sched_ctx_id, workerid, &sched_mutex, &sched_cond);
@@ -184,25 +202,45 @@ static void heft_push_task_notify(struct starpu_task *task, int workerid)
 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
 
 	/* Update the predictions */
-	PTHREAD_MUTEX_LOCK(sched_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
 	/* Sometimes workers didn't take the tasks as early as we expected */
 	exp_start[workerid] = STARPU_MAX(exp_start[workerid], starpu_timing_now());
-	exp_end[workerid] = STARPU_MAX(exp_start[workerid], starpu_timing_now());
+	exp_end[workerid] = exp_start[workerid] + exp_len[workerid];
 
 	/* If there is no prediction available, we consider the task has a null length */
-	if (predicted != -1.0)
+	if (!isnan(predicted))
 	{
 		task->predicted = predicted;
 		exp_end[workerid] += predicted;
 		exp_len[workerid] += predicted;
 	}
 
+	/* If there is no prediction available, we consider the transfer has a null length */
+	if (!isnan(predicted_transfer))
+	{
+		if (starpu_timing_now() + predicted_transfer < exp_end[workerid])
+		{
+			/* We may hope that the transfer will be finished by
+			 * the start of the task. */
+			predicted_transfer = 0;
+		}
+		else
+		{
+			/* The transfer will not be finished by then, take the
+			 * remainder into account */
+			predicted_transfer = (starpu_timing_now() + predicted_transfer) - exp_end[workerid];
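+			/* e.g. with now = 1000us, predicted_transfer = 50us and
+			 * exp_end = 1030us, only the last 20us of the transfer
+			 * spill past the already-queued work. */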
+		}
+		task->predicted_transfer = predicted_transfer;
+		exp_end[workerid] += predicted_transfer;
+		exp_len[workerid] += predicted_transfer;
+	}
+
 	ntasks[workerid]++;
 
-	PTHREAD_MUTEX_UNLOCK(sched_mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
 }
 
-static int push_task_on_best_worker(struct starpu_task *task, int best_workerid, double predicted, int prio, unsigned sched_ctx_id)
+static int push_task_on_best_worker(struct starpu_task *task, int best_workerid, double predicted, double predicted_transfer, int prio, unsigned sched_ctx_id)
 {
 	/* make sure someone could execute that task! */
 	STARPU_ASSERT(best_workerid != -1);
@@ -225,16 +263,38 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 	starpu_call_pushed_task_cb(best_workerid, sched_ctx_id);
 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
 
-	PTHREAD_MUTEX_LOCK(sched_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
+
+	/* Sometimes workers didn't take the tasks as early as we expected */
+	exp_start[best_workerid] = STARPU_MAX(exp_start[best_workerid], starpu_timing_now());
+	exp_end[best_workerid] = exp_start[best_workerid] + exp_len[best_workerid];
+
 	exp_end[best_workerid] += predicted;
 	exp_len[best_workerid] += predicted;
+
+	if (starpu_timing_now() + predicted_transfer < exp_end[best_workerid])
+	{
+		/* We may hope that the transfer will be finished by
+		 * the start of the task. */
+		predicted_transfer = 0;
+	}
+	else
+	{
+		/* The transfer will not be finished by then, take the
+		 * remainder into account */
+		predicted_transfer = (starpu_timing_now() + predicted_transfer) - exp_end[best_workerid];
+	}
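+	/* Note: this duplicates the transfer-remainder computation of
+	 * heft_push_task_notify() above; the two are kept in sync by hand. */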
+	exp_end[best_workerid] += predicted_transfer;
+	exp_len[best_workerid] += predicted_transfer;
+
 	ntasks[best_workerid]++;
-	PTHREAD_MUTEX_UNLOCK(sched_mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
 
 	task->predicted = predicted;
+	task->predicted_transfer = predicted_transfer;
 
-	if (starpu_top_status_get())
-		starputop_task_prevision(task, best_workerid,
+	if (_starpu_top_status_get())
+		_starpu_top_task_prevision(task, best_workerid,
 			(unsigned long long)(exp_end[best_workerid]-predicted)/1000,
 			(unsigned long long)exp_end[best_workerid]/1000);
 
@@ -244,29 +304,32 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 		starpu_prefetch_task_input_on_node(task, memory_node);
 	}
 
+
+	//_STARPU_DEBUG("Heft : pushing local task\n");
 	return starpu_push_local_task(best_workerid, task, prio);
 }
 
+/* TODO: factorize with dmda!! */
 static void compute_all_performance_predictions(struct starpu_task *task,
-					double *local_task_length, double *exp_end,
-					double *max_exp_endp, double *best_exp_endp,
-					double *local_data_penalty,
-					double *local_power, int *forced_best,
-					struct starpu_task_bundle *bundle,
+					double local_task_length[][STARPU_MAXIMPLEMENTATIONS],
+					double exp_end[][STARPU_MAXIMPLEMENTATIONS],
+					double *max_exp_endp, double *best_exp_endp,
+					double local_data_penalty[][STARPU_MAXIMPLEMENTATIONS],
+					double local_power[][STARPU_MAXIMPLEMENTATIONS],
+					int *forced_worker, int *forced_impl,
+					starpu_task_bundle_t bundle,
 					unsigned sched_ctx_id)
 {
 	int calibrating = 0;
 	double max_exp_end = DBL_MIN;
 	double best_exp_end = DBL_MAX;
 	int ntasks_best = -1;
+	int nimpl_best = 0;
 	double ntasks_best_end = 0.0;
 
 	/* A priori, we know all estimations */
 	int unknown = 0;
-
-	unsigned nimpl;
-	unsigned best_impl = 0;
 	unsigned worker, worker_ctx = 0;
+	unsigned nimpl;
 
 	struct worker_collection *workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx_id);
 
@@ -277,101 +340,142 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 		{
 			/* Sometimes workers didn't take the tasks as early as we expected */
 			exp_start[worker] = STARPU_MAX(exp_start[worker], starpu_timing_now());
-			exp_end[worker_ctx] = exp_start[worker] + exp_len[worker];
-			if (exp_end[worker_ctx] > max_exp_end)
-				max_exp_end = exp_end[worker_ctx];
+			exp_end[worker_ctx][nimpl] = exp_start[worker] + exp_len[worker];
+			if (exp_end[worker_ctx][nimpl] > max_exp_end)
+				max_exp_end = exp_end[worker_ctx][nimpl];
 
-			if (!starpu_worker_may_execute_task(worker, task, nimpl))
+			if (!starpu_worker_can_execute_task(worker, task, nimpl))
 			{
 				/* no one on that queue may execute this task */
 				continue;
 			}
 
 			enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
 			unsigned memory_node = starpu_worker_get_memory_node(worker);
 
 			if (bundle)
 			{
-				local_task_length[worker_ctx] = starpu_task_bundle_expected_length(bundle, perf_arch, nimpl);
-				local_data_penalty[worker_ctx] = starpu_task_bundle_expected_data_transfer_time(bundle, memory_node);
-				local_power[worker_ctx] = starpu_task_bundle_expected_power(bundle, perf_arch, nimpl);
+				/* TODO: conversion time */
+				local_task_length[worker_ctx][nimpl] = starpu_task_bundle_expected_length(bundle, perf_arch, nimpl);
+				local_data_penalty[worker_ctx][nimpl] = starpu_task_bundle_expected_data_transfer_time(bundle, memory_node);
+				local_power[worker_ctx][nimpl] = starpu_task_bundle_expected_power(bundle, perf_arch, nimpl);
 				//_STARPU_DEBUG("Scheduler heft bundle: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker_ctx],local_power[worker_ctx],worker,nimpl);
 			}
 			else
 			{
-				local_task_length[worker_ctx] = starpu_task_expected_length(task, perf_arch, nimpl);
-				local_data_penalty[worker_ctx] = starpu_task_expected_data_transfer_time(memory_node, task);
-				local_power[worker_ctx] = starpu_task_expected_power(task, perf_arch, nimpl);
+				local_task_length[worker_ctx][nimpl] = starpu_task_expected_length(task, perf_arch, nimpl);
+				local_data_penalty[worker_ctx][nimpl] = starpu_task_expected_data_transfer_time(memory_node, task);
+				local_power[worker_ctx][nimpl] = starpu_task_expected_power(task, perf_arch, nimpl);
+				double conversion_time = starpu_task_expected_conversion_time(task, perf_arch, nimpl);
+				if (conversion_time > 0.0)
+					local_task_length[worker_ctx][nimpl] += conversion_time;
 				//_STARPU_DEBUG("Scheduler heft bundle: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker_ctx],local_power[worker_ctx],worker,nimpl);
 			}
 
 			double ntasks_end = ntasks[worker] / starpu_worker_get_relative_speedup(perf_arch);
 
 			if (ntasks_best == -1
-			    || (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
-			    || (!calibrating && local_task_length[worker_ctx] == -1.0) /* Not calibrating but this worker is being calibrated */
-			    || (calibrating && local_task_length[worker_ctx] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
-			   )
+			    || (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better worker */
+			    || (!calibrating && isnan(local_task_length[worker_ctx][nimpl])) /* Not calibrating but this worker is being calibrated */
+			    || (calibrating && isnan(local_task_length[worker_ctx][nimpl]) && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
+			   )
 			{
 				ntasks_best_end = ntasks_end;
 				ntasks_best = worker;
+				nimpl_best = nimpl;
 			}
 
-			if (local_task_length[worker_ctx] == -1.0)
+			if (isnan(local_task_length[worker_ctx][nimpl]))
 				/* we are calibrating, we want to speed-up calibration time
 				 * so we privilege non-calibrated tasks (but still
 				 * greedily distribute them to avoid dumb schedules) */
 				calibrating = 1;
 
-			if (local_task_length[worker_ctx] <= 0.0)
+			if (isnan(local_task_length[worker_ctx][nimpl])
+					|| _STARPU_IS_ZERO(local_task_length[worker_ctx][nimpl]))
 				/* there is no prediction available for that task
-				 * with that arch yet, so switch to a greedy strategy */
+				 * with that arch (yet or at all), so switch to a greedy strategy */
 				unknown = 1;
 
 			if (unknown)
 				continue;
 
-			exp_end[worker_ctx] = exp_start[worker] + exp_len[worker] + local_task_length[worker_ctx];
+			exp_end[worker_ctx][nimpl] = exp_start[worker] + exp_len[worker] + local_task_length[worker_ctx][nimpl];
 
-			if (exp_end[worker_ctx] < best_exp_end)
+			if (exp_end[worker_ctx][nimpl] < best_exp_end)
 			{
 				/* a better solution was found */
-				best_exp_end = exp_end[worker_ctx];
-				best_impl = nimpl;
+				best_exp_end = exp_end[worker_ctx][nimpl];
+				nimpl_best = nimpl;
 			}
 
-			if (local_power[worker_ctx] == -1.0)
-				local_power[worker_ctx] = 0.;
+			if (isnan(local_power[worker_ctx][nimpl]))
+				local_power[worker_ctx][nimpl] = 0.;
 		}
 		worker_ctx++;
 	}
 
-	*forced_best = unknown?ntasks_best:-1;
+	*forced_worker = unknown?ntasks_best:-1;
+	*forced_impl = unknown?nimpl_best:-1;
 
 	*best_exp_endp = best_exp_end;
 	*max_exp_endp = max_exp_end;
-
-	/* save the best implementation */
-	//_STARPU_DEBUG("Scheduler heft: kernel (%u)\n", best_impl);
-	_starpu_get_job_associated_to_task(task)->nimpl = best_impl;
 }
 
+static int push_conversion_tasks(struct starpu_task *task, unsigned int workerid)
+{
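+	/* The task's multiformat data may currently be laid out for another
+	 * kind of worker (e.g. a CPU layout vs. a CUDA/OpenCL one): queue the
+	 * needed conversion tasks on the target worker before the task itself. */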
+	unsigned i;
+	int ret;
+	unsigned int node = starpu_worker_get_memory_node(workerid);
+	pthread_mutex_t *sched_mutex;
+	pthread_cond_t *sched_cond;
+	starpu_worker_get_sched_condition(task->sched_ctx, workerid, &sched_mutex, &sched_cond);
+
+	_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
+	for (i = 0; i < task->cl->nbuffers; i++)
+	{
+		struct starpu_task *conversion_task;
+		starpu_data_handle_t handle;
+
+		handle = task->handles[i];
+		if (!_starpu_handle_needs_conversion_task(handle, node))
+			continue;
+
+		conversion_task = _starpu_create_conversion_task(handle, node);
+		conversion_task->execute_on_a_specific_worker = 1;
+		conversion_task->workerid = workerid;
+		conversion_task->mf_skip = 1;
+		ret = _starpu_task_submit_conversion_task(conversion_task, workerid);
+		STARPU_ASSERT(ret == 0);
+	}
+
+	for (i = 0; i < task->cl->nbuffers; i++)
+		task->handles[i]->mf_node = node;
+
+	task->execute_on_a_specific_worker = 1;
+	task->workerid = workerid;
+	task->mf_skip = 1;
+	_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
+
+	return 0;
+}
+
+
 static int _heft_push_task(struct starpu_task *task, unsigned prio, unsigned sched_ctx_id)
 {
 	heft_data *hd = (heft_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
-	unsigned worker, worker_ctx = 0;
+	unsigned worker, nimpl, worker_ctx = 0;
 	int best = -1, best_id_ctx = -1;
-
+	int selected_impl = -1;
+
 	/* this flag is set if the corresponding worker is selected because
 	   there is no performance prediction available yet */
-	int forced_best;
+	int forced_worker;
+	int forced_impl;
 	struct worker_collection *workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx_id);
 
 	unsigned nworkers_ctx = workers->nworkers;
-	double local_task_length[nworkers_ctx];
-	double local_data_penalty[nworkers_ctx];
-	double local_power[nworkers_ctx];
-	double exp_end[nworkers_ctx];
+	double local_task_length[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
+	double local_data_penalty[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
+	double local_power[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
+	double exp_end[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
 	double max_exp_end = 0.0;
 
 	double best_exp_end;
@@ -381,93 +485,116 @@ static int _heft_push_task(struct starpu_task *task, unsigned prio, unsigned sched_ctx_id)
 	 * and detect if there is some calibration that needs to be done.
 	 */
 
-	struct starpu_task_bundle *bundle = task->bundle;
-
-	if(workers->init_cursor)
-		workers->init_cursor(workers);
+	starpu_task_bundle_t bundle = task->bundle;
 
 	compute_all_performance_predictions(task, local_task_length, exp_end,
-					&max_exp_end, &best_exp_end,
-					local_data_penalty,
-					local_power, &forced_best, bundle, sched_ctx_id);
+					&max_exp_end, &best_exp_end,
+					local_data_penalty,
+					local_power, &forced_worker, &forced_impl,
+					bundle, sched_ctx_id);
+
 	/* If there is no prediction available for that task with that arch we
 	 * want to speed-up calibration time so we force this measurement */
-	if (forced_best != -1){
-		return push_task_on_best_worker(task, forced_best, 0.0, prio, sched_ctx_id);
+	if (forced_worker != -1)
+	{
+		_starpu_get_job_associated_to_task(task)->nimpl = forced_impl;
+
+		if (_starpu_task_uses_multiformat_handles(task) && !task->mf_skip)
+		{
+			/*
+			 * Our task uses multiformat handles, which may need to be converted.
+			 */
+			push_conversion_tasks(task, forced_worker);
+			prio = 0;
+		}
+
+		return push_task_on_best_worker(task, forced_worker, 0.0, 0.0, prio, sched_ctx_id);
 	}
 
 	/*
 	 * Determine which worker optimizes the fitness metric which is a
 	 * trade-off between load-balancing, data locality, and energy
 	 * consumption.
 	 */
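+	/* Concretely, for each candidate (worker, implementation) pair, and
+	 * with lower being better:
+	 *   fitness = alpha * (exp_end - best_exp_end)   (load balancing)
+	 *           + beta  * local_data_penalty         (data locality)
+	 *           + gamma * local_power                (energy consumption) */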
 
-	double fitness[nworkers_ctx];
+	double fitness[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
 	double best_fitness = -1;
 
 	while(workers->has_next(workers))
 	{
 		worker = workers->get_next(workers);
-		if (!starpu_worker_may_execute_task(worker, task, 0))
+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 		{
-			worker_ctx++;
-			/* no one on that queue may execute this task */
-			continue;
-		}
+			if (!starpu_worker_can_execute_task(worker, task, nimpl))
+			{
+				/* no one on that queue may execute this task */
+				continue;
+			}
 
-		fitness[worker_ctx] = hd->alpha*(exp_end[worker_ctx] - best_exp_end)
-			+ hd->beta*(local_data_penalty[worker_ctx])
-			+ hd->_gamma*(local_power[worker_ctx]);
-
-		if (exp_end[worker_ctx] > max_exp_end)
+			fitness[worker_ctx][nimpl] = hd->alpha*(exp_end[worker_ctx][nimpl] - best_exp_end)
+				+ hd->beta*(local_data_penalty[worker_ctx][nimpl])
+				+ hd->_gamma*(local_power[worker_ctx][nimpl]);
+
+			if (exp_end[worker_ctx][nimpl] > max_exp_end)
 				/* This placement will make the computation
 				 * longer, take into account the idle
 				 * consumption of other cpus */
-			fitness[worker_ctx] += hd->_gamma * hd->idle_power * (exp_end[worker_ctx] - max_exp_end) / 1000000.0;
+				fitness[worker_ctx][nimpl] += hd->_gamma * hd->idle_power * (exp_end[worker_ctx][nimpl] - max_exp_end) / 1000000.0;
 
-		if (best == -1 || fitness[worker_ctx] < best_fitness)
+			if (best == -1 || fitness[worker_ctx][nimpl] < best_fitness)
 			{
 				/* we found a better solution */
-				best_fitness = fitness[worker_ctx];
+				best_fitness = fitness[worker_ctx][nimpl];
 				best = worker;
 				best_id_ctx = worker_ctx;
+				selected_impl = nimpl;
 			}
+		}
 		worker_ctx++;
 	}
 
 	/* By now, we must have found a solution */
 	STARPU_ASSERT(best != -1);
 
 	/* we should now have the best worker in variable "best" */
-	double model_best;
+	double model_best, transfer_model_best;
 
 	if (bundle)
 	{
 		/* If we have a task bundle, we have computed the expected
 		 * length for the entire bundle, but not for the task alone. */
 		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(best);
-		model_best = starpu_task_expected_length(task, perf_arch,
-				_starpu_get_job_associated_to_task(task)->nimpl);
+		unsigned memory_node = starpu_worker_get_memory_node(best);
+		model_best = starpu_task_expected_length(task, perf_arch, selected_impl);
+		transfer_model_best = starpu_task_expected_data_transfer_time(memory_node, task);
 
 		/* Remove the task from the bundle since we have made a
 		 * decision for it, and that other tasks should not consider it
 		 * anymore. */
-		PTHREAD_MUTEX_LOCK(&bundle->mutex);
-		int ret = starpu_task_bundle_remove(bundle, task);
-
-		/* Perhaps the bundle was destroyed when removing the last
-		 * entry */
-		if (ret != 1)
-			PTHREAD_MUTEX_UNLOCK(&bundle->mutex);
+		starpu_task_bundle_remove(bundle, task);
 	}
 	else
 	{
-		model_best = local_task_length[best_id_ctx];
+		model_best = local_task_length[best_id_ctx][selected_impl];
+		transfer_model_best = local_data_penalty[best_id_ctx][selected_impl];
 	}
 
 	if(workers->init_cursor)
		workers->deinit_cursor(workers);
-	return push_task_on_best_worker(task, best, model_best, prio, sched_ctx_id);
+
+	_starpu_get_job_associated_to_task(task)->nimpl = selected_impl;
+
+	if (_starpu_task_uses_multiformat_handles(task) && !task->mf_skip)
+	{
+		/*
+		 * Our task uses multiformat handles, which may need to be converted.
+		 */
+		push_conversion_tasks(task, best);
+		prio = 0;
+	}
+
+	return push_task_on_best_worker(task, best, model_best, transfer_model_best, prio, sched_ctx_id);
 }
 
 static int heft_push_task(struct starpu_task *task)
@@ -510,13 +637,14 @@ static void heft_deinit(unsigned sched_ctx_id)
 	free(ht);
 }
 
-struct starpu_sched_policy_s heft_policy = {
+struct starpu_sched_policy heft_policy = {
 	.init_sched = heft_init,
 	.deinit_sched = heft_deinit,
-	.push_task = heft_push_task,
+	.push_task = heft_push_task,
 	.push_task_notify = heft_push_task_notify,
 	.pop_task = NULL,
 	.pop_every_task = NULL,
+	.pre_exec_hook = heft_pre_exec_hook,
 	.post_exec_hook = heft_post_exec_hook,
 	.add_workers = heft_add_workers,
 	.remove_workers = heft_remove_workers,