13 years ago · d3db74a216
--- a/src/sched_policies/deque_modeling_policy_data_aware.c
+++ b/src/sched_policies/deque_modeling_policy_data_aware.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010, 2011  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011-2012  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
															
 
																  * Copyright (C) 2011  Télécom-SudParis
															
 
																  * Copyright (C) 2011  INRIA
															
 
																  *
															
@@ -26,11 +26,6 @@
 
																 #include <core/perfmodel/perfmodel.h>
															
 
																 #include <starpu_parameters.h>
															
 
																-/* #ifdef STARPU_VERBOSE */
															
 
																-/* static long int total_task_cnt = 0; */
															
 
																-/* static long int ready_task_cnt = 0; */
															
 
																-/* #endif */
															
 
																-
															
 
																 typedef struct {
															
 
																 	double alpha;
															
 
																 	double beta;
															
@@ -47,19 +42,15 @@ typedef struct {
 
																 static int count_non_ready_buffers(struct starpu_task *task, uint32_t node)
															
 
																 {
															
 
																 	int cnt = 0;
															
 
																-
															
 
																-	starpu_buffer_descr *descrs = task->buffers;
															
 
																 	unsigned nbuffers = task->cl->nbuffers;
															
 
																-
															
 
																 	unsigned index;
															
 
																+
															
 
																 	for (index = 0; index < nbuffers; index++)
															
 
																 	{
															
 
																-		starpu_buffer_descr *descr;
															
 
																-		starpu_data_handle handle;
															
 
																+		starpu_data_handle_t handle;
															
 
																+
															
 
																+		handle = task->handles[index];
															
 
																-		descr = &descrs[index];
															
 
																-		handle = descr->handle;
															
 
																-		
															
 
																 		int is_valid;
															
 
																 		starpu_data_query_status(handle, node, NULL, &is_valid, NULL);
															
@@ -70,14 +61,14 @@ static int count_non_ready_buffers(struct starpu_task *task, uint32_t node)
 
																 	return cnt;
															
 
																 }
															
 
																-static struct starpu_task *_starpu_fifo_pop_first_ready_task(struct starpu_fifo_taskq_s *fifo_queue, unsigned node)
															
 
																+static struct starpu_task *_starpu_fifo_pop_first_ready_task(struct _starpu_fifo_taskq *fifo_queue, unsigned node)
															
 
																 {
															
 
																 	struct starpu_task *task = NULL, *current;
															
 
																 	if (fifo_queue->ntasks == 0)
															
 
																 		return NULL;
															
 
																-	if (fifo_queue->ntasks > 0) 
															
 
																+	if (fifo_queue->ntasks > 0)
															
 
																 	{
															
 
																 		fifo_queue->ntasks--;
															
@@ -108,12 +99,12 @@ static struct starpu_task *_starpu_fifo_pop_first_ready_task(struct starpu_fifo_
 
																 			current = current->prev;
															
 
																 		}
															
 
																-		
															
 
																+
															
 
																 		starpu_task_list_erase(&fifo_queue->taskq, task);
															
 
																-		STARPU_TRACE_JOB_POP(task, 0);
															
 
																+		_STARPU_TRACE_JOB_POP(task, 0);
															
 
																 	}
															
 
																-	
															
 
																+
															
 
																 	return task;
															
 
																 }
															
@@ -124,15 +115,15 @@ static struct starpu_task *dmda_pop_ready_task(unsigned sched_ctx_id)
 
																 	struct starpu_task *task;
															
 
																 	int workerid = starpu_worker_get_id();
															
 
																-
															
 
																-	struct starpu_fifo_taskq_s *fifo = dt->queue_array[workerid];
															
 
																+	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
															
 
																 	unsigned node = starpu_worker_get_memory_node(workerid);
															
 
																 	task = _starpu_fifo_pop_first_ready_task(fifo, node);
															
 
																-	if (task) {
															
 
																+	if (task)
															
 
																+	{
															
 
																 		double model = task->predicted;
															
 
																-	
															
 
																+
															
 
																 		fifo->exp_len -= model;
															
 
																 		fifo->exp_start = starpu_timing_now() + model;
															
 
																 		fifo->exp_end = fifo->exp_start + fifo->exp_len;
															
@@ -159,12 +150,13 @@ static struct starpu_task *dmda_pop_task(unsigned sched_ctx_id)
 
																 	struct starpu_task *task;
															
 
																 	int workerid = starpu_worker_get_id();
															
 
																-	struct starpu_fifo_taskq_s *fifo = dt->queue_array[workerid];
															
 
																+	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
															
 
																-	task = _starpu_fifo_pop_task(fifo, -1);
															
 
																-	if (task) {
															
 
																+	task = _starpu_fifo_pop_task(fifo, workerid);
															
 
																+	if (task)
															
 
																+	{
															
 
																 		double model = task->predicted;
															
 
																-	
															
 
																+
															
 
																 		fifo->exp_len -= model;
															
 
																 		fifo->exp_start = starpu_timing_now() + model;
															
 
																 		fifo->exp_end = fifo->exp_start + fifo->exp_len;
															
@@ -193,7 +185,7 @@ static struct starpu_task *dmda_pop_every_task(unsigned sched_ctx_id)
 
																 	struct starpu_task *new_list;
															
 
																 	int workerid = starpu_worker_get_id();
															
 
																-	struct starpu_fifo_taskq_s *fifo = dt->queue_array[workerid];
															
 
																+	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
															
 
																 	pthread_mutex_t *sched_mutex;
															
 
																 	pthread_cond_t *sched_cond;
															
@@ -207,7 +199,7 @@ static struct starpu_task *dmda_pop_every_task(unsigned sched_ctx_id)
 
																 		fifo->exp_len -= model;
															
 
																 		fifo->exp_start = starpu_timing_now() + model;
															
 
																 		fifo->exp_end = fifo->exp_start + fifo->exp_len;
															
 
																-	
															
 
																+
															
 
																 		new_list = new_list->next;
															
 
																 	}
															
@@ -215,13 +207,13 @@ static struct starpu_task *dmda_pop_every_task(unsigned sched_ctx_id)
 
																 }
															
 
																 static
															
 
																-int _starpu_fifo_push_sorted_task(struct starpu_fifo_taskq_s *fifo_queue, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task)
															
 
																+int _starpu_fifo_push_sorted_task(struct _starpu_fifo_taskq *fifo_queue, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task)
															
 
																 {
															
 
																 	struct starpu_task_list *list = &fifo_queue->taskq;
															
 
																-	PTHREAD_MUTEX_LOCK(sched_mutex);
															
 
																+	_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
															
 
																-	STARPU_TRACE_JOB_PUSH(task, 0);
															
 
																+	_STARPU_TRACE_JOB_PUSH(task, 0);
															
 
																 	if (list->head == NULL)
															
 
																 	{
															
@@ -230,7 +222,8 @@ int _starpu_fifo_push_sorted_task(struct starpu_fifo_taskq_s *fifo_queue, pthrea
 
																 		task->prev = NULL;
															
 
																 		task->next = NULL;
															
 
																 	}
															
 
																-	else {
															
 
																+	else
															
 
																+	{
															
 
																 		struct starpu_task *current = list->head;
															
 
																 		struct starpu_task *prev = NULL;
															
@@ -251,7 +244,8 @@ int _starpu_fifo_push_sorted_task(struct starpu_fifo_taskq_s *fifo_queue, pthrea
 
																 			task->next = list->head;
															
 
																 			list->head = task;
															
 
																 		}
															
 
																-		else {
															
 
																+		else
															
 
																+		{
															
 
																 			if (current)
															
 
																 			{
															
 
																 				/* Insert between prev and current */
															
@@ -260,7 +254,8 @@ int _starpu_fifo_push_sorted_task(struct starpu_fifo_taskq_s *fifo_queue, pthrea
 
																 				task->next = current;
															
 
																 				current->prev = task;
															
 
																 			}
															
 
																-			else {
															
 
																+			else
															
 
																+			{
															
 
																 				/* Insert at the tail of the list */
															
 
																 				list->tail->next = task;
															
 
																 				task->next = NULL;
															
@@ -273,8 +268,8 @@ int _starpu_fifo_push_sorted_task(struct starpu_fifo_taskq_s *fifo_queue, pthrea
 
																 	fifo_queue->ntasks++;
															
 
																 	fifo_queue->nprocessed++;
															
 
																-	PTHREAD_COND_SIGNAL(sched_cond);
															
 
																-	PTHREAD_MUTEX_UNLOCK(sched_mutex);
															
 
																+	_STARPU_PTHREAD_COND_SIGNAL(sched_cond);
															
 
																+	_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
															
 
																 	return 0;
															
 
																 }
															
@@ -287,7 +282,7 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
																 	/* make sure someone coule execute that task ! */
															
 
																 	STARPU_ASSERT(best_workerid != -1);
															
 
																-	struct starpu_fifo_taskq_s *fifo;
															
 
																+	struct _starpu_fifo_taskq *fifo;
															
 
																 	fifo = dt->queue_array[best_workerid];
															
 
																 	fifo->exp_end += predicted;
															
@@ -295,6 +290,8 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
																 	task->predicted = predicted;
															
 
																+	/* TODO predicted_transfer */
															
 
																+
															
 
																 	unsigned memory_node = starpu_worker_get_memory_node(best_workerid);
															
 
																 	if (starpu_get_prefetch_flag())
															
@@ -311,11 +308,12 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
																 			sched_mutex, sched_cond, task);
															
 
																 }
															
 
																+/* TODO: factorize with dmda!! */
															
 
																 static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched_ctx_id)
															
 
																 {
															
 
																 	dmda_data *dt = (dmda_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
															
 
																 	/* find the queue */
															
 
																-	struct starpu_fifo_taskq_s *fifo;
															
 
																+	struct _starpu_fifo_taskq *fifo;
															
 
																 	unsigned worker, worker_ctx;
															
 
																 	int best = -1;
															
@@ -346,7 +344,7 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 
																 			fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
															
 
																 			fifo->exp_end = fifo->exp_start + fifo->exp_len;
															
 
																-			if (!starpu_worker_may_execute_task(worker, task, nimpl))
															
 
																+			if (!starpu_worker_can_execute_task(worker, task, nimpl))
															
 
																 			{
															
 
																 				/* no one on that queue may execute this task */
															
 
																 				continue;
															
@@ -359,21 +357,23 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 
																 			//_STARPU_DEBUG("Scheduler dm: task length (%lf) worker (%u) kernel (%u) \n", local_length,worker,nimpl);
															
 
																 			if (ntasks_best == -1
															
 
																-					|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
															
 
																-					|| (!calibrating && local_length == -1.0) /* Not calibrating but this worker is being calibrated */
															
 
																-					|| (calibrating && local_length == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
															
 
																-					) {
															
 
																+			    || (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
															
 
																+			    || (!calibrating && isnan(local_length)) /* Not calibrating but this worker is being calibrated */
															
 
																+			    || (calibrating && isnan(local_length) && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
															
 
																+				)
															
 
																+			{
															
 
																 				ntasks_best_end = ntasks_end;
															
 
																 				ntasks_best = worker;
															
 
																+				best_impl = nimpl;
															
 
																 			}
															
 
																-			if (local_length == -1.0)
															
 
																+			if (isnan(local_length))
															
 
																 				/* we are calibrating, we want to speed-up calibration time
															
 
																 				 * so we privilege non-calibrated tasks (but still
															
 
																 				 * greedily distribute them to avoid dumb schedules) */
															
 
																 				calibrating = 1;
															
 
																-			if (local_length <= 0.0)
															
 
																+			if (isnan(local_length) || _STARPU_IS_ZERO(local_length))
															
 
																 				/* there is no prediction available for that task
															
 
																 				 * with that arch yet, so switch to a greedy strategy */
															
 
																 				unknown = 1;
															
@@ -394,7 +394,8 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 
																 		}
															
 
																 	}
															
 
																-	if (unknown) {
															
 
																+	if (unknown)
															
 
																+	{
															
 
																 		best = ntasks_best;
															
 
																 		model_best = 0.0;
															
 
																 	}
															
@@ -403,7 +404,7 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 
																 	//_STARPU_DEBUG("Scheduler dm: kernel (%u)\n", best_impl);
															
 
																-	 _starpu_get_job_associated_to_task(task)->nimpl = 0;//best_impl;
															
 
																+	 _starpu_get_job_associated_to_task(task)->nimpl = best_impl;
															
 
																 	/* we should now have the best worker in variable "best" */
															
 
																 	return push_task_on_best_worker(task, best, model_best, prio, sched_ctx_id);
															
@@ -413,7 +414,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 
																 {
															
 
																 	dmda_data *dt = (dmda_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
															
 
																 	/* find the queue */
															
 
																-	struct starpu_fifo_taskq_s *fifo;
															
 
																+	struct _starpu_fifo_taskq *fifo;
															
 
																 	unsigned worker, worker_ctx;
															
 
																 	int best = -1, best_ctx = -1;
															
@@ -423,13 +424,13 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 
																 	unsigned nworkers_ctx = starpu_get_nworkers_of_ctx(sched_ctx_id);
															
 
																 	int *workerids = starpu_get_workers_of_ctx(sched_ctx_id);
															
 
																-	double local_task_length[nworkers_ctx];
															
 
																-	double local_data_penalty[nworkers_ctx];
															
 
																-	double local_power[nworkers_ctx];
															
 
																-	double exp_end[nworkers_ctx];
															
 
																+	double local_task_length[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
															
 
																+	double local_data_penalty[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
															
 
																+	double local_power[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
															
 
																+	double exp_end[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
															
 
																 	double max_exp_end = 0.0;
															
 
																-	double fitness[nworkers_ctx];
															
 
																+	double fitness[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
															
 
																 	double best_exp_end = 10e240;
															
 
																 	double model_best = 0.0;
															
@@ -438,7 +439,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 
																 	int ntasks_best = -1;
															
 
																 	double ntasks_best_end = 0.0;
															
 
																 	int calibrating = 0;
															
 
																-	
															
 
																+
															
 
																 	/* A priori, we know all estimations */
															
 
																 	int unknown = 0;
															
@@ -457,105 +458,109 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 
																 			if (fifo->exp_end > max_exp_end)
															
 
																 				max_exp_end = fifo->exp_end;
															
 
																-			if (!starpu_worker_may_execute_task(worker, task, nimpl))
															
 
																+			if (!starpu_worker_can_execute_task(worker, task, nimpl))
															
 
																 			{
															
 
																 				/* no one on that queue may execute this task */
															
 
																 				continue;
															
 
																 			}
															
 
																 			enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
															
 
																-			local_task_length[worker_ctx] = starpu_task_expected_length(task, perf_arch, nimpl);
															
 
																+			local_task_length[worker_ctx][nimpl] = starpu_task_expected_length(task, perf_arch, nimpl);
															
 
																 			//_STARPU_DEBUG("Scheduler dmda: task length (%lf) worker (%u) kernel (%u) \n", local_task_length[worker],worker,nimpl);
															
 
																 			unsigned memory_node = starpu_worker_get_memory_node(worker);
															
 
																-			local_data_penalty[worker_ctx] = starpu_task_expected_data_transfer_time(memory_node, task);
															
 
																+			local_data_penalty[worker_ctx][nimpl] = starpu_task_expected_data_transfer_time(memory_node, task);
															
 
																 			double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
															
 
																 			if (ntasks_best == -1
															
 
																-					|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
															
 
																-					|| (!calibrating && local_task_length[worker] == -1.0) /* Not calibrating but this worker is being calibrated */
															
 
																-					|| (calibrating && local_task_length[worker] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
															
 
																-					) {
															
 
																+			    || (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
															
 
																+			    || (!calibrating && isnan(local_task_length[worker_ctx][nimpl])) /* Not calibrating but this worker is being calibrated */
															
 
																+			    || (calibrating && isnan(local_task_length[worker_ctx][nimpl]) && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
															
 
																+				)
															
 
																+			{
															
 
																 				ntasks_best_end = ntasks_end;
															
 
																 				ntasks_best = worker;
															
 
																-
															
 
																+				best_impl = nimpl;
															
 
																 			}
															
 
																-			if (local_task_length[worker_ctx] == -1.0)
															
 
																+			if (isnan(local_task_length[worker_ctx][nimpl]))
															
 
																 				/* we are calibrating, we want to speed-up calibration time
															
 
																-			 	* so we privilege non-calibrated tasks (but still
															
 
																-			 	* greedily distribute them to avoid dumb schedules) */
															
 
																+				 * so we privilege non-calibrated tasks (but still
															
 
																+				 * greedily distribute them to avoid dumb schedules) */
															
 
																 				calibrating = 1;
															
 
																-			if (local_task_length[worker_ctx] <= 0.0)
															
 
																+			if (isnan(local_task_length[worker_ctx][nimpl])
															
 
																+					|| _STARPU_IS_ZERO(local_task_length[worker_ctx][nimpl]))
															
 
																 				/* there is no prediction available for that task
															
 
																-			 	* with that arch yet, so switch to a greedy strategy */
															
 
																+				 * with that arch yet, so switch to a greedy strategy */
															
 
																 				unknown = 1;
															
 
																 			if (unknown)
															
 
																 					continue;
															
 
																-			exp_end[worker_ctx] = fifo->exp_start + fifo->exp_len + local_task_length[worker_ctx];
															
 
																+			exp_end[worker_ctx][nimpl] = fifo->exp_start + fifo->exp_len + local_task_length[worker_ctx][nimpl];
															
 
																-			if (exp_end[worker_ctx] < best_exp_end)
															
 
																+			if (exp_end[worker_ctx][nimpl] < best_exp_end)
															
 
																 			{
															
 
																 				/* a better solution was found */
															
 
																-				best_exp_end = exp_end[worker_ctx];
															
 
																+				best_exp_end = exp_end[worker_ctx][nimpl];
															
 
																 				best_impl = nimpl;
															
 
																-
															
 
																 			}
															
 
																-			local_power[worker_ctx] = starpu_task_expected_power(task, perf_arch, nimpl);
															
 
																-			if (local_power[worker_ctx] == -1.0)
															
 
																-				local_power[worker_ctx] = 0.;
															
 
																-			}	
															
 
																-		}
															
 
																+			local_power[worker_ctx][nimpl] = starpu_task_expected_power(task, perf_arch, nimpl);
															
 
																+			if (isnan(local_power[worker_ctx][nimpl]))
															
 
																+				local_power[worker_ctx][nimpl] = 0.;
															
 
																-		if (unknown)
															
 
																-			forced_best = ntasks_best;
															
 
																+		 }
															
 
																+	}
															
 
																+
															
 
																+	if (unknown)
															
 
																+		forced_best = ntasks_best;
															
 
																-		double best_fitness = -1;
															
 
																+	double best_fitness = -1;
															
 
																-		if (forced_best == -1)
															
 
																+	if (forced_best == -1)
															
 
																+	{
															
 
																+		for (worker_ctx = 0; worker_ctx < nworkers_ctx; worker_ctx++)
															
 
																 		{
															
 
																-	        for (worker_ctx = 0; worker_ctx < nworkers_ctx; worker_ctx++)
															
 
																-	        {
															
 
																-		        worker = workerids[worker_ctx];
															
 
																+			worker = workerids[worker_ctx];
															
 
																+			for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
															
 
																+			{	
															
 
																+				if (!starpu_worker_can_execute_task(worker, task, nimpl))
															
 
																+				{
															
 
																+					/* no one on that queue may execute this task */
															
 
																+					continue;
															
 
																+				}
															
 
																-			fifo = dt->queue_array[worker];
															
 
																+					fifo = dt->queue_array[worker];
															
 
																-			if (!starpu_worker_may_execute_task(worker, task, 0))
															
 
																-			{
															
 
																-				/* no one on that queue may execute this task */
															
 
																-				continue;
															
 
																-			}
															
 
																-	
															
 
																-			fitness[worker_ctx] = dt->alpha*(exp_end[worker_ctx] - best_exp_end) 
															
 
																-					+ dt->beta*(local_data_penalty[worker_ctx])
															
 
																-					+ dt->_gamma*(local_power[worker_ctx]);
															
 
																+					fitness[worker_ctx][nimpl] = dt->alpha*(exp_end[worker_ctx][nimpl] - best_exp_end) 
															
 
																+					+ dt->beta*(local_data_penalty[worker_ctx][nimpl])
															
 
																+					+ dt->_gamma*(local_power[worker_ctx][nimpl]);
															
 
																-			if (exp_end[worker_ctx] > max_exp_end)
															
 
																+			if (exp_end[worker_ctx][nimpl] > max_exp_end)
															
 
																 				/* This placement will make the computation
															
 
																 				 * longer, take into account the idle
															
 
																 				 * consumption of other cpus */
															
 
																-				fitness[worker_ctx] += dt->_gamma * dt->idle_power * (exp_end[worker_ctx] - max_exp_end) / 1000000.0;
															
 
																+				fitness[worker_ctx][nimpl] += dt->_gamma * dt->idle_power * (exp_end[worker_ctx][nimpl] - max_exp_end) / 1000000.0;
															
 
																-			if (best == -1 || fitness[worker_ctx] < best_fitness)
															
 
																+			if (best == -1 || fitness[worker_ctx][nimpl] < best_fitness)
															
 
																 			{
															
 
																 				/* we found a better solution */
															
 
																-				best_fitness = fitness[worker_ctx];
															
 
																+				best_fitness = fitness[worker_ctx][nimpl];
															
 
																 				best = worker;
															
 
																 				best_ctx = worker_ctx;
															
 
																+				best_impl = nimpl;
															
 
																-	//			_STARPU_DEBUG("best fitness (worker %d) %e = alpha*(%e) + beta(%e) +gamma(%e)\n", worker, best_fitness, exp_end[worker] - best_exp_end, local_data_penalty[worker], local_power[worker]);
															
 
																+				//			_STARPU_DEBUG("best fitness (worker %d) %e = alpha*(%e) + beta(%e) +gamma(%e)\n", worker, best_fitness, exp_end[worker][nimpl] - best_exp_end, local_data_penalty[worker][nimpl], local_power[worker][nimpl]);
															
 
																 			}
															
 
																 		}
															
 
																 	}
															
 
																 	STARPU_ASSERT(forced_best != -1 || best != -1);
															
 
																-	
															
 
																+
															
 
																 	if (forced_best != -1)
															
 
																 	{
															
 
																 		/* there is no prediction available for that task
															
@@ -565,10 +570,10 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 
																 		model_best = 0.0;
															
 
																 		//penality_best = 0.0;
															
 
																 	}
															
 
																-	else 
															
 
																+	else
															
 
																 	{
															
 
																-		model_best = local_task_length[best];
															
 
																-		//penality_best = local_data_penalty[best];
															
 
																+		model_best = local_task_length[best_ctx][best_impl];
															
 
																+		//penality_best = local_data_penalty[best_ctx][best_impl];
															
 
																 	}
															
@@ -581,7 +586,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 
																 static int dmda_push_sorted_task(struct starpu_task *task, unsigned sched_ctx_id)
															
 
																 {
															
 
																-	return _dmda_push_task(task, 2, sched_ctx_id);
															
 
																+	return _dmda_push_task(task, 1, sched_ctx_id);
															
 
																 }
															
 
																 static int dm_push_task(struct starpu_task *task, unsigned sched_ctx_id)
															
@@ -680,11 +685,13 @@ static void deinitialize_dmda_policy(unsigned sched_ctx_id)
 
																 }
															
 
																 /* TODO: use post_exec_hook to fix the expected start */
															
 
																-struct starpu_sched_policy_s _starpu_sched_dm_policy = {
															
 
																+struct starpu_sched_policy _starpu_sched_dm_policy =
															
 
																+{
															
 
																 	.init_sched = initialize_dmda_policy,
															
 
																 	.deinit_sched = deinitialize_dmda_policy,
															
 
																-	.push_task = dm_push_task, 
															
 
																+	.push_task = dm_push_task,
															
 
																 	.pop_task = dmda_pop_task,
															
 
																+	.pre_exec_hook = NULL,
															
 
																 	.post_exec_hook = NULL,
															
 
																 	.pop_every_task = dmda_pop_every_task,
															
 
																 	.policy_name = "dm",
															
@@ -692,11 +699,13 @@ struct starpu_sched_policy_s _starpu_sched_dm_policy = {
 
																 	.init_sched_for_workers = initialize_dmda_policy_for_workers
															
 
																 };
															
 
																-struct starpu_sched_policy_s _starpu_sched_dmda_policy = {
															
 
																+struct starpu_sched_policy _starpu_sched_dmda_policy =
															
 
																+{
															
 
																 	.init_sched = initialize_dmda_policy,
															
 
																 	.deinit_sched = deinitialize_dmda_policy,
															
 
																-	.push_task = dmda_push_task, 
															
 
																+	.push_task = dmda_push_task,
															
 
																 	.pop_task = dmda_pop_task,
															
 
																+	.pre_exec_hook = NULL,
															
 
																 	.post_exec_hook = NULL,
															
 
																 	.pop_every_task = dmda_pop_every_task,
															
 
																 	.policy_name = "dmda",
															
@@ -704,11 +713,13 @@ struct starpu_sched_policy_s _starpu_sched_dmda_policy = {
 
																 	.init_sched_for_workers = initialize_dmda_policy_for_workers
															
 
																 };
															
 
																-struct starpu_sched_policy_s _starpu_sched_dmda_sorted_policy = {
															
 
																+struct starpu_sched_policy _starpu_sched_dmda_sorted_policy =
															
 
																+{
															
 
																 	.init_sched = initialize_dmda_sorted_policy,
															
 
																 	.deinit_sched = deinitialize_dmda_policy,
															
 
																-	.push_task = dmda_push_sorted_task, 
															
 
																+	.push_task = dmda_push_sorted_task,
															
 
																 	.pop_task = dmda_pop_ready_task,
															
 
																+	.pre_exec_hook = NULL,
															
 
																 	.post_exec_hook = NULL,
															
 
																 	.pop_every_task = dmda_pop_every_task,
															
 
																 	.policy_name = "dmdas",
															
@@ -716,11 +727,13 @@ struct starpu_sched_policy_s _starpu_sched_dmda_sorted_policy = {
 
																 	.init_sched_for_workers = initialize_dmda_policy_for_workers
															
 
																 };
															
 
																-struct starpu_sched_policy_s _starpu_sched_dmda_ready_policy = {
															
 
																+struct starpu_sched_policy _starpu_sched_dmda_ready_policy =
															
 
																+{
															
 
																 	.init_sched = initialize_dmda_policy,
															
 
																 	.deinit_sched = deinitialize_dmda_policy,
															
 
																-	.push_task = dmda_push_task, 
															
 
																+	.push_task = dmda_push_task,
															
 
																 	.pop_task = dmda_pop_ready_task,
															
 
																+	.pre_exec_hook = NULL,
															
 
																 	.post_exec_hook = NULL,
															
 
																 	.pop_every_task = dmda_pop_every_task,
															
 
																 	.policy_name = "dmdar",
															
--- a/src/sched_policies/deque_queues.c
+++ b/src/sched_policies/deque_queues.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010-2011  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  * Copyright (C) 2011  Télécom-SudParis
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -25,13 +25,13 @@
 
																 #include <errno.h>
															
 
																 #include <common/utils.h>
															
 
																-struct starpu_deque_jobq_s *_starpu_create_deque(void)
															
 
																+struct _starpu_deque_jobq *_starpu_create_deque(void)
															
 
																 {
															
 
																-	struct starpu_deque_jobq_s *deque;
															
 
																-	deque = (struct starpu_deque_jobq_s *) malloc(sizeof(struct starpu_deque_jobq_s));
															
 
																+	struct _starpu_deque_jobq *deque;
															
 
																+	deque = (struct _starpu_deque_jobq *) malloc(sizeof(struct _starpu_deque_jobq));
															
 
																 	/* note that not all mechanisms (eg. the semaphore) have to be used */
															
 
																-	deque->jobq = starpu_job_list_new();
															
 
																+	deque->jobq = _starpu_job_list_new();
															
 
																 	deque->njobs = 0;
															
 
																 	deque->nprocessed = 0;
															
@@ -42,25 +42,25 @@ struct starpu_deque_jobq_s *_starpu_create_deque(void)
 
																 	return deque;
															
 
																 }
															
 
																-void _starpu_destroy_deque(struct starpu_deque_jobq_s *deque)
															
 
																+void _starpu_destroy_deque(struct _starpu_deque_jobq *deque)
															
 
																 {
															
 
																-	starpu_job_list_delete(deque->jobq);
															
 
																+	_starpu_job_list_delete(deque->jobq);
															
 
																 	free(deque);
															
 
																 }
															
 
																-unsigned _starpu_get_deque_njobs(struct starpu_deque_jobq_s *deque_queue)
															
 
																+unsigned _starpu_get_deque_njobs(struct _starpu_deque_jobq *deque_queue)
															
 
																 {
															
 
																 	return deque_queue->njobs;
															
 
																 }
															
 
																-unsigned _starpu_get_deque_nprocessed(struct starpu_deque_jobq_s *deque_queue)
															
 
																+int _starpu_get_deque_nprocessed(struct _starpu_deque_jobq *deque_queue)
															
 
																 {
															
 
																 	return deque_queue->nprocessed;
															
 
																 }
															
 
																-struct starpu_task *_starpu_deque_pop_task(struct starpu_deque_jobq_s *deque_queue, int workerid __attribute__ ((unused)))
															
 
																+struct starpu_task *_starpu_deque_pop_task(struct _starpu_deque_jobq *deque_queue, int workerid)
															
 
																 {
															
 
																-	starpu_job_t j = NULL;
															
 
																+	struct _starpu_job *j = NULL;
															
 
																 	if ((deque_queue->njobs == 0) && _starpu_machine_is_running())
															
 
																 	{
															
@@ -68,66 +68,72 @@ struct starpu_task *_starpu_deque_pop_task(struct starpu_deque_jobq_s *deque_que
 
																 	}
															
 
																 	/* TODO find a task that suits workerid */
															
 
																-	if (deque_queue->njobs > 0) 
															
 
																+	for (j  = _starpu_job_list_begin(deque_queue->jobq);
															
 
																+	     j != _starpu_job_list_end(deque_queue->jobq);
															
 
																+	     j  = _starpu_job_list_next(j))
															
 
																 	{
															
 
																-		/* there is a task */
															
 
																-		j = starpu_job_list_pop_front(deque_queue->jobq);
															
 
																-	
															
 
																+		unsigned nimpl;
															
 
																 		STARPU_ASSERT(j);
															
 
																-		deque_queue->njobs--;
															
 
																-		
															
 
																-		STARPU_TRACE_JOB_POP(j, 0);
															
 
																+
															
 
																+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
															
 
																+			if (starpu_worker_can_execute_task(workerid, j->task, nimpl))
															
 
																+			{
															
 
																+				j->nimpl = nimpl;
															
 
																+				j = _starpu_job_list_pop_front(deque_queue->jobq);
															
 
																+				_STARPU_TRACE_JOB_POP(j, 0);
															
 
																+				return j->task;
															
 
																+			}
															
 
																 	}
															
 
																-	
															
 
																-	return j->task;
															
 
																+
															
 
																+	return NULL;
															
 
																 }
															
 
																-struct starpu_job_list_s *_starpu_deque_pop_every_task(struct starpu_deque_jobq_s *deque_queue, pthread_mutex_t *sched_mutex, int workerid)
															
 
																+struct _starpu_job_list *_starpu_deque_pop_every_task(struct _starpu_deque_jobq *deque_queue, pthread_mutex_t *sched_mutex, int workerid)
															
 
																 {
															
 
																-	struct starpu_job_list_s *new_list, *old_list;
															
 
																+	struct _starpu_job_list *new_list, *old_list;
															
 
																 	/* block until some task is available in that queue */
															
 
																-	PTHREAD_MUTEX_LOCK(sched_mutex);
															
 
																+	_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
															
 
																 	if (deque_queue->njobs == 0)
															
 
																 	{
															
 
																 		new_list = NULL;
															
 
																 	}
															
 
																-	else {
															
 
																+	else
															
 
																+	{
															
 
																 		/* there is a task */
															
 
																 		old_list = deque_queue->jobq;
															
 
																-		new_list = starpu_job_list_new();
															
 
																+		new_list = _starpu_job_list_new();
															
 
																 		unsigned new_list_size = 0;
															
 
																-		starpu_job_itor_t i;
															
 
																-		starpu_job_t next_job;
															
 
																+		struct _starpu_job *i;
															
 
																+		struct _starpu_job *next_job;
															
 
																 		/* note that this starts at the _head_ of the list, so we put
															
 
																  		 * elements at the back of the new list */
															
 
																-		for(i = starpu_job_list_begin(old_list);
															
 
																-			i != starpu_job_list_end(old_list);
															
 
																+		for(i = _starpu_job_list_begin(old_list);
															
 
																+			i != _starpu_job_list_end(old_list);
															
 
																 			i  = next_job)
															
 
																 		{
															
 
																-			next_job = starpu_job_list_next(i);
															
 
																+			unsigned nimpl;
															
 
																+			next_job = _starpu_job_list_next(i);
															
 
																-			/* In case there are multiples implementations of the
															
 
																- 			 * codelet for a single device, We dont really care
															
 
																-			 * about the implementation used, so let's try the 
															
 
																-			 * first one. */
															
 
																-			if (starpu_worker_may_execute_task(workerid, i->task, 0))
															
 
																+			for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
															
 
																+			if (starpu_worker_can_execute_task(workerid, i->task, nimpl))
															
 
																 			{
															
 
																 				/* this elements can be moved into the new list */
															
 
																 				new_list_size++;
															
 
																-				
															
 
																-				starpu_job_list_erase(old_list, i);
															
 
																-				starpu_job_list_push_back(new_list, i);
															
 
																+
															
 
																+				_starpu_job_list_erase(old_list, i);
															
 
																+				_starpu_job_list_push_back(new_list, i);
															
 
																+				i->nimpl = nimpl;
															
 
																 			}
															
 
																 		}
															
 
																 		if (new_list_size == 0)
															
 
																 		{
															
 
																 			/* the new list is empty ... */
															
 
																-			starpu_job_list_delete(new_list);
															
 
																+			_starpu_job_list_delete(new_list);
															
 
																 			new_list = NULL;
															
 
																 		}
															
 
																 		else
															
@@ -135,8 +141,8 @@ struct starpu_job_list_s *_starpu_deque_pop_every_task(struct starpu_deque_jobq_
 
																 			deque_queue->njobs -= new_list_size;
															
 
																 		}
															
 
																 	}
															
 
																-	
															
 
																-	PTHREAD_MUTEX_UNLOCK(sched_mutex);
															
 
																+
															
 
																+	_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
															
 
																 	return new_list;
															
 
																 }
															
--- a/src/sched_policies/deque_queues.h
+++ b/src/sched_policies/deque_queues.h
@@ -23,15 +23,16 @@
 
																 #include <common/config.h>
															
 
																 #include <core/jobs.h>
															
 
																-struct starpu_deque_jobq_s {
															
 
																+struct _starpu_deque_jobq
															
 
																+{
															
 
																 	/* the actual list */
															
 
																-	starpu_job_list_t jobq;
															
 
																+	struct _starpu_job_list *jobq;
															
 
																 	/* the number of tasks currently in the queue */
															
 
																 	unsigned njobs;
															
 
																 	/* the number of tasks that were processed */
															
 
																-	unsigned nprocessed;
															
 
																+	int nprocessed;
															
 
																 	/* only meaningful if the queue is only used by a single worker */
															
 
																 	double exp_start; /* Expected start date of first task in the queue */
															
@@ -39,14 +40,14 @@ struct starpu_deque_jobq_s {
 
																 	double exp_len; /* Expected duration of the set of tasks in the queue */
															
 
																 };
															
 
																-struct starpu_deque_jobq_s *_starpu_create_deque(void);
															
 
																-void _starpu_destroy_deque(struct starpu_deque_jobq_s *deque);
															
 
																+struct _starpu_deque_jobq *_starpu_create_deque(void);
															
 
																+void _starpu_destroy_deque(struct _starpu_deque_jobq *deque);
															
 
																-struct starpu_task *_starpu_deque_pop_task(struct starpu_deque_jobq_s *deque_queue, int workerid);
															
 
																-struct starpu_job_list_s *_starpu_deque_pop_every_task(struct starpu_deque_jobq_s *deque_queue, pthread_mutex_t *sched_mutex, int workerid);
															
 
																+struct starpu_task *_starpu_deque_pop_task(struct _starpu_deque_jobq *deque_queue, int workerid);
															
 
																+struct _starpu_job_list *_starpu_deque_pop_every_task(struct _starpu_deque_jobq *deque_queue, pthread_mutex_t *sched_mutex, int workerid);
															
 
																-unsigned _starpu_get_deque_njobs(struct starpu_deque_jobq_s *deque_queue);
															
 
																-unsigned _starpu_get_deque_nprocessed(struct starpu_deque_jobq_s *deque_queue);
															
 
																+unsigned _starpu_get_deque_njobs(struct _starpu_deque_jobq *deque_queue);
															
 
																+int _starpu_get_deque_nprocessed(struct _starpu_deque_jobq *deque_queue);
															
 
																 #endif // __DEQUE_QUEUES_H__
															
--- a/src/sched_policies/detect_combined_workers.c
+++ b/src/sched_policies/detect_combined_workers.c
@@ -1,6 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010-2011  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010-2012  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2011, 2012       Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -18,12 +19,14 @@
 
																 #include <starpu.h>
															
 
																 #include <common/utils.h>
															
 
																 #include <core/workers.h>
															
 
																+#include <math.h>
															
 
																+#include <sched_policies/detect_combined_workers.h>
															
 
																 #ifdef STARPU_HAVE_HWLOC
															
 
																 #include <hwloc.h>
															
 
																-/* tree_t
															
 
																- * ======
															
 
																+/* struct _starpu_tree
															
 
																+ * ==================
															
 
																  * Purpose
															
 
																  * =======
															
 
																  * Structure representing a tree (which can be a sub-tree itself) whose root is an hwloc
															
@@ -39,12 +42,12 @@
 
																  * workers		CPU-workers found by recursion in all the sub-trees and in this very one, represented as leaves in hwloc.
															
 
																  */
															
 
																-typedef struct tree_s{
															
 
																+struct _starpu_tree
															
 
																+{
															
 
																     hwloc_obj_t obj;
															
 
																     unsigned nb_workers;
															
 
																     int *workers;
															
 
																-} tree_t;
															
 
																-
															
 
																+};
															
 
																 /* gather_trees
															
 
																  * ============
															
@@ -65,7 +68,7 @@ typedef struct tree_s{
 
																  *			Number of trees we want to combine (size of the array).
															
 
																  */
															
 
																-static void gather_trees(tree_t *target_tree, tree_t *source_trees, unsigned nb_source_trees)
															
 
																+static void gather_trees(struct _starpu_tree *target_tree, struct _starpu_tree *source_trees, unsigned nb_source_trees)
															
 
																 {
															
 
																     unsigned tree_id, worker_id, index = 0;
															
 
																     for(tree_id = 0; tree_id < nb_source_trees; ++tree_id)
															
@@ -99,7 +102,7 @@ static void gather_trees(tree_t *target_tree, tree_t *source_trees, unsigned nb_
 
																  *			Maximum size of a combined worker.
															
 
																  */
															
 
																-static unsigned assign_multiple_trees(tree_t *trees, unsigned nb_trees, int min_size, int max_size)
															
 
																+static unsigned assign_multiple_trees(struct _starpu_tree *trees, unsigned nb_trees, unsigned int min_size, unsigned int max_size)
															
 
																 {
															
 
																     unsigned short complete = 0;
															
 
																     unsigned tree_id, tree_id2, nb_workers_tree, nb_workers_tree2, worker_id, nb_workers_total = 0, nb_workers_assigned = 0;
															
@@ -197,19 +200,19 @@ static unsigned assign_multiple_trees(tree_t *trees, unsigned nb_trees, int min_
 
																  *			Maximum size of a combined worker.
															
 
																  */
															
 
																-static unsigned find_and_assign_combinations_with_hwloc_recursive(tree_t *tree, int min_size, int max_size)
															
 
																+static unsigned find_and_assign_combinations_with_hwloc_recursive(struct _starpu_tree *tree, unsigned int min_size, unsigned int max_size)
															
 
																 {
															
 
																     unsigned subtree_id, nb_workers = 0;
															
 
																     hwloc_obj_t obj = tree->obj;
															
 
																     int *workers = tree->workers;
															
 
																-    struct starpu_machine_config_s *config = _starpu_get_machine_config();
															
 
																+    struct _starpu_machine_config *config = _starpu_get_machine_config();
															
 
																     /* Is this a leaf ? (eg. a PU for hwloc) */
															
 
																     if (!hwloc_compare_types(config->cpu_depth, obj->depth))
															
 
																     {
															
 
																-	struct starpu_worker_s *worker = obj->userdata;
															
 
																+	struct _starpu_worker *worker = obj->userdata;
															
 
																 	/* If this is a CPU worker add it at the beginning
															
 
																 	 * of the array , write 1 in the field nb_workers and
															
@@ -229,7 +232,7 @@ static unsigned find_and_assign_combinations_with_hwloc_recursive(tree_t *tree,
 
																     /* If there is only one child, we go to the next level right away */
															
 
																     if (obj->arity == 1)
															
 
																     {
															
 
																-	tree_t subtree = *tree;
															
 
																+	struct _starpu_tree subtree = *tree;
															
 
																 	subtree.obj = obj->children[0];
															
 
																 	nb_workers = find_and_assign_combinations_with_hwloc_recursive(&subtree, min_size, max_size);
															
 
																 	tree->nb_workers = nb_workers;
															
@@ -240,12 +243,12 @@ static unsigned find_and_assign_combinations_with_hwloc_recursive(tree_t *tree,
 
																      * CPU leaves that fits between min and max. */
															
 
																     /* We allocate an array of tree structures which will contain the current node's subtrees data */
															
 
																-    tree_t *subtrees = (tree_t *) malloc(obj->arity * sizeof(tree_t));
															
 
																+    struct _starpu_tree *subtrees = (struct _starpu_tree *) malloc(obj->arity * sizeof(struct _starpu_tree));
															
 
																     /* We allocate the array containing the workers of each subtree and initialize the fields left */
															
 
																     for(subtree_id = 0; subtree_id < obj->arity; ++subtree_id)
															
 
																     {
															
 
																-	tree_t *subtree = subtrees + subtree_id;
															
 
																+	struct _starpu_tree *subtree = subtrees + subtree_id;
															
 
																 	subtree->obj = obj->children[subtree_id];
															
 
																 	subtree->nb_workers = 0;
															
@@ -317,7 +320,7 @@ static unsigned find_and_assign_combinations_with_hwloc_recursive(tree_t *tree,
 
																  *			Topology of the machine : used to know the number of cpus.
															
 
																  */
															
 
																-static void get_min_max_sizes(int *min_size, int *max_size, struct starpu_machine_topology_s *topology)
															
 
																+static void get_min_max_sizes(unsigned int *min_size, unsigned int *max_size, struct starpu_machine_topology *topology)
															
 
																 {
															
 
																     int _min_size, _max_size;
															
@@ -330,8 +333,8 @@ static void get_min_max_sizes(int *min_size, int *max_size, struct starpu_machin
 
																     {
															
 
																 	int nb_cpus = topology->nhwcpus;
															
 
																-	int sqrt_nb_cpus = sqrt(nb_cpus);
															
 
																-	short exact = (sqrt_nb_cpus * sqrt_nb_cpus == nb_cpus);
															
 
																+	int sqrt_nb_cpus = (int)sqrt((double)nb_cpus);
															
 
																+	int exact = (sqrt_nb_cpus * sqrt_nb_cpus == nb_cpus);
															
 
																 	    if(_min_size == -1)
															
 
																 	    {
															
@@ -373,19 +376,19 @@ static void get_min_max_sizes(int *min_size, int *max_size, struct starpu_machin
 
																  *			to get the hwloc tree.
															
 
																  */
															
 
																-static void find_and_assign_combinations_with_hwloc(struct starpu_machine_topology_s *topology)
															
 
																+static void find_and_assign_combinations_with_hwloc(struct starpu_machine_topology *topology)
															
 
																 {
															
 
																     unsigned nb_workers;
															
 
																-    int min_size, max_size;
															
 
																+    unsigned int min_size, max_size;
															
 
																     get_min_max_sizes(&min_size, &max_size, topology);
															
 
																     STARPU_ASSERT(min_size <= max_size);
															
 
																-    tree_t tree;
															
 
																+    struct _starpu_tree tree;
															
 
																     /* Of course we start from the root */
															
 
																-    tree.obj = hwloc_get_obj_by_depth(topology->hwtopology, HWLOC_OBJ_SYSTEM, 0); 
															
 
																+    tree.obj = hwloc_get_obj_by_depth(topology->hwtopology, HWLOC_OBJ_SYSTEM, 0);
															
 
																     tree.nb_workers = 0;
															
 
																     tree.workers = (int *) malloc(topology->nhwcpus * sizeof(int));
															
@@ -399,7 +402,7 @@ static void find_and_assign_combinations_with_hwloc(struct starpu_machine_topolo
 
																     {
															
 
																 	/* find_and_assign_combinations_with_hwloc_recursive shouldn't return
															
 
																 	 * while there are enough workers to assign regarding the min_size value */
															
 
																-	STARPU_ASSERT(nb_workers < max_size);
															
 
																+	STARPU_ASSERT(nb_workers <= max_size);
															
 
																 	int ret = starpu_combined_worker_assign_workerid(nb_workers, tree.workers);
															
 
																 	STARPU_ASSERT(ret >= 0);
															
@@ -410,9 +413,9 @@ static void find_and_assign_combinations_with_hwloc(struct starpu_machine_topolo
 
																 #else /* STARPU_HAVE_HWLOC */
															
 
																-static void find_and_assign_combinations_without_hwloc(struct starpu_machine_topology_s *topology)
															
 
																+static void find_and_assign_combinations_without_hwloc(struct starpu_machine_topology *topology)
															
 
																 {
															
 
																-    struct starpu_machine_config_s *config = _starpu_get_machine_config();
															
 
																+    struct _starpu_machine_config *config = _starpu_get_machine_config();
															
 
																     /* We put the id of all CPU workers in this array */
															
 
																     int cpu_workers[STARPU_NMAXWORKERS];
															
@@ -440,7 +443,7 @@ static void find_and_assign_combinations_without_hwloc(struct starpu_machine_top
 
																 		/* We register this combination */
															
 
																 		int ret;
															
 
																-		ret = starpu_combined_worker_assign_workerid(size, workerids); 
															
 
																+		ret = starpu_combined_worker_assign_workerid(size, workerids);
															
 
																 		STARPU_ASSERT(ret >= 0);
															
 
																 	    }
															
 
																 	}
															
@@ -449,9 +452,9 @@ static void find_and_assign_combinations_without_hwloc(struct starpu_machine_top
 
																 #endif /* STARPU_HAVE_HWLOC */
															
 
																-static void combine_all_cpu_workers(struct starpu_machine_topology_s *topology)
															
 
																+static void combine_all_cpu_workers(struct starpu_machine_topology *topology)
															
 
																 {
															
 
																-    struct starpu_machine_config_s *config = _starpu_get_machine_config();
															
 
																+    struct _starpu_machine_config *config = _starpu_get_machine_config();
															
 
																     int cpu_workers[STARPU_NMAXWORKERS];
															
 
																     unsigned ncpus = 0;
															
@@ -463,21 +466,22 @@ static void combine_all_cpu_workers(struct starpu_machine_topology_s *topology)
 
																 	    cpu_workers[ncpus++] = i;
															
 
																     }
															
 
																-    if (ncpus > 0)
															
 
																+    for (i = 1; i <= ncpus; i++)
															
 
																     {
															
 
																 	int ret;
															
 
																-	ret = starpu_combined_worker_assign_workerid(ncpus, cpu_workers);
															
 
																+	ret = starpu_combined_worker_assign_workerid(i, cpu_workers);
															
 
																 	STARPU_ASSERT(ret >= 0);
															
 
																     }
															
 
																 }
															
 
																-void _starpu_sched_find_worker_combinations(struct starpu_machine_topology_s *topology)
															
 
																+void _starpu_sched_find_worker_combinations(struct starpu_machine_topology *topology)
															
 
																 {
															
 
																-    struct starpu_machine_config_s *config = _starpu_get_machine_config();
															
 
																+    struct _starpu_machine_config *config = _starpu_get_machine_config();
															
 
																-    if (config->user_conf && config->user_conf->single_combined_worker > 0 || starpu_get_env_number("STARPU_SINGLE_COMBINED_WORKER") > 0)
															
 
																+    if ((config->user_conf && config->user_conf->single_combined_worker > 0) || starpu_get_env_number("STARPU_SINGLE_COMBINED_WORKER") > 0)
															
 
																 	combine_all_cpu_workers(topology);
															
 
																-    else {
															
 
																+    else
															
 
																+    {
															
 
																 #ifdef STARPU_HAVE_HWLOC
															
 
																 	find_and_assign_combinations_with_hwloc(topology);
															
 
																 #else
															
--- a/src/sched_policies/detect_combined_workers.h
+++ b/src/sched_policies/detect_combined_workers.h
@@ -0,0 +1,21 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2011 Centre National de la Recherche Scientifique
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <starpu.h>
															
 
																+
															
 
																+/* Initialize combined workers */
															
 
																+void _starpu_sched_find_worker_combinations(struct starpu_machine_topology *topology);
															
 
																+
															
--- a/src/sched_policies/eager_central_policy.c
+++ b/src/sched_policies/eager_central_policy.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010-2011  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  * Copyright (C) 2011  INRIA
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -25,7 +25,7 @@
 
																 #include <sched_policies/fifo_queues.h>
															
 
																 typedef struct eager_center_policy_data {
															
 
																-	struct starpu_fifo_taskq_s *fifo;
															
 
																+	struct _starpu_fifo_taskq *fifo;
															
 
																 	pthread_mutex_t sched_mutex;
															
 
																 	pthread_cond_t sched_cond;
															
 
																 } eager_center_policy_data;
															
@@ -109,7 +109,7 @@ static int push_task_eager_policy(struct starpu_task *task, unsigned sched_ctx_i
 
																 		_starpu_increment_nsubmitted_tasks_of_worker(workerid);
															
 
																 	}
															
 
																-	struct starpu_fifo_taskq_s *fifo = data->fifo;
															
 
																+	struct _starpu_fifo_taskq *fifo = data->fifo;
															
 
																 	return _starpu_fifo_push_task(fifo, &data->sched_mutex, &data->sched_cond, task);
															
 
																 }
															
@@ -118,17 +118,17 @@ static struct starpu_task *pop_every_task_eager_policy(unsigned sched_ctx_id)
 
																 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
															
 
																 	struct eager_center_policy_data *data = (struct eager_center_policy_data*)sched_ctx->policy_data;
															
 
																-	struct starpu_fifo_taskq_s *fifo = data->fifo;
															
 
																+	struct _starpu_fifo_taskq *fifo = data->fifo; /* queue held in this context's policy data */
															
 
																 	return _starpu_fifo_pop_every_task(fifo, &data->sched_mutex, starpu_worker_get_id());
															
 
																 }
															
 
																 static struct starpu_task *pop_task_eager_policy(unsigned sched_ctx_id)
															
 
																 {
															
 
																-        unsigned workerid = starpu_worker_get_id();
															
 
																+    unsigned workerid = starpu_worker_get_id();
															
 
																 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
															
 
																 	struct eager_center_policy_data *data = (struct eager_center_policy_data*)sched_ctx->policy_data;
															
 
																-	struct starpu_fifo_taskq_s *fifo = data->fifo;
															
 
																+	struct _starpu_fifo_taskq *fifo = data->fifo; /* queue held in this context's policy data */
															
 
																 	struct starpu_task *task =  _starpu_fifo_pop_task(fifo, workerid);
															
 
																 	if(task)
															
@@ -144,12 +144,14 @@ static struct starpu_task *pop_task_eager_policy(unsigned sched_ctx_id)
 
																 	return task;
															
 
																 }
															
 
																-struct starpu_sched_policy_s _starpu_sched_eager_policy = {
															
 
																+struct starpu_sched_policy _starpu_sched_eager_policy =
															
 
																+{
															
 
																 	.init_sched = initialize_eager_center_policy,
															
 
																 	.init_sched_for_workers = initialize_eager_center_policy_for_workers,
															
 
																 	.deinit_sched = deinitialize_eager_center_policy,
															
 
																 	.push_task = push_task_eager_policy,
															
 
																 	.pop_task = pop_task_eager_policy,
															
 
																+	.pre_exec_hook = NULL,
															
 
																 	.post_exec_hook = NULL,
															
 
																 	.pop_every_task = pop_every_task_eager_policy,
															
 
																 	.policy_name = "eager",
															
--- a/src/sched_policies/eager_central_priority_policy.c
+++ b/src/sched_policies/eager_central_priority_policy.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010-2011  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
															
 
																  * Copyright (C) 2011  INRIA
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -32,8 +32,9 @@
 
																 #define NPRIO_LEVELS	(MAX_LEVEL - MIN_LEVEL + 1)
															
 
																-struct starpu_priority_taskq_s {
															
 
																-	/* the actual lists 
															
 
																+struct starpu_priority_taskq_s
															
 
																+{
															
 
																+	/* the actual lists
															
 
																 	 *	taskq[p] is for priority [p - STARPU_MIN_PRIO] */
															
 
																 	struct starpu_task_list taskq[NPRIO_LEVELS];
															
 
																 	unsigned ntasks[NPRIO_LEVELS];
															
@@ -42,19 +43,19 @@ struct starpu_priority_taskq_s {
 
																 };
															
 
																 typedef struct eager_central_prio_data{
															
 
																-	struct starpu_priority_taskq_s *taskq;
															
 
																+	struct starpu_priority_taskq_s *taskq; /* same tag as the queue struct defined above */
															
 
																 	pthread_mutex_t sched_mutex;
															
 
																 	pthread_cond_t sched_cond;
															
 
																 } eager_central_prio_data;
															
 
																 /*
															
 
																- * Centralized queue with priorities 
															
 
																+ * Centralized queue with priorities
															
 
																  */
															
 
																 static struct starpu_priority_taskq_s *_starpu_create_priority_taskq(void)
															
 
																 {
															
 
																 	struct starpu_priority_taskq_s *central_queue;
															
 
																-	
															
 
																+
															
 
																 	central_queue = (struct starpu_priority_taskq_s *) malloc(sizeof(struct starpu_priority_taskq_s));
															
 
																 	central_queue->total_ntasks = 0;
															
@@ -152,24 +153,25 @@ static int _starpu_priority_push_task(struct starpu_task *task, unsigned sched_c
 
																 	struct starpu_priority_taskq_s *taskq = data->taskq;
															
 
																 	/* wake people waiting for a task */
															
 
																-	PTHREAD_MUTEX_LOCK(&data->sched_mutex);
															
 
																+	_STARPU_PTHREAD_MUTEX_LOCK(&data->sched_mutex);
															
 
																+
															
 
																+	_STARPU_TRACE_JOB_PUSH(task, 1);
															
 
																-	STARPU_TRACE_JOB_PUSH(task, 1);
															
 
																-	
															
 
																 	unsigned priolevel = task->priority - STARPU_MIN_PRIO;
															
 
																 	starpu_task_list_push_front(&taskq->taskq[priolevel], task);
															
 
																 	taskq->ntasks[priolevel]++;
															
 
																 	taskq->total_ntasks++;
															
 
																-	PTHREAD_COND_SIGNAL(&data->sched_cond);
															
 
																-	PTHREAD_MUTEX_UNLOCK(&data->sched_mutex);
															
 
																+	_STARPU_PTHREAD_COND_SIGNAL(&data->sched_cond);
															
 
																+	_STARPU_PTHREAD_MUTEX_UNLOCK(&data->sched_mutex);
															
 
																 	return 0;
															
 
																 }
															
 
																 static struct starpu_task *_starpu_priority_pop_task(unsigned sched_ctx_id)
															
 
																 {
															
 
																+	/* XXX FIXME: should call starpu_worker_can_execute_task!! */
															
 
																 	struct starpu_task *task = NULL;
															
 
																 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
															
@@ -183,39 +185,45 @@ static struct starpu_task *_starpu_priority_pop_task(unsigned sched_ctx_id)
 
																 	if ((taskq->total_ntasks == 0) && _starpu_machine_is_running())
															
 
																 	{
															
 
																 #ifdef STARPU_NON_BLOCKING_DRIVERS
															
 
																-		PTHREAD_MUTEX_UNLOCK(&data->sched_mutex);
															
 
																+		_STARPU_PTHREAD_MUTEX_UNLOCK(&data->sched_mutex);
															
 
																 		return NULL;
															
 
																 #else
															
 
																-		PTHREAD_COND_WAIT(&data->sched_cond, &data->sched_mutex);
															
 
																+		_STARPU_PTHREAD_COND_WAIT(&data->sched_cond, &data->sched_mutex);
															
 
																 #endif
															
 
																 	}
															
 
																 	if (taskq->total_ntasks > 0)
															
 
																 	{
															
 
																 		unsigned priolevel = NPRIO_LEVELS - 1;
															
 
																-		do {
															
 
																-			if (taskq->ntasks[priolevel] > 0) {
															
 
																+		do
															
 
																+		{
															
 
																+			if (taskq->ntasks[priolevel] > 0)
															
 
																+			{
															
 
																 				/* there is some task that we can grab */
															
 
																 				task = starpu_task_list_pop_back(&taskq->taskq[priolevel]);
															
 
																 				taskq->ntasks[priolevel]--;
															
 
																 				taskq->total_ntasks--;
															
 
																-				STARPU_TRACE_JOB_POP(task, 0);
															
 
																+				_STARPU_TRACE_JOB_POP(task, 0);
															
 
																 			}
															
 
																-		} while (!task && priolevel-- > 0);
															
 
																+		}
															
 
																+		while (!task && priolevel-- > 0);
															
 
																 	}
															
 
																+	STARPU_ASSERT_MSG(!task || starpu_worker_can_execute_task(starpu_worker_get_id(), task, 0), "prio does not support \"can_execute\""); /* task stays NULL when nothing was popped */
															
 
																-	PTHREAD_MUTEX_UNLOCK(&data->sched_mutex);
															
 
																+	_STARPU_PTHREAD_MUTEX_UNLOCK(&data->sched_mutex);
															
 
																 	return task;
															
 
																 }
															
 
																-struct starpu_sched_policy_s _starpu_sched_prio_policy = {
															
 
																+struct starpu_sched_policy _starpu_sched_prio_policy =
															
 
																+{
															
 
																 	.init_sched = initialize_eager_center_priority_policy,
															
 
																 	.init_sched_for_workers = initialize_eager_center_priority_policy_for_workers,
															
 
																 	.deinit_sched = deinitialize_eager_center_priority_policy,
															
 
																 	/* we always use priorities in that policy */
															
 
																 	.push_task = _starpu_priority_push_task,
															
 
																 	.pop_task = _starpu_priority_pop_task,
															
 
																+	.pre_exec_hook = NULL,
															
 
																 	.post_exec_hook = NULL,
															
 
																 	.pop_every_task = NULL,
															
 
																 	.policy_name = "prio",
															
--- a/src/sched_policies/fifo_queues.c
+++ b/src/sched_policies/fifo_queues.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010-2011  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  * Copyright (C) 2011  Télécom-SudParis
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -25,10 +25,10 @@
 
																 #include <core/task.h>
															
 
																 #include <core/workers.h>
															
 
																-struct starpu_fifo_taskq_s *_starpu_create_fifo(void)
															
 
																+struct _starpu_fifo_taskq *_starpu_create_fifo(void)
															
 
																 {
															
 
																-	struct starpu_fifo_taskq_s *fifo;
															
 
																-	fifo = (struct starpu_fifo_taskq_s *) malloc(sizeof(struct starpu_fifo_taskq_s));
															
 
																+	struct _starpu_fifo_taskq *fifo;
															
 
																+	fifo = (struct _starpu_fifo_taskq *) malloc(sizeof(struct _starpu_fifo_taskq));
															
 
																 	/* note that not all mechanisms (eg. the semaphore) have to be used */
															
 
																 	starpu_task_list_init(&fifo->taskq);
															
@@ -42,65 +42,73 @@ struct starpu_fifo_taskq_s *_starpu_create_fifo(void)
 
																 	return fifo;
															
 
																 }
															
 
																-void _starpu_destroy_fifo(struct starpu_fifo_taskq_s *fifo)
															
 
																+void _starpu_destroy_fifo(struct _starpu_fifo_taskq *fifo)
															
 
																 {
															
 
																 	free(fifo);
															
 
																 }
															
 
																-/* TODO: revert front/back? */
															
 
																+int _starpu_fifo_empty(struct _starpu_fifo_taskq *fifo)
															
 
																+{
															
 
																+	return fifo->ntasks == 0;
															
 
																+}
															
 
																-int _starpu_fifo_push_task(struct starpu_fifo_taskq_s *fifo_queue, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task)
															
 
																+/* TODO: revert front/back? */
															
 
																+int _starpu_fifo_push_task(struct _starpu_fifo_taskq *fifo_queue, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task)
															
 
																 {
															
 
																-	PTHREAD_MUTEX_LOCK(sched_mutex);
															
 
																+	_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
															
 
																-	STARPU_TRACE_JOB_PUSH(task, 0);
															
 
																+	_STARPU_TRACE_JOB_PUSH(task, 0);
															
 
																 	/* TODO: if prio, put at back */
															
 
																 	starpu_task_list_push_front(&fifo_queue->taskq, task);
															
 
																 	fifo_queue->ntasks++;
															
 
																 	fifo_queue->nprocessed++;
															
 
																-	PTHREAD_COND_SIGNAL(sched_cond);
															
 
																-	PTHREAD_MUTEX_UNLOCK(sched_mutex);
															
 
																+	_STARPU_PTHREAD_COND_SIGNAL(sched_cond);
															
 
																+	_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
															
 
																 	return 0;
															
 
																 }
															
 
																-struct starpu_task *_starpu_fifo_pop_task(struct starpu_fifo_taskq_s *fifo_queue, int workerid __attribute__ ((unused)))
															
 
																+struct starpu_task *_starpu_fifo_pop_task(struct _starpu_fifo_taskq *fifo_queue, int workerid)
															
 
																 {
															
 
																-	struct starpu_task *task = NULL;
															
 
																+	struct starpu_task *task;
															
 
																-	if (fifo_queue->ntasks == 0)
															
 
																-		return NULL;
															
 
																-
															
 
																-	/* TODO: find a task that suits workerid */
															
 
																-	if (fifo_queue->ntasks > 0) 
															
 
																+	for (task  = starpu_task_list_begin(&fifo_queue->taskq);
															
 
																+	     task != starpu_task_list_end(&fifo_queue->taskq);
															
 
																+	     task  = starpu_task_list_next(task))
															
 
																 	{
															
 
																-		/* there is a task */
															
 
																-		task = starpu_task_list_pop_back(&fifo_queue->taskq);
															
 
																-	
															
 
																+		unsigned nimpl;
															
 
																 		STARPU_ASSERT(task);
															
 
																-		fifo_queue->ntasks--;
															
 
																-		
															
 
																-		STARPU_TRACE_JOB_POP(task, 0);
															
 
																+
															
 
																+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
															
 
																+			if (starpu_worker_can_execute_task(workerid, task, nimpl))
															
 
																+			{
															
 
																+				_starpu_get_job_associated_to_task(task)->nimpl = nimpl;
															
 
																+				starpu_task_list_erase(&fifo_queue->taskq, task);
															
 
																+				fifo_queue->ntasks--;
															
 
																+				_STARPU_TRACE_JOB_POP(task, 0);
															
 
																+				return task;
															
 
																+			}
															
 
																 	}
															
 
																-	
															
 
																-	return task;
															
 
																+
															
 
																+	return NULL;
															
 
																 }
															
 
																 /* pop every task that can be executed on the calling driver */
															
 
																-struct starpu_task *_starpu_fifo_pop_every_task(struct starpu_fifo_taskq_s *fifo_queue, pthread_mutex_t *sched_mutex, int workerid)
															
 
																+struct starpu_task *_starpu_fifo_pop_every_task(struct _starpu_fifo_taskq *fifo_queue, pthread_mutex_t *sched_mutex, int workerid)
															
 
																 {
															
 
																 	struct starpu_task_list *old_list;
															
 
																 	unsigned size;
															
 
																 	struct starpu_task *new_list = NULL;
															
 
																 	struct starpu_task *new_list_tail = NULL;
															
 
																-	
															
 
																-	PTHREAD_MUTEX_LOCK(sched_mutex);
															
 
																+
															
 
																+	_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
															
 
																 	size = fifo_queue->ntasks;
															
 
																-	if (size > 0) {
															
 
																+	if (size > 0)
															
 
																+	{
															
 
																 		old_list = &fifo_queue->taskq;
															
 
																 		unsigned new_list_size = 0;
															
@@ -110,13 +118,15 @@ struct starpu_task *_starpu_fifo_pop_every_task(struct starpu_fifo_taskq_s *fifo
 
																 		task = starpu_task_list_front(old_list);
															
 
																 		while (task)
															
 
																 		{
															
 
																+			unsigned nimpl;
															
 
																 			next_task = task->next;
															
 
																-			if (starpu_worker_may_execute_task(workerid, task, 0))
															
 
																+			for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
															
 
																+			if (starpu_worker_can_execute_task(workerid, task, nimpl))
															
 
																 			{
															
 
																 				/* this elements can be moved into the new list */
															
 
																 				new_list_size++;
															
 
																-				
															
 
																+
															
 
																 				starpu_task_list_erase(old_list, task);
															
 
																 				if (new_list_tail)
															
@@ -126,21 +136,24 @@ struct starpu_task *_starpu_fifo_pop_every_task(struct starpu_fifo_taskq_s *fifo
 
																 					task->next = NULL;
															
 
																 					new_list_tail = task;
															
 
																 				}
															
 
																-				else {
															
 
																+				else
															
 
																+				{
															
 
																 					new_list = task;
															
 
																 					new_list_tail = task;
															
 
																 					task->prev = NULL;
															
 
																 					task->next = NULL;
															
 
																 				}
															
 
																+				_starpu_get_job_associated_to_task(task)->nimpl = nimpl;
															
 
																+				break;
															
 
																 			}
															
 
																-		
															
 
																+
															
 
																 			task = next_task;
															
 
																 		}
															
 
																 		fifo_queue->ntasks -= new_list_size;
															
 
																 	}
															
 
																-	PTHREAD_MUTEX_UNLOCK(sched_mutex);
															
 
																+	_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
															
 
																 	return new_list;
															
 
																 }
															
--- a/src/sched_policies/fifo_queues.h
+++ b/src/sched_policies/fifo_queues.h
@@ -22,7 +22,8 @@
 
																 #include <starpu.h>
															
 
																 #include <common/config.h>
															
 
																-struct starpu_fifo_taskq_s {
															
 
																+struct _starpu_fifo_taskq
															
 
																+{
															
 
																 	/* the actual list */
															
 
																 	struct starpu_task_list taskq;
															
@@ -38,12 +39,14 @@ struct starpu_fifo_taskq_s {
 
																 	double exp_len; /* Expected duration of the set of tasks in the queue */
															
 
																 };
															
 
																-struct starpu_fifo_taskq_s*_starpu_create_fifo(void);
															
 
																-void _starpu_destroy_fifo(struct starpu_fifo_taskq_s *fifo);
															
 
																+struct _starpu_fifo_taskq*_starpu_create_fifo(void);
															
 
																+void _starpu_destroy_fifo(struct _starpu_fifo_taskq *fifo);
															
 
																-int _starpu_fifo_push_task(struct starpu_fifo_taskq_s *fifo, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task);
															
 
																+int _starpu_fifo_empty(struct _starpu_fifo_taskq *fifo);
															
 
																-struct starpu_task *_starpu_fifo_pop_task(struct starpu_fifo_taskq_s *fifo, int workerid);
															
 
																-struct starpu_task *_starpu_fifo_pop_every_task(struct starpu_fifo_taskq_s *fifo, pthread_mutex_t *sched_mutex, int workerid);
															
 
																+int _starpu_fifo_push_task(struct _starpu_fifo_taskq *fifo, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task);
															
 
																+
															
 
																+struct starpu_task *_starpu_fifo_pop_task(struct _starpu_fifo_taskq *fifo, int workerid);
															
 
																+struct starpu_task *_starpu_fifo_pop_every_task(struct _starpu_fifo_taskq *fifo, pthread_mutex_t *sched_mutex, int workerid);
															
 
																 #endif // __FIFO_QUEUES_H__
															
--- a/src/sched_policies/heft.c
+++ b/src/sched_policies/heft.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010, 2011  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011-2012  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
															
 
																  * Copyright (C) 2011  Télécom-SudParis
															
 
																  * Copyright (C) 2011  INRIA
															
 
																  *
															
@@ -23,10 +23,25 @@
 
																 #include <core/workers.h>
															
 
																 #include <core/perfmodel/perfmodel.h>
															
 
																+#include <core/task_bundle.h>
															
 
																+#include <core/workers.h>
															
 
																 #include <starpu_parameters.h>
															
 
																 #include <starpu_task_bundle.h>
															
 
																 #include <starpu_top.h>
															
 
																+#ifndef DBL_MIN
															
 
																+#define DBL_MIN __DBL_MIN__
															
 
																+#endif
															
 
																+
															
 
																+#ifndef DBL_MAX
															
 
																+#define DBL_MAX __DBL_MAX__
															
 
																+#endif
															
 
																+
															
 
																+static double exp_start[STARPU_NMAXWORKERS]; /* of the first queued task */
															
 
																+static double exp_end[STARPU_NMAXWORKERS];   /* of the set of queued tasks */
															
 
																+static double exp_len[STARPU_NMAXWORKERS];   /* of the last queued task */
															
 
																+static double ntasks[STARPU_NMAXWORKERS];
															
 
																+
															
 
																 typedef struct {
															
 
																 	double alpha;
															
 
																 	double beta;
															
@@ -34,12 +49,6 @@ typedef struct {
 
																 	double idle_power;
															
 
																 } heft_data;
															
 
																-static double exp_start[STARPU_NMAXWORKERS];	/* of the first queued task */
															
 
																-static double exp_end[STARPU_NMAXWORKERS];	/* of the set of queued tasks */
															
 
																-static double exp_len[STARPU_NMAXWORKERS];	/* of the last queued task */
															
 
																-static double ntasks[STARPU_NMAXWORKERS];
															
 
																-
															
 
																-
															
 
																 const float alpha_minimum=0;
															
 
																 const float alpha_maximum=10.0;
															
 
																 const float beta_minimum=0;
															
@@ -49,7 +58,8 @@ const float gamma_maximum=10000.0;
 
																 const float idle_power_minimum=0;
															
 
																 const float idle_power_maximum=10000.0;
															
 
																-void param_modified(struct starputop_param_t* d){
															
 
																+static void param_modified(struct starpu_top_param* d)
															
 
																+{
															
 
																 	//just to show parameter modification
															
 
																 	fprintf(stderr,"%s has been modified : %f !\n", d->name, d->value);
															
 
																 }
															
@@ -125,13 +135,16 @@ static void heft_init(unsigned sched_ctx_id)
 
																 	starputop_register_parameter_float("HEFT_IDLE_POWER", &hd->idle_power, idle_power_minimum,idle_power_maximum,param_modified);
															
 
																 }
															
 
																-static void heft_post_exec_hook(struct starpu_task *task)
															
 
																+
															
 
																+/* heft_pre_exec_hook is called right after the data transfer is done and right before
															
 
																+ * the computation begins; it is useful to update more precisely the values
															
 
																+ * of the expected start, end, length, etc... */
															
 
																+static void heft_pre_exec_hook(struct starpu_task *task)
															
 
																 {
															
 
																 	unsigned sched_ctx_id = task->sched_ctx;
															
 
																 	int workerid = starpu_worker_get_id();
															
 
																-	STARPU_ASSERT(workerid >= 0);
															
 
																-
															
 
																 	double model = task->predicted;
															
 
																+	double transfer_model = task->predicted_transfer;
															
 
																 	pthread_mutex_t *sched_mutex;
															
 
																 	pthread_cond_t *sched_cond;
															
@@ -144,18 +157,21 @@ static void heft_post_exec_hook(struct starpu_task *task)
 
																 		sched_cond = &workerarg->sched_cond;
															
 
																 		starpu_worker_set_sched_condition(sched_ctx_id, workerid, sched_mutex, sched_cond);
															
 
																 	}
															
 
																-#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
															
 
																-	starpu_call_poped_task_cb(workerid, sched_ctx_id, task->flops);
															
 
																-#endif //STARPU_USE_SCHED_CTX_HYPERVISOR
															
 
																-
															
 
																-	/* Once we have executed the task, we can update the predicted amount
															
 
																+	/* Once the task is executing, we can update the predicted amount
															
 
																 	 * of work. */
															
 
																-	PTHREAD_MUTEX_LOCK(sched_mutex);
															
 
																-	exp_len[workerid] -= model;
															
 
																+	_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
															
 
																+	exp_len[workerid] -= model + transfer_model;
															
 
																 	exp_start[workerid] = starpu_timing_now() + model;
															
 
																 	exp_end[workerid] = exp_start[workerid] + exp_len[workerid];
															
 
																 	ntasks[workerid]--;
															
 
																-	PTHREAD_MUTEX_UNLOCK(sched_mutex);
															
 
																+	_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
															
 
																+}
															
 
																+
															
 
																+static void heft_post_exec_hook(struct starpu_task *task)
															
 
																+{
															
 
																+#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
															
 
																+	starpu_call_poped_task_cb(workerid, sched_ctx_id, task->flops);
															
 
																+#endif //STARPU_USE_SCHED_CTX_HYPERVISOR
															
 
																 }
															
 
																 static void heft_push_task_notify(struct starpu_task *task, int workerid)
															
@@ -163,10 +179,12 @@ static void heft_push_task_notify(struct starpu_task *task, int workerid)
 
																 	unsigned sched_ctx_id = task->sched_ctx;
															
 
																 	/* Compute the expected penality */
															
 
																 	enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(workerid);
															
 
																+	unsigned memory_node = starpu_worker_get_memory_node(workerid);
															
 
																 	double predicted = starpu_task_expected_length(task, perf_arch,
															
 
																 			_starpu_get_job_associated_to_task(task)->nimpl);
															
 
																+	double predicted_transfer = starpu_task_expected_data_transfer_time(memory_node, task);
															
 
																 	pthread_mutex_t *sched_mutex;
															
 
																 	pthread_cond_t *sched_cond;
															
 
																 	starpu_worker_get_sched_condition(sched_ctx_id, workerid, &sched_mutex, &sched_cond);
															
@@ -184,25 +202,45 @@ static void heft_push_task_notify(struct starpu_task *task, int workerid)
 
																 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
															
 
																 	/* Update the predictions */
															
 
																-	PTHREAD_MUTEX_LOCK(sched_mutex);
															
 
																+	_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
															
 
																 	/* Sometimes workers didn't take the tasks as early as we expected */
															
 
																 	exp_start[workerid] = STARPU_MAX(exp_start[workerid], starpu_timing_now());
															
 
																-	exp_end[workerid] = STARPU_MAX(exp_start[workerid], starpu_timing_now());
															
 
																+	exp_end[workerid] = exp_start[workerid] + exp_len[workerid];
															
 
																 	/* If there is no prediction available, we consider the task has a null length */
															
 
																-	if (predicted != -1.0)
															
 
																+	if (!isnan(predicted))
															
 
																 	{
															
 
																 		task->predicted = predicted;
															
 
																 		exp_end[workerid] += predicted;
															
 
																 		exp_len[workerid] += predicted;
															
 
																 	}
															
 
																+	/* If there is no prediction available, we consider the task has a null length */
															
 
																+	if (!isnan(predicted_transfer))
															
 
																+	{
															
 
																+		if (starpu_timing_now() + predicted_transfer < exp_end[workerid])
															
 
																+		{
															
 
																+			/* We may hope that the transfer will be finished by
															
 
																+			 * the start of the task. */
															
 
																+			predicted_transfer = 0;
															
 
																+		}
															
 
																+		else
															
 
																+		{
															
 
																+			/* The transfer will not be finished by then, take the
															
 
																+			 * remainder into account */
															
 
																+			predicted_transfer = (starpu_timing_now() + predicted_transfer) - exp_end[workerid];
															
 
																+		}
															
 
																+		task->predicted_transfer = predicted_transfer;
															
 
																+		exp_end[workerid] += predicted_transfer;
															
 
																+		exp_len[workerid] += predicted_transfer;
															
 
																+	}
															
 
																+
															
 
																 	ntasks[workerid]++;
															
 
																-	PTHREAD_MUTEX_UNLOCK(sched_mutex);
															
 
																+	_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
															
 
																 }
															
 
																-static int push_task_on_best_worker(struct starpu_task *task, int best_workerid, double predicted, int prio, unsigned sched_ctx_id)
															
 
																+static int push_task_on_best_worker(struct starpu_task *task, int best_workerid, double predicted, double predicted_transfer, int prio, unsigned sched_ctx_id)
															
 
																  {
															
 
																 	/* make sure someone coule execute that task ! */
															
 
																 	STARPU_ASSERT(best_workerid != -1);
															
@@ -225,16 +263,38 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
																 	starpu_call_pushed_task_cb(best_workerid, sched_ctx_id);
															
 
																 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
															
 
																-	PTHREAD_MUTEX_LOCK(sched_mutex);
															
 
																+	_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
															
 
																+
															
 
																+	/* Sometimes workers didn't take the tasks as early as we expected */
															
 
																+	exp_start[best_workerid] = STARPU_MAX(exp_start[best_workerid], starpu_timing_now());
															
 
																+	exp_end[best_workerid] = exp_start[best_workerid] + exp_len[best_workerid];
															
 
																+
															
 
																 	exp_end[best_workerid] += predicted;
															
 
																 	exp_len[best_workerid] += predicted;
															
 
																+
															
 
																+	if (starpu_timing_now() + predicted_transfer < exp_end[best_workerid])
															
 
																+	{
															
 
																+		/* We may hope that the transfer will be finished by
															
 
																+		 * the start of the task. */
															
 
																+		predicted_transfer = 0;
															
 
																+	}
															
 
																+	else
															
 
																+	{
															
 
																+		/* The transfer will not be finished by then, take the
															
 
																+		 * remainder into account */
															
 
																+		predicted_transfer = (starpu_timing_now() + predicted_transfer) - exp_end[best_workerid];
															
 
																+	}
															
 
																+	exp_end[best_workerid] += predicted_transfer;
															
 
																+	exp_len[best_workerid] += predicted_transfer;
															
 
																+
															
 
																 	ntasks[best_workerid]++;
															
 
																-	PTHREAD_MUTEX_UNLOCK(sched_mutex);
															
 
																+	_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
															
 
																 	task->predicted = predicted;
															
 
																+	task->predicted_transfer = predicted_transfer;
															
 
																-	if (starpu_top_status_get())
															
 
																-		starputop_task_prevision(task, best_workerid, 
															
 
																+	if (_starpu_top_status_get())
															
 
																+		_starpu_top_task_prevision(task, best_workerid,
															
 
																 					(unsigned long long)(exp_end[best_workerid]-predicted)/1000,
															
 
																 					(unsigned long long)exp_end[best_workerid]/1000);
															
@@ -244,29 +304,32 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
																 		starpu_prefetch_task_input_on_node(task, memory_node);
															
 
																 	}
															
 
																+
															
 
																+	//_STARPU_DEBUG("Heft : pushing local task\n");
															
 
																 	return starpu_push_local_task(best_workerid, task, prio);
															
 
																 }
															
 
																+/* TODO: factorize with dmda!! */
															
 
																 static void compute_all_performance_predictions(struct starpu_task *task,
															
 
																 					double *local_task_length, double *exp_end,
															
 
																 					double *max_exp_endp, double *best_exp_endp,
															
 
																 					double *local_data_penalty,
															
 
																-					double *local_power, int *forced_best,
															
 
																-					struct starpu_task_bundle *bundle,
															
 
																+					double *local_power, 
															
 
																+					int *forced_worker, int *forced_impl,
															
 
																+					starpu_task_bundle_t bundle,
															
 
																 					unsigned sched_ctx_id)
															
 
																 {
															
 
																 	int calibrating = 0;
															
 
																 	double max_exp_end = DBL_MIN;
															
 
																 	double best_exp_end = DBL_MAX;
															
 
																 	int ntasks_best = -1;
															
 
																+	int nimpl_best = 0;
															
 
																 	double ntasks_best_end = 0.0;
															
 
																-	
															
 
																+
															
 
																 	/* A priori, we know all estimations */
															
 
																 	int unknown = 0;
															
 
																-	
															
 
																-	unsigned nimpl;
															
 
																-	unsigned best_impl = 0;
															
 
																 	unsigned worker, worker_ctx = 0;
															
 
																+	unsigned nimpl;
															
 
																 	struct worker_collection *workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx_id);
															
@@ -277,101 +340,142 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 
																 		{
															
 
																 			/* Sometimes workers didn't take the tasks as early as we expected */
															
 
																 			exp_start[worker] = STARPU_MAX(exp_start[worker], starpu_timing_now());
															
 
																-			exp_end[worker_ctx] = exp_start[worker] + exp_len[worker];
															
 
																-			if (exp_end[worker_ctx] > max_exp_end)
															
 
																- 				max_exp_end = exp_end[worker_ctx];
															
 
																+			exp_end[worker_ctx][nimpl] = exp_start[worker] + exp_len[worker];
															
 
																+			if (exp_end[worker_ctx][nimpl] > max_exp_end)
															
 
																+ 				max_exp_end = exp_end[worker_ctx][nimpl];
															
 
																-			if (!starpu_worker_may_execute_task(worker, task, nimpl))
															
 
																+			if (!starpu_worker_can_execute_task(worker, task, nimpl))
															
 
																 			{
															
 
																 				/* no one on that queue may execute this task */
															
 
																 				continue;
															
 
																 			}
															
 
																-			
															
 
																+
															
 
																 			enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
															
 
																 			unsigned memory_node = starpu_worker_get_memory_node(worker);
															
 
																-			
															
 
																+
															
 
																 			if (bundle)
															
 
																 			{
															
 
																-				local_task_length[worker_ctx] = starpu_task_bundle_expected_length(bundle, perf_arch, nimpl);
															
 
																-				local_data_penalty[worker_ctx] = starpu_task_bundle_expected_data_transfer_time(bundle, memory_node);
															
 
																-				local_power[worker_ctx] = starpu_task_bundle_expected_power(bundle, perf_arch, nimpl);
															
 
																+				/* TODO : conversion time */
															
 
																+				local_task_length[worker_ctx][nimpl] = starpu_task_bundle_expected_length(bundle, perf_arch, nimpl);
															
 
																+				local_data_penalty[worker_ctx][nimpl] = starpu_task_bundle_expected_data_transfer_time(bundle, memory_node);
															
 
																+				local_power[worker_ctx][nimpl] = starpu_task_bundle_expected_power(bundle, perf_arch, nimpl);
															
 
																 				//_STARPU_DEBUG("Scheduler heft bundle: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker_ctx],local_power[worker_ctx],worker,nimpl);
															
 
																 			}
															
 
																 			else 
															
 
																 			{
															
 
																-				local_task_length[worker_ctx] = starpu_task_expected_length(task, perf_arch, nimpl);
															
 
																-				local_data_penalty[worker_ctx] = starpu_task_expected_data_transfer_time(memory_node, task);
															
 
																-				local_power[worker_ctx] = starpu_task_expected_power(task, perf_arch, nimpl);
															
 
																+				local_task_length[worker_ctx][nimpl] = starpu_task_expected_length(task, perf_arch, nimpl);
															
 
																+				local_data_penalty[worker_ctx][nimpl] = starpu_task_expected_data_transfer_time(memory_node, task);
															
 
																+				local_power[worker_ctx][nimpl] = starpu_task_expected_power(task, perf_arch, nimpl);
															
 
																+				double conversion_time = starpu_task_expected_conversion_time(task, perf_arch, nimpl);
															
 
																+				if (conversion_time > 0.0)
															
 
																+					local_task_length[worker_ctx][nimpl] += conversion_time;
															
 
																 				//_STARPU_DEBUG("Scheduler heft bundle: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker_ctx],local_power[worker_ctx],worker,nimpl);
															
 
																 			}
															
 
																-			
															
 
																+
															
 
																 			double ntasks_end = ntasks[worker] / starpu_worker_get_relative_speedup(perf_arch);
															
 
																-			
															
 
																+
															
 
																 			if (ntasks_best == -1
															
 
																-			    || (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
															
 
																-			    || (!calibrating && local_task_length[worker_ctx] == -1.0) /* Not calibrating but this worker is being calibrated */
															
 
																-			    || (calibrating && local_task_length[worker_ctx] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
															
 
																-				) 
															
 
																+			    || (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better worker */
															
 
																+			    || (!calibrating && isnan(local_task_length[worker_ctx][nimpl])) /* Not calibrating but this worker is being calibrated */
															
 
																+			    || (calibrating && isnan(local_task_length[worker_ctx][nimpl]) && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
															
 
																+				)
															
 
																 			{
															
 
																 				ntasks_best_end = ntasks_end;
															
 
																 				ntasks_best = worker;
															
 
																 			}
															
 
																-			
															
 
																-			if (local_task_length[worker_ctx] == -1.0)
															
 
																+
															
 
																+			if (isnan(local_task_length[worker_ctx][nimpl]))
															
 
																 				/* we are calibrating, we want to speed-up calibration time
															
 
																 				 * so we privilege non-calibrated tasks (but still
															
 
																 				 * greedily distribute them to avoid dumb schedules) */
															
 
																 				calibrating = 1;
															
 
																-			
															
 
																-			if (local_task_length[worker_ctx] <= 0.0)
															
 
																+
															
 
																+			if (isnan(local_task_length[worker_ctx][nimpl])
															
 
																+				|| _STARPU_IS_ZERO(local_task_length[worker_ctx][nimpl]))
															
 
																 				/* there is no prediction available for that task
															
 
																-				 * with that arch yet, so switch to a greedy strategy */
															
 
																+				 * with that arch (yet or at all), so switch to a greedy strategy */
															
 
																 				unknown = 1;
															
 
																-			
															
 
																+
															
 
																 			if (unknown)
															
 
																 				continue;
															
 
																-			exp_end[worker_ctx] = exp_start[worker] + exp_len[worker] + local_task_length[worker_ctx];
															
 
																+			exp_end[worker_ctx][nimpl] = exp_start[worker] + exp_len[worker] + local_task_length[worker_ctx][nimpl];
															
 
																-			if (exp_end[worker_ctx] < best_exp_end)
															
 
																+			if (exp_end[worker_ctx][nimpl] < best_exp_end)
															
 
																 			{
															
 
																 				/* a better solution was found */
															
 
																-				best_exp_end = exp_end[worker_ctx];
															
 
																-				best_impl = nimpl;
															
 
																+				best_exp_end = exp_end[worker_ctx][nimpl];
															
 
																+				nimpl_best = nimpl;
															
 
																 			}
															
 
																-			
															
 
																-			if (local_power[worker_ctx] == -1.0)
															
 
																-				local_power[worker_ctx] = 0.;
															
 
																+
															
 
																+			if (isnan(local_power[worker_ctx][nimpl]))
															
 
																+				local_power[worker_ctx][nimpl] = 0.;
															
 
																+
															
 
																 		}
															
 
																 		worker_ctx++;
															
 
																 	}
															
 
																-	*forced_best = unknown?ntasks_best:-1;
															
 
																+	*forced_worker = unknown?ntasks_best:-1;
															
 
																+	*forced_impl = unknown?nimpl_best:-1;
															
 
																 	*best_exp_endp = best_exp_end;
															
 
																 	*max_exp_endp = max_exp_end;
															
 
																-	
															
 
																-	/* save the best implementation */
															
 
																-	//_STARPU_DEBUG("Scheduler heft: kernel (%u)\n", best_impl);
															
 
																-	_starpu_get_job_associated_to_task(task)->nimpl = best_impl;
															
 
																 }
															
 
																+static int push_conversion_tasks(struct starpu_task *task, unsigned int workerid)
															
 
																+{
															
 
																+	unsigned i;
															
 
																+	int ret;
															
 
																+	unsigned int node = starpu_worker_get_memory_node(workerid);
															
 
																+
															
 
																+	_STARPU_PTHREAD_MUTEX_LOCK(&sched_mutex[workerid]);
															
 
																+	for (i = 0; i < task->cl->nbuffers; i++)
															
 
																+	{
															
 
																+		struct starpu_task *conversion_task;
															
 
																+		starpu_data_handle_t handle;
															
 
																+
															
 
																+		handle = task->handles[i];
															
 
																+		if (!_starpu_handle_needs_conversion_task(handle, node))
															
 
																+			continue;
															
 
																+
															
 
																+		conversion_task = _starpu_create_conversion_task(handle, node);
															
 
																+		conversion_task->execute_on_a_specific_worker = 1;
															
 
																+		conversion_task->workerid = workerid;
															
 
																+		conversion_task->mf_skip = 1;
															
 
																+		ret = _starpu_task_submit_conversion_task(conversion_task, workerid);
															
 
																+		STARPU_ASSERT(ret == 0);
															
 
																+	}
															
 
																+
															
 
																+	for (i = 0; i < task->cl->nbuffers; i++)
															
 
																+		task->handles[i]->mf_node = node;
															
 
																+
															
 
																+	task->execute_on_a_specific_worker = 1;
															
 
																+	task->workerid = workerid;
															
 
																+	task->mf_skip= 1;
															
 
																+	_STARPU_PTHREAD_MUTEX_UNLOCK(&sched_mutex[workerid]);
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																+
															
 
																 static int _heft_push_task(struct starpu_task *task, unsigned prio, unsigned sched_ctx_id)
															
 
																 {
															
 
																 	heft_data *hd = (heft_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
															
 
																-	unsigned worker, worker_ctx = 0;
															
 
																+	unsigned worker, nimpl, worker_ctx = 0;
															
 
																 	int best = -1, best_id_ctx = -1;
															
 
																-	
															
 
																+	int selected_impl= -1;
															
 
																+
															
 
																 	/* this flag is set if the corresponding worker is selected because
															
 
																 	   there is no performance prediction available yet */
															
 
																-	int forced_best;
															
 
																+	int forced_worker;
															
 
																+	int forced_impl;
															
 
																 	struct worker_collection *workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx_id);
															
 
																 	unsigned nworkers_ctx = workers->nworkers;
															
 
																-	double local_task_length[nworkers_ctx];
															
 
																-	double local_data_penalty[nworkers_ctx];
															
 
																-	double local_power[nworkers_ctx];
															
 
																-	double exp_end[nworkers_ctx];
															
 
																+	double local_task_length[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
															
 
																+	double local_data_penalty[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
															
 
																+	double local_power[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
															
 
																+	double exp_end[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
															
 
																 	double max_exp_end = 0.0;
															
 
																 	double best_exp_end;
															
@@ -381,93 +485,116 @@ static int _heft_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 
																 	 *	and detect if there is some calibration that needs to be done.
															
 
																 	 */
															
 
																-	struct starpu_task_bundle *bundle = task->bundle;
															
 
																-
															
 
																-	if(workers->init_cursor)
															
 
																-		workers->init_cursor(workers);
															
 
																+	starpu_task_bundle_t bundle = task->bundle;
															
 
																 	compute_all_performance_predictions(task, local_task_length, exp_end,
															
 
																-					    &max_exp_end, &best_exp_end,
															
 
																-					    local_data_penalty,
															
 
																-					    local_power, &forced_best, bundle, sched_ctx_id);
															
 
																+					&max_exp_end, &best_exp_end,
															
 
																+					local_data_penalty,
															
 
																+					local_power, &forced_worker, &forced_impl,
															
 
																+					bundle, sched_ctx_id);
															
 
																+
															
 
																 	/* If there is no prediction available for that task with that arch we
															
 
																 	 * want to speed-up calibration time so we force this measurement */
															
 
																-	if (forced_best != -1){
															
 
																-		return push_task_on_best_worker(task, forced_best, 0.0, prio, sched_ctx_id);
															
 
																+	if (forced_worker != -1)
															
 
																+	{
															
 
																+		_starpu_get_job_associated_to_task(task)->nimpl = forced_impl;
															
 
																+
															
 
																+		if (_starpu_task_uses_multiformat_handles(task) && !task->mf_skip)
															
 
																+		{
															
 
																+			/*
															
 
																+			 * Our task uses multiformat handles, which may need to be converted.
															
 
																+			 */
															
 
																+			push_conversion_tasks(task, forced_worker);
															
 
																+			prio = 0;
															
 
																+		}
															
 
																+
															
 
																+		return push_task_on_best_worker(task, forced_worker, 0.0, 0.0, prio, sched_ctx_id;
															
 
																 	}
															
 
																-	
															
 
																+
															
 
																 	/*
															
 
																 	 *	Determine which worker optimizes the fitness metric which is a
															
 
																 	 *	trade-off between load-balacing, data locality, and energy
															
 
																 	 *	consumption.
															
 
																 	 */
															
 
																-	double fitness[nworkers_ctx];
															
 
																+	double fitness[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
															
 
																 	double best_fitness = -1;
															
 
																 	while(workers->has_next(workers))
															
 
																 	{
															
 
																 		worker = workers->get_next(workers);
															
 
																-		if (!starpu_worker_may_execute_task(worker, task, 0))
															
 
																+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
															
 
																 		{
															
 
																-		        worker_ctx++;
															
 
																-			/* no one on that queue may execute this task */
															
 
																-			continue;
															
 
																-		}
															
 
																+			if (!starpu_worker_can_execute_task(worker, task, nimpl))
															
 
																+			{
															
 
																+				worker_ctx++;
															
 
																+				/* no one on that queue may execute this task */
															
 
																+				continue;
															
 
																+			}
															
 
																-		fitness[worker_ctx] = hd->alpha*(exp_end[worker_ctx] - best_exp_end) 
															
 
																-				+ hd->beta*(local_data_penalty[worker_ctx])
															
 
																-				+ hd->_gamma*(local_power[worker_ctx]);
															
 
																-		if (exp_end[worker_ctx] > max_exp_end)
															
 
																+			fitness[worker_ctx][nimpl] = hd->alpha*(exp_end[worker_ctx][nimpl] - best_exp_end) 
															
 
																+						+ hd->beta*(local_data_penalty[worker_ctx][nimpl])
															
 
																+						+ hd->_gamma*(local_power[worker_ctx][nimpl]);
															
 
																+
															
 
																+		if (exp_end[worker_ctx][nimpl] > max_exp_end)
															
 
																 			/* This placement will make the computation
															
 
																 			 * longer, take into account the idle
															
 
																 			 * consumption of other cpus */
															
 
																-			fitness[worker_ctx] += hd->_gamma * hd->idle_power * (exp_end[worker_ctx] - max_exp_end) / 1000000.0;
															
 
																+			fitness[worker_ctx][nimpl] += hd->_gamma * hd->idle_power * (exp_end[worker_ctx][nimpl] - max_exp_end) / 1000000.0;
															
 
																-		if (best == -1 || fitness[worker_ctx] < best_fitness)
															
 
																+		if (best == -1 || fitness[worker_ctx][nimpl] < best_fitness)
															
 
																 		{
															
 
																 			/* we found a better solution */
															
 
																-			best_fitness = fitness[worker_ctx];
															
 
																+			best_fitness = fitness[worker_ctx][nimpl];
															
 
																 			best = worker;
															
 
																 			best_id_ctx = worker_ctx;
															
 
																+			selected_impl = nimpl;
															
 
																 		}
															
 
																 		worker_ctx++;
															
 
																 	}
															
 
																 	/* By now, we must have found a solution */
															
 
																 	STARPU_ASSERT(best != -1);
															
 
																-	
															
 
																+
															
 
																 	/* we should now have the best worker in variable "best" */
															
 
																-	double model_best;
															
 
																+	double model_best, transfer_model_best;
															
 
																 	if (bundle)
															
 
																 	{
															
 
																 		/* If we have a task bundle, we have computed the expected
															
 
																 		 * length for the entire bundle, but not for the task alone. */
															
 
																 		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(best);
															
 
																-		model_best = starpu_task_expected_length(task, perf_arch,
															
 
																-				_starpu_get_job_associated_to_task(task)->nimpl);
															
 
																+		unsigned memory_node = starpu_worker_get_memory_node(best);
															
 
																+		model_best = starpu_task_expected_length(task, perf_arch, selected_impl);
															
 
																+		transfer_model_best = starpu_task_expected_data_transfer_time(memory_node, task);
															
 
																 		/* Remove the task from the bundle since we have made a
															
 
																 		 * decision for it, and that other tasks should not consider it
															
 
																 		 * anymore. */
															
 
																-		PTHREAD_MUTEX_LOCK(&bundle->mutex);
															
 
																-		int ret = starpu_task_bundle_remove(bundle, task);
															
 
																-		
															
 
																-		/* Perhaps the bundle was destroyed when removing the last
															
 
																-		 * entry */
															
 
																-		if (ret != 1)
															
 
																-			PTHREAD_MUTEX_UNLOCK(&bundle->mutex);
															
 
																+		starpu_task_bundle_remove(bundle, task);
															
 
																 	}
															
 
																 	else 
															
 
																 	{
															
 
																-		model_best = local_task_length[best_id_ctx];
															
 
																+		model_best = local_task_length[best_id_ctx][selected_impl];
															
 
																+		transfer_model_best = local_data_penalty[best_id_ctx][selected_impl];
															
 
																 	}
															
 
																 	if(workers->init_cursor)
															
 
																 		workers->deinit_cursor(workers);
															
 
																-	return push_task_on_best_worker(task, best, model_best, prio, sched_ctx_id);
															
 
																+
															
 
																+	_starpu_get_job_associated_to_task(task)->nimpl = selected_impl;
															
 
																+
															
 
																+	if (_starpu_task_uses_multiformat_handles(task) && !task->mf_skip)
															
 
																+	{
															
 
																+		/*
															
 
																+		 * Our task uses multiformat handles, which may need to be converted.
															
 
																+		 */
															
 
																+		push_conversion_tasks(task, forced_worker);
															
 
																+		prio = 0;
															
 
																+	}
															
 
																+
															
 
																+	return push_task_on_best_worker(task, best, model_best, transfer_model_best, prio, sched_ctx_id);
															
 
																 }
															
 
																 static int heft_push_task(struct starpu_task *task)
															
@@ -510,13 +637,14 @@ static void heft_deinit(unsigned sched_ctx_id)
 
																 	free(ht);
															
 
																 }
															
 
																-struct starpu_sched_policy_s heft_policy = {
															
 
																+struct starpu_sched_policy heft_policy = {
															
 
																 	.init_sched = heft_init,
															
 
																 	.deinit_sched = heft_deinit,
															
 
																-	.push_task = heft_push_task, 
															
 
																+	.push_task = heft_push_task,
															
 
																 	.push_task_notify = heft_push_task_notify,
															
 
																 	.pop_task = NULL,
															
 
																 	.pop_every_task = NULL,
															
 
																+	.pre_exec_hook = heft_pre_exec_hook,
															
 
																 	.post_exec_hook = heft_post_exec_hook,
															
 
																 	.add_workers = heft_add_workers	,
															
 
																 	.remove_workers = heft_remove_workers,
															
--- a/src/sched_policies/parallel_greedy.c
+++ b/src/sched_policies/parallel_greedy.c
@@ -19,10 +19,11 @@
 
																 #include <core/workers.h>
															
 
																 #include <sched_policies/fifo_queues.h>
															
 
																 #include <common/barrier.h>
															
 
																+#include <sched_policies/detect_combined_workers.h>
															
 
																 typedef struct pgreedy_data {
															
 
																-	struct starpu_fifo_taskq_s *fifo;
															
 
																-	struct starpu_fifo_taskq_s *local_fifo[STARPU_NMAXWORKERS];
															
 
																+	struct _starpu_fifo_taskq *fifo;
															
 
																+	struct _starpu_fifo_taskq *local_fifo[STARPU_NMAXWORKERS];
															
 
																 	int master_id[STARPU_NMAXWORKERS];
															
@@ -60,7 +61,7 @@ static void initialize_pgreedy_policy(unsigned sched_ctx_id)
 
																 	for (workerid_ctx = 0; workerid_ctx < nworkers_ctx; workerid_ctx++)
															
 
																 	{
															
 
																-    	        workerid = sched_ctx->workerids[workerid_ctx];
															
 
																+		workerid = sched_ctx->workerids[workerid_ctx];
															
 
																 		int cnt = possible_combinations_cnt[workerid]++;
															
 
																 		possible_combinations[workerid][cnt] = workerid;
															
@@ -94,15 +95,15 @@ static void initialize_pgreedy_policy(unsigned sched_ctx_id)
 
																 		}
															
 
																 	}
															
 
																-	PTHREAD_MUTEX_INIT(&data->sched_mutex, NULL);
															
 
																-	PTHREAD_COND_INIT(&data->sched_cond, NULL);
															
 
																+	_STARPU_PTHREAD_MUTEX_INIT(&data->sched_mutex, NULL);
															
 
																+	_STARPU_PTHREAD_COND_INIT(&data->sched_cond, NULL);
															
 
																 	for (workerid_ctx = 0; workerid_ctx < nworkers_ctx; workerid_ctx++)
															
 
																 	{
															
 
																 		workerid = sched_ctx->workerids[workerid_ctx];
															
 
																-		PTHREAD_MUTEX_INIT(sched_ctx->sched_mutex[workerid], NULL);
															
 
																-		PTHREAD_COND_INIT(sched_ctx->sched_cond[workerid], NULL);
															
 
																+		_STARPU_PTHREAD_MUTEX_INIT(sched_ctx->sched_mutex[workerid], NULL);
															
 
																+		_STARPU_PTHREAD_COND_INIT(sched_ctx->sched_cond[workerid], NULL);
															
 
																 	}
															
 
																 	for (workerid_ctx = 0; workerid_ctx < nworkers_ctx; workerid_ctx++)
															
@@ -204,20 +205,20 @@ static struct starpu_task *pop_task_pgreedy_policy(unsigned sched_ctx_id)
 
																 			if (possible_combinations_size[workerid][i] > best_size)
															
 
																 			{
															
 
																 				int combined_worker = possible_combinations[workerid][i];
															
 
																-				if (starpu_combined_worker_may_execute_task(combined_worker, task, 0))
															
 
																+				if (starpu_combined_worker_can_execute_task(combined_worker, task, 0))
															
 
																 				{
															
 
																 					best_size = possible_combinations_size[workerid][i];
															
 
																 					best_workerid = combined_worker;
															
 
																 				}
															
 
																 			}
															
 
																-		} 
															
 
																+		}
															
 
																 		/* In case nobody can execute this task, we let the master
															
 
																 		 * worker take it anyway, so that it can discard it afterward.
															
 
																 		 * */
															
 
																 		if (best_workerid == -1)
															
 
																 			return task;
															
 
																-		
															
 
																+
															
 
																 		/* Is this a basic worker or a combined worker ? */
															
 
																 		int nbasic_workers = (int)starpu_worker_get_count();
															
 
																 		int is_basic_worker = (best_workerid < nbasic_workers);
															
@@ -227,23 +228,24 @@ static struct starpu_task *pop_task_pgreedy_policy(unsigned sched_ctx_id)
 
																 			/* The master is alone */
															
 
																 			return task;
															
 
																 		}
															
 
																-		else {
															
 
																+		else
															
 
																+		{
															
 
																 			/* The master needs to dispatch the task between the
															
 
																 			 * different combined workers */
															
 
																-			struct starpu_combined_worker_s *combined_worker;
															
 
																+			struct _starpu_combined_worker *combined_worker;
															
 
																 			combined_worker = _starpu_get_combined_worker_struct(best_workerid);
															
 
																 			int worker_size = combined_worker->worker_size;
															
 
																 			int *combined_workerid = combined_worker->combined_workerid;
															
 
																-			starpu_job_t j = _starpu_get_job_associated_to_task(task);
															
 
																+			struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
															
 
																 			j->task_size = worker_size;
															
 
																 			j->combined_workerid = best_workerid;
															
 
																 			j->active_task_alias_count = 0;
															
 
																 			//fprintf(stderr, "POP -> size %d best_size %d\n", worker_size, best_size);
															
 
																-			PTHREAD_BARRIER_INIT(&j->before_work_barrier, NULL, worker_size);
															
 
																-			PTHREAD_BARRIER_INIT(&j->after_work_barrier, NULL, worker_size);
															
 
																+			_STARPU_PTHREAD_BARRIER_INIT(&j->before_work_barrier, NULL, worker_size);
															
 
																+			_STARPU_PTHREAD_BARRIER_INIT(&j->after_work_barrier, NULL, worker_size);
															
 
																 			/* Dispatch task aliases to the different slaves */
															
 
																 			for (i = 1; i < worker_size; i++)
															
@@ -261,17 +263,20 @@ static struct starpu_task *pop_task_pgreedy_policy(unsigned sched_ctx_id)
 
																 			return master_alias;
															
 
																 		}
															
 
																 	}
															
 
																-	else {
															
 
																+	else
															
 
																+	{
															
 
																 		/* The worker is a slave */
															
 
																 		return _starpu_fifo_pop_task(data->local_fifo[workerid], workerid);
															
 
																 	}
															
 
																 }
															
 
																-struct starpu_sched_policy_s _starpu_sched_pgreedy_policy = {
															
 
																+struct starpu_sched_policy _starpu_sched_pgreedy_policy =
															
 
																+{
															
 
																 	.init_sched = initialize_pgreedy_policy,
															
 
																 	.deinit_sched = deinitialize_pgreedy_policy,
															
 
																 	.push_task = push_task_pgreedy_policy,
															
 
																 	.pop_task = pop_task_pgreedy_policy,
															
 
																+	.pre_exec_hook = NULL,
															
 
																 	.post_exec_hook = NULL,
															
 
																 	.pop_every_task = NULL,
															
 
																 	.policy_name = "pgreedy",
															
--- a/src/sched_policies/parallel_heft.c
+++ b/src/sched_policies/parallel_heft.c
@@ -1,6 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010-2011  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2012 inria
															
 
																+ * Copyright (C) 2010-2012  Université de Bordeaux 1
															
 
																  * Copyright (C) 2011  Télécom-SudParis
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -23,10 +24,19 @@
 
																 #include <core/perfmodel/perfmodel.h>
															
 
																 #include <starpu_parameters.h>
															
 
																 #include <common/barrier.h>
															
 
																+#include <sched_policies/detect_combined_workers.h>
															
 
																+
															
 
																+#ifndef DBL_MIN
															
 
																+#define DBL_MIN __DBL_MIN__
															
 
																+#endif
															
 
																+
															
 
																+#ifndef DBL_MAX
															
 
																+#define DBL_MAX __DBL_MAX__
															
 
																+#endif
															
 
																 static pthread_mutex_t big_lock;
															
 
																-static unsigned  ncombinedworkers;
															
 
																+static unsigned nworkers, ncombinedworkers;
															
 
																 //static enum starpu_perf_archtype applicable_perf_archtypes[STARPU_NARCH_VARIATIONS];
															
 
																 //static unsigned napplicable_perf_archtypes = 0;
															
@@ -50,17 +60,19 @@ static void parallel_heft_post_exec_hook(struct starpu_task *task, unsigned sche
 
																 	int workerid = starpu_worker_get_id();
															
 
																 	double model = task->predicted;
															
 
																 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
															
 
																-	if (model < 0.0)
															
 
																+	double transfer_model = task->predicted_transfer;
															
 
																+
															
 
																+	if (isnan(model))
															
 
																 		model = 0.0;
															
 
																-	
															
 
																+
															
 
																 	/* Once we have executed the task, we can update the predicted amount
															
 
																 	 * of work. */
															
 
																-	PTHREAD_MUTEX_LOCK(sched_ctx->sched_mutex[workerid]);
															
 
																-	worker_exp_len[workerid] -= model;
															
 
																+	_STARPU_PTHREAD_MUTEX_LOCK(sched_ctx->sched_mutex[workerid]);
															
 
																+	worker_exp_len[workerid] -= model + transfer_model;
															
 
																 	worker_exp_start[workerid] = starpu_timing_now();
															
 
																 	worker_exp_end[workerid] = worker_exp_start[workerid] + worker_exp_len[workerid];
															
 
																 	ntasks[workerid]--;
															
 
																-	PTHREAD_MUTEX_UNLOCK(sched_ctx->sched_mutex[workerid]);
															
 
																+	_STARPU_PTHREAD_MUTEX_UNLOCK(sched_ctx->sched_mutex[workerid]);
															
 
																 }
															
 
																 static int push_task_on_best_worker(struct starpu_task *task, int best_workerid, double exp_end_predicted, int prio, struct starpu_sched_ctx *sched_ctx)
															
@@ -73,7 +85,7 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
																 	int nbasic_workers = sched_ctx->nworkers;
															
 
																 	int is_basic_worker = (best_workerid < nbasic_workers);
															
 
																-	unsigned memory_node; 
															
 
																+	unsigned memory_node;
															
 
																 	memory_node = starpu_worker_get_memory_node(best_workerid);
															
 
																 	if (starpu_get_prefetch_flag())
															
@@ -81,33 +93,37 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
																 	int ret = 0;
															
 
																-	PTHREAD_MUTEX_LOCK(&big_lock);
															
 
																+	_STARPU_PTHREAD_MUTEX_LOCK(&big_lock);
															
 
																 	if (is_basic_worker)
															
 
																 	{
															
 
																 		task->predicted = exp_end_predicted - worker_exp_end[best_workerid];
															
 
																-		worker_exp_len[best_workerid] += exp_end_predicted - worker_exp_end[best_workerid];
															
 
																+		/* TODO */
															
 
																+		task->predicted_transfer = 0;
															
 
																+		worker_exp_len[best_workerid] += task->predicted;
															
 
																 		worker_exp_end[best_workerid] = exp_end_predicted;
															
 
																 		worker_exp_start[best_workerid] = exp_end_predicted - worker_exp_len[best_workerid];
															
 
																-	
															
 
																+
															
 
																 		ntasks[best_workerid]++;
															
 
																 		ret = starpu_push_local_task(best_workerid, task, prio);
															
 
																 	}
															
 
																-	else {
															
 
																+	else
															
 
																+	{
															
 
																 		/* This is a combined worker so we create task aliases */
															
 
																-		struct starpu_combined_worker_s *combined_worker;
															
 
																+		struct _starpu_combined_worker *combined_worker;
															
 
																 		combined_worker = _starpu_get_combined_worker_struct(best_workerid);
															
 
																 		int worker_size = combined_worker->worker_size;
															
 
																 		int *combined_workerid = combined_worker->combined_workerid;
															
 
																-		starpu_job_t j = _starpu_get_job_associated_to_task(task);
															
 
																+		struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
															
 
																 		j->task_size = worker_size;
															
 
																 		j->combined_workerid = best_workerid;
															
 
																 		j->active_task_alias_count = 0;
															
 
																+		task->predicted_transfer = 0;
															
 
																-		PTHREAD_BARRIER_INIT(&j->before_work_barrier, NULL, worker_size);
															
 
																-		PTHREAD_BARRIER_INIT(&j->after_work_barrier, NULL, worker_size);
															
 
																+		_STARPU_PTHREAD_BARRIER_INIT(&j->before_work_barrier, NULL, worker_size);
															
 
																+		_STARPU_PTHREAD_BARRIER_INIT(&j->after_work_barrier, NULL, worker_size);
															
 
																 		int i;
															
 
																 		for (i = 0; i < worker_size; i++)
															
@@ -116,31 +132,34 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
																 			int local_worker = combined_workerid[i];
															
 
																 			alias->predicted = exp_end_predicted - worker_exp_end[local_worker];
															
 
																-	
															
 
																-			worker_exp_len[local_worker] += exp_end_predicted - worker_exp_end[local_worker];
															
 
																+			/* TODO */
															
 
																+			alias->predicted_transfer = 0;
															
 
																+
															
 
																+			worker_exp_len[local_worker] += alias->predicted;
															
 
																 			worker_exp_end[local_worker] = exp_end_predicted;
															
 
																 			worker_exp_start[local_worker] = exp_end_predicted - worker_exp_len[local_worker];
															
 
																-		
															
 
																+
															
 
																 			ntasks[local_worker]++;
															
 
																-	
															
 
																+
															
 
																 			ret |= starpu_push_local_task(local_worker, alias, prio);
															
 
																 		}
															
 
																 	}
															
 
																-	PTHREAD_MUTEX_UNLOCK(&big_lock);
															
 
																+	_STARPU_PTHREAD_MUTEX_UNLOCK(&big_lock);
															
 
																 	return ret;
															
 
																 }
															
 
																-static double compute_expected_end(int workerid, double length, int nworkers)
															
 
																+static double compute_expected_end(int workerid, double length)
															
 
																 {
															
 
																 	if (workerid < (int)nworkers)
															
 
																 	{
															
 
																 		/* This is a basic worker */
															
 
																 		return worker_exp_start[workerid] + worker_exp_len[workerid] + length;
															
 
																 	}
															
 
																-	else {
															
 
																+	else
															
 
																+	{
															
 
																 		/* This is a combined worker, the expected end is the end for the latest worker */
															
 
																 		int worker_size;
															
 
																 		int *combined_workerid;
															
@@ -161,7 +180,7 @@ static double compute_expected_end(int workerid, double length, int nworkers)
 
																 	}
															
 
																 }
															
 
																-static double compute_ntasks_end(int workerid, int nworkers)
															
 
																+static double compute_ntasks_end(int workerid)
															
 
																 {
															
 
																 	enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(workerid);
															
 
																 	if (workerid < (int)nworkers)
															
@@ -169,7 +188,8 @@ static double compute_ntasks_end(int workerid, int nworkers)
 
																 		/* This is a basic worker */
															
 
																 		return ntasks[workerid] / starpu_worker_get_relative_speedup(perf_arch);
															
 
																 	}
															
 
																-	else {
															
 
																+	else
															
 
																+	{
															
 
																 		/* This is a combined worker, the expected end is the end for the latest worker */
															
 
																 		int worker_size;
															
 
																 		int *combined_workerid;
															
@@ -181,7 +201,7 @@ static double compute_ntasks_end(int workerid, int nworkers)
 
																 		for (i = 0; i < worker_size; i++)
															
 
																 		{
															
 
																 			/* XXX: this is actually bogus: not all pushed tasks are necessarily parallel... */
															
 
																-			ntasks_end = STARPU_MAX(ntasks_end, ntasks[combined_workerid[i]] / starpu_worker_get_relative_speedup(perf_arch));
															
 
																+			ntasks_end = STARPU_MAX(ntasks_end, (int) ((double) ntasks[combined_workerid[i]] / starpu_worker_get_relative_speedup(perf_arch)));
															
 
																 		}
															
 
																 		return ntasks_end;
															
@@ -199,22 +219,22 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, uns
 
																 	/* this flag is set if the corresponding worker is selected because
															
 
																 	   there is no performance prediction available yet */
															
 
																-	int forced_best = -1, forced_best_ctx = -1;
															
 
																+	int forced_best = -1, forced_best_ctx = -1, forced_nimpl = -1;
															
 
																-	double local_task_length[nworkers_ctx + ncombinedworkers];
															
 
																-	double local_data_penalty[nworkers_ctx + ncombinedworkers];
															
 
																-	double local_power[nworkers_ctx + ncombinedworkers];
															
 
																-	double local_exp_end[nworkers_ctx + ncombinedworkers];
															
 
																-	double fitness[nworkers_ctx + ncombinedworkers];
															
 
																+	double local_task_length[nworkers_ctx + ncombinedworkers][STARPU_MAXIMPLEMENTATIONS];
															
 
																+	double local_data_penalty[nworkers_ctx + ncombinedworkers][STARPU_MAXIMPLEMENTATIONS];
															
 
																+	double local_power[nworkers_ctx + ncombinedworkers][STARPU_MAXIMPLEMENTATIONS];
															
 
																+	double local_exp_end[nworkers_ctx + ncombinedworkers][STARPU_MAXIMPLEMENTATIONS];
															
 
																+	double fitness[nworkers_ctx + ncombinedworkers][STARPU_MAXIMPLEMENTATIONS];
															
 
																 	double max_exp_end = 0.0;
															
 
																-	int skip_worker[nworkers_ctx + ncombinedworkers];
															
 
																+	int skip_worker[nworkers_ctx + ncombinedworkers][STARPU_MAXIMPLEMENTATIONS];
															
 
																 	double best_exp_end = DBL_MAX;
															
 
																 	//double penality_best = 0.0;
															
 
																-	int ntasks_best = -1, ntasks_best_ctx = -1;
															
 
																+	int ntasks_best = -1, ntasks_best_ctx = -1, nimpl_best = -1;
															
 
																 	double ntasks_best_end = 0.0;
															
 
																 	int calibrating = 0;
															
@@ -232,48 +252,51 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, uns
 
																 	}
															
 
																 	unsigned nimpl;
															
 
																-	unsigned best_impl = 0;
															
 
																 	for (worker_ctx = 0; worker_ctx < (nworkers_ctx + ncombinedworkers); worker_ctx++)
															
 
																  	{
															
 
																 		worker = sched_ctx->workerids[worker_ctx];
															
 
																 		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
															
 
																 		{
															
 
																-			if (!starpu_combined_worker_may_execute_task(worker, task, nimpl))
															
 
																+			if (!starpu_combined_worker_can_execute_task(worker, task, nimpl))
															
 
																 			{
															
 
																 				/* no one on that queue may execute this task */
															
 
																-				skip_worker[worker] = 1;
															
 
																+				skip_worker[worker][nimpl] = 1;
															
 
																 				continue;
															
 
																 			}
															
 
																-			else {
															
 
																-				skip_worker[worker] = 0;
															
 
																+			else
															
 
																+			{
															
 
																+				skip_worker[worker][nimpl] = 0;
															
 
																 			}
															
 
																 			enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
															
 
																-			local_task_length[worker_ctx] = starpu_task_expected_length(task, perf_arch,nimpl);
															
 
																+			local_task_length[worker_ctx][nimpl] = starpu_task_expected_length(task, perf_arch,nimpl);
															
 
																 			unsigned memory_node = starpu_worker_get_memory_node(worker);
															
 
																-			local_data_penalty[worker_ctx] = starpu_task_expected_data_transfer_time(memory_node, task);
															
 
																+			local_data_penalty[worker_ctx][nimpl] = starpu_task_expected_data_transfer_time(memory_node, task);
															
 
																-			double ntasks_end = compute_ntasks_end(worker, nworkers_ctx);
															
 
																+			double ntasks_end = compute_ntasks_end(worker);
															
 
																 			if (ntasks_best == -1
															
 
																-					|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
															
 
																-					|| (!calibrating && local_task_length[worker] == -1.0) /* Not calibrating but this worker is being calibrated */
															
 
																-					|| (calibrating && local_task_length[worker] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
															
 
																-					) {
															
 
																+			    || (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
															
 
																+			    || (!calibrating && isnan(local_task_length[worker_ctx][nimpl])) /* Not calibrating but this worker is being calibrated */
															
 
																+			    || (calibrating && isnan(local_task_length[worker_ctx][nimpl]) && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
															
 
																+					)
															
 
																+			{
															
 
																 				ntasks_best_end = ntasks_end;
															
 
																 				ntasks_best = worker;
															
 
																 				ntasks_best_ctx = worker_ctx;
															
 
																+				nimpl_best = nimpl;
															
 
																 			}
															
 
																-			if (local_task_length[worker_ctx] == -1.0)
															
 
																+			if (isnan(local_task_length[worker_ctx][nimpl]))
															
 
																 				/* we are calibrating, we want to speed-up calibration time
															
 
																 				 * so we privilege non-calibrated tasks (but still
															
 
																 				 * greedily distribute them to avoid dumb schedules) */
															
 
																 				calibrating = 1;
															
 
																-			if (local_task_length[worker_ctx] <= 0.0)
															
 
																+			if (isnan(local_task_length[worker_ctx][nimpl])
															
 
																+					|| _STARPU_IS_ZERO(local_task_length[worker_ctx][nimpl]))
															
 
																 				/* there is no prediction available for that task
															
 
																 				 * with that arch yet, so switch to a greedy strategy */
															
 
																 				unknown = 1;
															
@@ -281,23 +304,23 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, uns
 
																 			if (unknown)
															
 
																 				continue;
															
 
																-			local_exp_end[worker_ctx] = compute_expected_end(worker, local_task_length[worker], nworkers_ctx);
															
 
																+			local_exp_end[worker_ctx][nimpl] = compute_expected_end(worker, local_task_length[worker_ctx][nimpl]);
															
 
																-			//fprintf(stderr, "WORKER %d -> length %e end %e\n", worker, local_task_length[worker], local_exp_end[worker]);
															
 
																+			//fprintf(stderr, "WORKER %d -> length %e end %e\n", worker, local_task_length[worker_ctx][nimpl], local_exp_end[worker][nimpl]);
															
 
																-			if (local_exp_end[worker_ctx] < best_exp_end)
															
 
																+			if (local_exp_end[worker_ctx][nimpl] < best_exp_end)
															
 
																 			{
															
 
																 				/* a better solution was found */
															
 
																-				best_exp_end = local_exp_end[worker_ctx];
															
 
																-				best_impl = nimpl;
															
 
																+				best_exp_end = local_exp_end[worker_ctx][nimpl];
															
 
																+				nimpl_best = nimpl;
															
 
																 			}
															
 
																-			local_power[worker_ctx] = starpu_task_expected_power(task, perf_arch,nimpl);
															
 
																+			local_power[worker_ctx][nimpl] = starpu_task_expected_power(task, perf_arch,nimpl);
															
 
																 			//_STARPU_DEBUG("Scheduler parallel heft: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker],local_power[worker],worker,nimpl);
															
 
																-			if (local_power[worker_ctx] == -1.0)
															
 
																-				local_power[worker_ctx] = 0.;
															
 
																+			if (isnan(local_power[worker_ctx][nimpl]))
															
 
																+				local_power[worker_ctx][nimpl] = 0.;
															
 
																 		} //end for
															
 
																 	}
															
@@ -306,9 +329,9 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, uns
 
																 	{
															
 
																 		forced_best = ntasks_best;
															
 
																 		forced_best_ctx = ntasks_best_ctx;
															
 
																+		forced_nimpl = nimpl_best;
															
 
																 	}
															
 
																-
															
 
																 	double best_fitness = -1;
															
@@ -320,32 +343,35 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, uns
 
																 			worker = worker_ctx >= nworkers_ctx ? worker_ctx : 
															
 
																 				sched_ctx->workerids[worker_ctx];
															
 
																-			if (skip_worker[worker_ctx])
															
 
																+			for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
															
 
																 			{
															
 
																-				/* no one on that queue may execute this task */
															
 
																-				continue;
															
 
																+				if (skip_worker[worker_ctx][nimpl])
															
 
																+				{
															
 
																+					/* no one on that queue may execute this task */
															
 
																+					continue;
															
 
																+				}
															
 
																+
															
 
																+				fitness[worker_ctx][nimpl] = hd->alpha*(local_exp_end[worker_ctx][nimpl] - best_exp_end) 
															
 
																+						+ hd->beta*(local_data_penalty[worker_ctx][nimpl])
															
 
																+						+ hd->_gamma*(local_power[worker_ctx][nimpl]);
															
 
																+
															
 
																+				if (local_exp_end[worker_ctx][nimpl] > max_exp_end)
															
 
																+					/* This placement will make the computation
															
 
																+					 * longer, take into account the idle
															
 
																+					 * consumption of other cpus */
															
 
																+					fitness[worker_ctx][nimpl] += hd->_gamma * hd->idle_power * (local_exp_end[worker_ctx][nimpl] - max_exp_end) / 1000000.0;
															
 
																+
															
 
																+				if (best == -1 || fitness[worker_ctx] < best_fitness)
															
 
																+				{
															
 
																+					/* we found a better solution */
															
 
																+					best_fitness = fitness[worker_ctx][nimpl];
															
 
																+					best = worker;
															
 
																+					best_id_ctx = worker_ctx;
															
 
																+					nimpl_best = nimpl;
															
 
																+				}
															
 
																+
															
 
																+			//	fprintf(stderr, "FITNESS worker %d -> %e local_exp_end %e - local_data_penalty %e\n", worker, fitness[worker][nimpl], local_exp_end[worker][nimpl] - best_exp_end, local_data_penalty[worker][nimpl]);
															
 
																 			}
															
 
																-	
															
 
																-			fitness[worker_ctx] = hd->alpha*(local_exp_end[worker_ctx] - best_exp_end) 
															
 
																-					+ hd->beta*(local_data_penalty[worker_ctx])
															
 
																-					+ hd->_gamma*(local_power[worker_ctx]);
															
 
																-
															
 
																-			if (local_exp_end[worker_ctx] > max_exp_end)
															
 
																-				/* This placement will make the computation
															
 
																-				 * longer, take into account the idle
															
 
																-				 * consumption of other cpus */
															
 
																-				fitness[worker_ctx] += hd->_gamma * hd->idle_power * (local_exp_end[worker_ctx] - max_exp_end) / 1000000.0;
															
 
																-
															
 
																-			if (best == -1 || fitness[worker_ctx] < best_fitness)
															
 
																-			{
															
 
																-				/* we found a better solution */
															
 
																-				best_fitness = fitness[worker_ctx];
															
 
																-				best = worker;
															
 
																-				best_id_ctx = worker_ctx;
															
 
																-			}
															
 
																-
															
 
																-		//	fprintf(stderr, "FITNESS worker %d -> %e local_exp_end %e - local_data_penalty %e\n", worker, fitness[worker], local_exp_end[worker] - best_exp_end, local_data_penalty[worker]);
															
 
																-		}
															
 
																 	}
															
 
																 	STARPU_ASSERT(forced_best != -1 || best != -1);
															
@@ -357,18 +383,19 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, uns
 
																 		 * so we force this measurement */
															
 
																 		best = forced_best;
															
 
																 		best_id_ctx = forced_best_ctx;
															
 
																+		nimpl_best = forced_nimpl;
															
 
																 		//penality_best = 0.0;
															
 
																-		best_exp_end = local_exp_end[best_id_ctx];
															
 
																+		best_exp_end = compute_expected_end(best, 0);
															
 
																 	}
															
 
																-	else 
															
 
																+	else
															
 
																 	{
															
 
																-                //penality_best = local_data_penalty[best];
															
 
																-		best_exp_end = local_exp_end[best_id_ctx];
															
 
																+		//penality_best = local_data_penalty[best_id_ctx][nimpl_best];
															
 
																+		best_exp_end = local_exp_end[best_id_ctx][nimpl_best];
															
 
																 	}
															
 
																-	//_STARPU_DEBUG("Scheduler parallel heft: kernel (%u)\n", best_impl);
															
 
																-	_starpu_get_job_associated_to_task(task)->nimpl = best_impl;
															
 
																+	//_STARPU_DEBUG("Scheduler parallel heft: kernel (%u)\n", nimpl_best);
															
 
																+	_starpu_get_job_associated_to_task(task)->nimpl = nimpl_best;
															
 
																 	/* we should now have the best worker in variable "best" */
															
 
																 	return push_task_on_best_worker(task, best, best_exp_end, prio, sched_ctx);
															
 
																 }
															
@@ -452,11 +479,11 @@ static void initialize_parallel_heft_policy(unsigned sched_ctx_id)
 
																 			worker_exp_end[workerid] = worker_exp_start[workerid]; 
															
 
																 			ntasks[workerid] = 0;
															
 
																 		}
															
 
																-		PTHREAD_MUTEX_INIT(sched_ctx->sched_mutex[workerid], NULL);
															
 
																-		PTHREAD_COND_INIT(sched_ctx->sched_cond[workerid], NULL);
															
 
																+		_STARPU_PTHREAD_MUTEX_INIT(sched_ctx->sched_mutex[workerid], NULL);
															
 
																+		_STARPU_PTHREAD_COND_INIT(sched_ctx->sched_cond[workerid], NULL);
															
 
																 	}
															
 
																-	PTHREAD_MUTEX_INIT(&big_lock, NULL);
															
 
																+	_STARPU_PTHREAD_MUTEX_INIT(&big_lock, NULL);
															
 
																 	/* We pre-compute an array of all the perfmodel archs that are applicable */
															
 
																 	unsigned total_worker_count = nworkers_ctx + ncombinedworkers;
															
@@ -500,12 +527,14 @@ static void parallel_heft_deinit(unsigned sched_ctx_id)
 
																 }
															
 
																 /* TODO: use post_exec_hook to fix the expected start */
															
 
																-struct starpu_sched_policy_s _starpu_sched_parallel_heft_policy = {
															
 
																+struct starpu_sched_policy _starpu_sched_parallel_heft_policy =
															
 
																+{
															
 
																 	.init_sched = initialize_parallel_heft_policy,
															
 
																 	.init_sched_for_workers = parallel_heft_init_for_workers,
															
 
																 	.deinit_sched = parallel_heft_deinit,
															
 
																 	.push_task = parallel_heft_push_task, 
															
 
																 	.pop_task = NULL,
															
 
																+	.pre_exec_hook = NULL,
															
 
																 	.post_exec_hook = parallel_heft_post_exec_hook,
															
 
																 	.pop_every_task = NULL,
															
 
																 	.policy_name = "pheft",
															
--- a/src/sched_policies/random_policy.c
+++ b/src/sched_policies/random_policy.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010-2011  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  * Copyright (C) 2011  INRIA
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -52,7 +52,8 @@ static int _random_push_task(struct starpu_task *task, unsigned prio, struct sta
 
																 		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
															
 
																 		double worker_alpha = starpu_worker_get_relative_speedup(perf_arch);
															
 
																-		if (alpha + worker_alpha > random) {
															
 
																+		if (alpha + worker_alpha > random && starpu_worker_can_execute_task(worker, task, 0))
															
 
																+		{
															
 
																 			/* we found the worker */
															
 
																 			selected = worker;
															
 
																 			break;
															
@@ -72,7 +73,7 @@ static int random_push_task(struct starpu_task *task, unsigned sched_ctx_id)
 
																 {
															
 
																 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
															
 
																-    return _random_push_task(task, 0, sched_ctx);
															
 
																+    return _random_push_task(task, !!task->priority, sched_ctx);
															
 
																 }
															
 
																 static void initialize_random_policy_for_workers(unsigned sched_ctx_id, int *workerids, unsigned nnew_workers) 
															
@@ -109,12 +110,14 @@ static void initialize_random_policy(unsigned sched_ctx_id)
 
																 	}
															
 
																 }
															
 
																-struct starpu_sched_policy_s _starpu_sched_random_policy = {
															
 
																+struct starpu_sched_policy _starpu_sched_random_policy =
															
 
																+{
															
 
																 	.init_sched = initialize_random_policy,
															
 
																 	.init_sched_for_workers = initialize_random_policy_for_workers,
															
 
																 	.deinit_sched = NULL,
															
 
																 	.push_task = random_push_task,
															
 
																 	.pop_task = NULL,
															
 
																+	.pre_exec_hook = NULL,
															
 
																 	.post_exec_hook = NULL,
															
 
																 	.pop_every_task = NULL,
															
 
																 	.policy_name = "random",
															
--- a/src/sched_policies/stack_queues.c
+++ b/src/sched_policies/stack_queues.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010-2011  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -22,7 +22,7 @@
 
																 #include <errno.h>
															
 
																 #include <common/utils.h>
															
 
																-/* keep track of the total number of jobs to be scheduled to avoid infinite 
															
 
																+/* keep track of the total number of jobs to be scheduled to avoid infinite
															
 
																  * polling when there are really few jobs in the overall queue */
															
 
																 static unsigned total_number_of_jobs;
															
@@ -31,12 +31,12 @@ void _starpu_init_stack_queues_mechanisms(void)
 
																 	total_number_of_jobs = 0;
															
 
																 }
															
 
																-struct starpu_stack_jobq_s *_starpu_create_stack(void)
															
 
																+struct _starpu_stack_jobq *_starpu_create_stack(void)
															
 
																 {
															
 
																-	struct starpu_stack_jobq_s *stack;
															
 
																-	stack = (struct starpu_stack_jobq_s *) malloc(sizeof(struct starpu_stack_jobq_s));
															
 
																+	struct _starpu_stack_jobq *stack;
															
 
																+	stack = (struct _starpu_stack_jobq *) malloc(sizeof(struct _starpu_stack_jobq));
															
 
																-	stack->jobq = starpu_job_list_new();
															
 
																+	stack->jobq = _starpu_job_list_new();
															
 
																 	stack->njobs = 0;
															
 
																 	stack->nprocessed = 0;
															
@@ -47,58 +47,58 @@ struct starpu_stack_jobq_s *_starpu_create_stack(void)
 
																 	return stack;
															
 
																 }
															
 
																-unsigned _starpu_get_stack_njobs(struct starpu_stack_jobq_s *stack_queue)
															
 
																+unsigned _starpu_get_stack_njobs(struct _starpu_stack_jobq *stack_queue)
															
 
																 {
															
 
																 	return stack_queue->njobs;
															
 
																 }
															
 
																-unsigned _starpu_get_stack_nprocessed(struct starpu_stack_jobq_s *stack_queue)
															
 
																+unsigned _starpu_get_stack_nprocessed(struct _starpu_stack_jobq *stack_queue)
															
 
																 {
															
 
																 	return stack_queue->nprocessed;
															
 
																 }
															
 
																-void _starpu_stack_push_task(struct starpu_stack_jobq_s *stack_queue, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, starpu_job_t task)
															
 
																+void _starpu_stack_push_task(struct _starpu_stack_jobq *stack_queue, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct _starpu_job *task)
															
 
																 {
															
 
																-	PTHREAD_MUTEX_LOCK(sched_mutex);
															
 
																+	_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
															
 
																 	total_number_of_jobs++;
															
 
																-	STARPU_TRACE_JOB_PUSH(task, 0);
															
 
																+	_STARPU_TRACE_JOB_PUSH(task, 0);
															
 
																 	if (task->task->priority)
															
 
																-		starpu_job_list_push_back(stack_queue->jobq, task);
															
 
																+		_starpu_job_list_push_back(stack_queue->jobq, task);
															
 
																 	else
															
 
																-		starpu_job_list_push_front(stack_queue->jobq, task);
															
 
																+		_starpu_job_list_push_front(stack_queue->jobq, task);
															
 
																 	stack_queue->njobs++;
															
 
																 	stack_queue->nprocessed++;
															
 
																-	PTHREAD_COND_SIGNAL(sched_cond);
															
 
																-	PTHREAD_MUTEX_UNLOCK(sched_mutex);
															
 
																+	_STARPU_PTHREAD_COND_SIGNAL(sched_cond);
															
 
																+	_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
															
 
																 }
															
 
																-starpu_job_t _starpu_stack_pop_task(struct starpu_stack_jobq_s *stack_queue, pthread_mutex_t *sched_mutex, int workerid __attribute__ ((unused)))
															
 
																+struct _starpu_job *_starpu_stack_pop_task(struct _starpu_stack_jobq *stack_queue, pthread_mutex_t *sched_mutex, int workerid __attribute__ ((unused)))
															
 
																 {
															
 
																-	starpu_job_t j = NULL;
															
 
																+	struct _starpu_job *j = NULL;
															
 
																 	if (stack_queue->njobs == 0)
															
 
																 		return NULL;
															
 
																 	/* TODO find a task that suits workerid */
															
 
																-	if (stack_queue->njobs > 0) 
															
 
																+	if (stack_queue->njobs > 0)
															
 
																 	{
															
 
																 		/* there is a task */
															
 
																-		j = starpu_job_list_pop_back(stack_queue->jobq);
															
 
																-	
															
 
																+		j = _starpu_job_list_pop_back(stack_queue->jobq);
															
 
																+
															
 
																 		STARPU_ASSERT(j);
															
 
																 		stack_queue->njobs--;
															
 
																-		
															
 
																-		STARPU_TRACE_JOB_POP(j, 0);
															
 
																-		/* we are sure that we got it now, so at worst, some people thought 
															
 
																+		_STARPU_TRACE_JOB_POP(j, 0);
															
 
																+
															
 
																+		/* we are sure that we got it now, so at worst, some people thought
															
 
																 		 * there remained some work and will soon discover it is not true */
															
 
																-		PTHREAD_MUTEX_LOCK(sched_mutex);
															
 
																+		_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
															
 
																 		total_number_of_jobs--;
															
 
																-		PTHREAD_MUTEX_UNLOCK(sched_mutex);
															
 
																+		_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
															
 
																 	}
															
 
																-	
															
 
																+
															
 
																 	return j;
															
 
																 }
															
--- a/src/sched_policies/stack_queues.h
+++ b/src/sched_policies/stack_queues.h
@@ -23,9 +23,10 @@
 
																 #include <common/config.h>
															
 
																 #include <core/jobs.h>
															
 
																-struct starpu_stack_jobq_s {
															
 
																+struct _starpu_stack_jobq
															
 
																+{
															
 
																 	/* the actual list */
															
 
																-	starpu_job_list_t jobq;
															
 
																+	struct _starpu_job_list *jobq;
															
 
																 	/* the number of tasks currently in the queue */
															
 
																 	unsigned njobs;
															
@@ -39,17 +40,17 @@ struct starpu_stack_jobq_s {
 
																 	double exp_len; /* Expected duration of the set of tasks in the queue */
															
 
																 };
															
 
																-struct starpu_stack_jobq_s *_starpu_create_stack(void);
															
 
																+struct _starpu_stack_jobq *_starpu_create_stack(void);
															
 
																-void _starpu_stack_push_task(struct starpu_stack_jobq_s *stack, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, starpu_job_t task);
															
 
																+void _starpu_stack_push_task(struct _starpu_stack_jobq *stack, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct _starpu_job *task);
															
 
																-starpu_job_t _starpu_stack_pop_task(struct starpu_stack_jobq_s *stack, pthread_mutex_t *sched_mutex, int workerid);
															
 
																+struct _starpu_job *_starpu_stack_pop_task(struct _starpu_stack_jobq *stack, pthread_mutex_t *sched_mutex, int workerid);
															
 
																 void _starpu_init_stack_queues_mechanisms(void);
															
 
																-unsigned _starpu_get_stack_njobs(struct starpu_stack_jobq_s *stack);
															
 
																-unsigned _starpu_get_stack_nprocessed(struct starpu_stack_jobq_s *stack);
															
 
																+unsigned _starpu_get_stack_njobs(struct _starpu_stack_jobq *stack);
															
 
																+unsigned _starpu_get_stack_nprocessed(struct _starpu_stack_jobq *stack);
															
 
																 #endif // __STACK_QUEUES_H__
															
--- a/src/sched_policies/work_stealing_policy.c
+++ b/src/sched_policies/work_stealing_policy.c
@@ -2,7 +2,7 @@
 
																  *
															
 
																  * Copyright (C) 2010-2011  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																- * Copyright (C) 2011  INRIA
															
 
																+ * Copyright (C) 2011, 2012  INRIA
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -18,11 +18,13 @@
 
																 /* Work stealing policy */
															
 
																+#include <float.h>
															
 
																+
															
 
																 #include <core/workers.h>
															
 
																 #include <sched_policies/deque_queues.h>
															
 
																 typedef struct work_stealing_data{
															
 
																-	struct starpu_deque_jobq_s **queue_array;
															
 
																+	struct _starpu_deque_jobq **queue_array;
															
 
																 	unsigned rr_worker;
															
 
																 	/* keep track of the work performed from the beginning of the algorithm to make
															
 
																 	 * better decisions about which queue to select when stealing or deferring work
															
@@ -30,107 +32,198 @@ typedef struct work_stealing_data{
 
																 	unsigned performed_total;
															
 
																 	pthread_mutex_t sched_mutex;
															
 
																 	pthread_cond_t sched_cond;
															
 
																+	unsigned last_pop_worker;
															
 
																+static unsigned last_push_worker;
															
 
																 } work_stealing_data;
															
 
																 #ifdef USE_OVERLOAD
															
 
																-static float overload_metric(struct starpu_deque_jobq_s *dequeue_queue, unsigned *performed_total)
															
 
																+
															
 
																+/**
															
 
																+ * Minimum number of task we wait for being processed before we start assuming
															
 
																+ * on which worker the computation would be faster.
															
 
																+ */
															
 
																+static int calibration_value = 0;
															
 
																+
															
 
																+#endif /* USE_OVERLOAD */
															
 
																+
															
 
																+
															
 
																+/**
															
 
																+ * Return a worker from which a task can be stolen.
															
 
																+ * Selecting a worker is done in a round-robin fashion, unless
															
 
																+ * the worker previously selected doesn't own any task,
															
 
																+ * then we return the first non-empty worker.
															
 
																+ */
															
 
																+static unsigned select_victim_round_robin(struct starpu_sched_ctx *sched_ctx)
															
 
																 {
															
 
																-	float execution_ratio = 0.0f;
															
 
																-	if (*performed_total > 0) {
															
 
																-		execution_ratio = _starpu_get_deque_nprocessed(dequeue_queue)/ *performed_total;
															
 
																+	work_stealing_data *ws = (work_stealing_data*)sched_ctx->policy_data;
															
 
																+	unsigned worker = ws->last_pop_worker;
															
 
																+
															
 
																+	/* If the worker's queue is empty, let's try
															
 
																+	 * the next ones */
															
 
																+	while (!ws->queue_array[worker]->njobs)
															
 
																+	{
															
 
																+		worker = (worker + 1) % sched_ctx->nworkers;
															
 
																+		if (worker == ws->last_pop_worker)
															
 
																+		{
															
 
																+			/* We got back to the first worker,
															
 
																+			 * don't go in infinite loop */
															
 
																+			break;
															
 
																+		}
															
 
																 	}
															
 
																-	unsigned performed_queue;
															
 
																-	performed_queue = _starpu_get_deque_nprocessed(dequeue_queue);
															
 
																+	ws->last_pop_worker = (worker + 1) % sched_ctx->nworkers;
															
 
																-	float current_ratio = 0.0f;
															
 
																-	if (performed_queue > 0) {
															
 
																-		current_ratio = _starpu_get_deque_njobs(dequeue_queue)/performed_queue;
															
 
																-	}
															
 
																-	
															
 
																-	return (current_ratio - execution_ratio);
															
 
																+	return worker;
															
 
																 }
															
 
																-/* who to steal work to ? */
															
 
																-static struct starpu_deque_jobq_s *select_victimq(work_stealing_data *ws, unsigned nworkers)
															
 
																+/**
															
 
																+ * Return a worker to whom add a task.
															
 
																+ * Selecting a worker is done in a round-robin fashion.
															
 
																+ */
															
 
																+static unsigned select_worker_round_robin(struct starpu_sched_ctx *sched_ctx)
															
 
																 {
															
 
																-	struct starpu_deque_jobq_s *q;
															
 
																+	work_stealing_data *ws = (work_stealing_data*)sched_ctx->policy_data;
															
 
																+	unsigned worker = ws->last_push_worker;
															
 
																-	unsigned attempts = nworkers;
															
 
																+	last_push_worker = (last_push_worker + 1) % sched_ctx->nworkers;
															
 
																-	unsigned worker = ws->rr_worker;
															
 
																-	do {
															
 
																-		if (overload_metric(worker) > 0.0f)
															
 
																-		{
															
 
																-			q = ws->queue_array[worker];
															
 
																-			return q;
															
 
																-		}
															
 
																-		else {
															
 
																-			worker = (worker + 1)%nworkers;
															
 
																-		}
															
 
																-	} while(attempts-- > 0);
															
 
																+	return worker;
															
 
																+}
															
 
																+
															
 
																+#ifdef USE_OVERLOAD
															
 
																+
															
 
																+/**
															
 
																+ * Return a ratio helpful to determine whether a worker is suitable to steal
															
 
																+ * tasks from or to put some tasks in its queue.
															
 
																+ *
															
 
																+ * \return	a ratio with a positive or negative value, describing the current state of the worker :
															
 
																+ * 		a smaller value implies a faster worker with an relatively emptier queue : more suitable to put tasks in
															
 
																+ * 		a bigger value implies a slower worker with an reletively more replete queue : more suitable to steal tasks from
															
 
																+ */
															
 
																+static float overload_metric(struct starpu_sched_ctx *sched_ctx, unsigned id)
															
 
																+{
															
 
																+	work_stealing_data *ws = (work_stealing_data*)sched_ctx->policy_data;
															
 
																+	float execution_ratio = 0.0f;
															
 
																+	float current_ratio = 0.0f;
															
 
																+
															
 
																+	int nprocessed = _starpu_get_deque_nprocessed(ws->queue_array[id]);
															
 
																+	unsigned njobs = _starpu_get_deque_njobs(ws->queue_array[id]);
															
 
																-	/* take one anyway ... */
															
 
																-	q = ws->queue_array[ws->rr_worker];
															
 
																-	ws->rr_worker = (ws->rr_worker + 1 )%nworkers;
															
 
																+	/* Did we get enough information ? */
															
 
																+	if (performed_total > 0 && nprocessed > 0)
															
 
																+	{
															
 
																+		/* How fast or slow is the worker compared to the other workers */
															
 
																+		execution_ratio = (float) nprocessed / performed_total;
															
 
																+		/* How replete is its queue */
															
 
																+		current_ratio = (float) njobs / nprocessed;
															
 
																+	}
															
 
																+	else
															
 
																+	{
															
 
																+		return 0.0f;
															
 
																+	}
															
 
																-	return q;
															
 
																+	return (current_ratio - execution_ratio);
															
 
																 }
															
 
																-static struct starpu_deque_jobq_s *select_workerq(work_stealing_data *ws, unsigned nworkers)
															
 
																+/**
															
 
																+ * Return the most suitable worker from which a task can be stolen.
															
 
																+ * The number of previously processed tasks, total and local,
															
 
																+ * and the number of tasks currently awaiting to be processed
															
 
																+ * by the tasks are taken into account to select the most suitable
															
 
																+ * worker to steal task from.
															
 
																+ */
															
 
																+static unsigned select_victim_overload(struct starpu_sched_ctx *sched_ctx)
															
 
																 {
															
 
																-	struct starpu_deque_jobq_s *q;
															
 
																+	unsigned worker, worker_ctx;
															
 
																+	float  worker_ratio;
															
 
																+	unsigned best_worker = 0;
															
 
																+	float best_ratio = FLT_MIN;	
															
 
																-	unsigned attempts = nworkers;
															
 
																+	/* Don't try to play smart until we get
															
 
																+	 * enough informations. */
															
 
																+	if (performed_total < calibration_value)
															
 
																+		return select_victim_round_robin(sched_ctx);
															
 
																-	unsigned worker = ws->rr_worker;
															
 
																-	do {
															
 
																-		if (overload_metric(worker) < 0.0f)
															
 
																+	for (worker_ctx = 0; worker_ctx < sched_ctx->nworkers; worker_ctx++)
															
 
																+	{
															
 
																+		worker = sched_ctx->workerid[worker_ctx];
															
 
																+		worker_ratio = overload_metric(worker);
															
 
																+
															
 
																+		if (worker_ratio > best_ratio)
															
 
																 		{
															
 
																-			q = ws->queue_array[worker];
															
 
																-			return q;
															
 
																-		}
															
 
																-		else {
															
 
																-			worker = (worker + 1)%nworkers;
															
 
																+			best_worker = worker;
															
 
																+			best_ratio = worker_ratio;
															
 
																 		}
															
 
																-	} while(attempts-- > 0);
															
 
																-
															
 
																-	/* take one anyway ... */
															
 
																-	q = ws->queue_array[ws->rr_worker];
															
 
																-	ws->rr_worker = (ws->rr_worker + 1 )%nworkers;
															
 
																+	}
															
 
																-	return q;
															
 
																+	return best_worker;
															
 
																 }
															
 
																-#else
															
 
																-
															
 
																-/* who to steal work to ? */
															
 
																-static struct starpu_deque_jobq_s *select_victimq(work_stealing_data *ws, unsigned nworkers)
															
 
																+/**
															
 
																+ * Return the most suitable worker to which a task should be added.
															
 
																+ * The number of previously processed tasks, total and local,
															
 
																+ * and the number of tasks currently awaiting to be processed
															
 
																+ * by each worker are taken into account to select the most suitable
															
 
																+ * worker to add a task to.
															
 
																+ */
															
 
																+static unsigned select_worker_overload(struct starpu_sched_ctx *sched_ctx)
															
 
																 {
															
 
																-	struct starpu_deque_jobq_s *q;
															
 
																+	unsigned worker, worker_ctx;
															
 
																+	float  worker_ratio;
															
 
																+	unsigned best_worker = 0;
															
 
																+	float best_ratio = FLT_MAX;
															
 
																-	q = ws->queue_array[ws->rr_worker];
															
 
																+	/* Don't try to play smart until we get
															
 
																+	 * enough information. */
															
 
																+	if (performed_total < calibration_value)
															
 
																+		return select_worker_round_robin(sched_ctx);
															
 
																-	ws->rr_worker = (ws->rr_worker + 1 )%nworkers;
															
 
																+	for (worker_ctx = 0; worker_ctx < sched_ctx->nworkers; worker_ctx++)
															
 
																+	{
															
 
																+		worker = sched_ctx->workerid[worker_ctx];
															
 
																+		worker_ratio = overload_metric(sched_ctx,  worker);
															
 
																-	return q;
															
 
																-}
															
 
																+		if (worker_ratio < best_ratio)
															
 
																+		{
															
 
																+			best_worker = worker;
															
 
																+			best_ratio = worker_ratio;
															
 
																+		}
															
 
																+	}
															
 
																+	return best_worker;
															
 
																+}
															
 
																-/* when anonymous threads submit tasks, 
															
 
																- * we need to select a queue where to dispose them */
															
 
																-static struct starpu_deque_jobq_s *select_workerq(work_stealing_data *ws, unsigned nworkers)
															
 
																-{
															
 
																-	struct starpu_deque_jobq_s *q;
															
 
																+#endif /* USE_OVERLOAD */
															
 
																-	q = ws->queue_array[ws->rr_worker];
															
 
																-	ws->rr_worker = (ws->rr_worker + 1 )%nworkers;
															
 
																+/**
															
 
																+ * Return a worker from which a task can be stolen.
															
 
																+ * This is a phony function used to call the right
															
 
																+ * function depending on the value of USE_OVERLOAD.
															
 
																+ */
															
 
																+static inline unsigned select_victim(struct starpu_sched_ctx *sched_ctx)
															
 
																+{
															
 
																+#ifdef USE_OVERLOAD
															
 
																+	return select_victim_overload(sched_ctx);
															
 
																+#else
															
 
																+	return select_victim_round_robin(sched_ctx);
															
 
																+#endif /* USE_OVERLOAD */
															
 
																+}
															
 
																-	return q;
															
 
																+/**
															
 
																+ * Return a worker to whose queue a task can be added.
															
 
																+ * This is a phony function used to call the right
															
 
																+ * function depending on the value of USE_OVERLOAD.
															
 
																+ */
															
 
																+static inline unsigned select_worker(struct starpu_sched_ctx *sched_ctx)
															
 
																+{
															
 
																+#ifdef USE_OVERLOAD
															
 
																+	return select_worker_overload(sched_ctx);
															
 
																+#else
															
 
																+	return select_worker_round_robin(sched_ctx);
															
 
																+#endif /* USE_OVERLOAD */
															
 
																 }
															
 
																-#endif
															
 
																 #ifdef STARPU_DEVEL
															
 
																 #warning TODO rewrite ... this will not scale at all now
															
@@ -141,64 +234,73 @@ static struct starpu_task *ws_pop_task(unsigned sched_ctx_id)
 
																 	work_stealing_data *ws = (work_stealing_data*)sched_ctx->policy_data;
															
 
																 	struct starpu_task *task;
															
 
																+	struct _starpu_deque_jobq *q;
															
 
																 	int workerid = starpu_worker_get_id();
															
 
																-	struct starpu_deque_jobq_s *q;
															
 
																+	STARPU_ASSERT(workerid != -1);
															
 
																 	q = ws->queue_array[workerid];
															
 
																 	PTHREAD_MUTEX_LOCK(&ws->sched_mutex);
															
 
																-	task = _starpu_deque_pop_task(q, -1);
															
 
																-	if (task) {
															
 
																+	task = _starpu_deque_pop_task(q, workerid);
															
 
																+	if (task)
															
 
																+	{
															
 
																 		/* there was a local task */
															
 
																 		ws->performed_total++;
															
 
																 		PTHREAD_MUTEX_UNLOCK(&ws->sched_mutex);
															
 
																+		q->nprocessed++;
															
 
																+		q->njobs--;
															
 
																 		return task;
															
 
																 	}
															
 
																-	
															
 
																+
															
 
																 	/* we need to steal someone's job */
															
 
																-	struct starpu_deque_jobq_s *victimq;
															
 
																-	victimq = select_victimq(ws, sched_ctx->nworkers);
															
 
																+	unsigned victim = select_victim(sched_ctx);
															
 
																+	struct _starpu_deque_jobq *victimq = ws->queue_array[victim];
															
 
																 	task = _starpu_deque_pop_task(victimq, workerid);
															
 
																-	if (task) {
															
 
																-		STARPU_TRACE_WORK_STEALING(q, victimq);
															
 
																+	if (task)
															
 
																+	{
															
 
																+		_STARPU_TRACE_WORK_STEALING(q, workerid);
															
 
																 		ws->performed_total++;
															
 
																-	}
															
 
																-	PTHREAD_MUTEX_UNLOCK(&ws->sched_mutex);
															
 
																+		/* Beware: we have to increase the number of processed tasks of
															
 
																+		 * the stealer, not the victim! */
															
 
																+		q->nprocessed++;
															
 
																+		victimq->njobs--;
															
 
																+	}
															
 
																 	return task;
															
 
																 }
															
 
																 int ws_push_task(struct starpu_task *task, unsigned sched_ctx_id)
															
 
																 {
															
 
																-	starpu_job_t j = _starpu_get_job_associated_to_task(task);
															
 
																-
															
 
																 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
															
 
																 	work_stealing_data *ws = (work_stealing_data*)sched_ctx->policy_data;
															
 
																+	struct _starpu_deque_jobq *deque_queue;
															
 
																+	struct _starpu_job *j = _starpu_get_job_associated_to_task(task); 
															
 
																 	int workerid = starpu_worker_get_id();
															
 
																+	_STARPU_PTHREAD_MUTEX_LOCK(&ws->sched_mutex);
															
 
																-        struct starpu_deque_jobq_s *deque_queue;
															
 
																-	deque_queue = ws->queue_array[workerid];
															
 
																+	/* If the current thread is not a worker but
															
 
																+	 * the main thread (-1), we find the best one to
															
 
																+	 * put the task on its queue */
															
 
																+	if (workerid == -1)
															
 
																+		workerid = select_worker(sched_ctx);
															
 
																-        PTHREAD_MUTEX_LOCK(&ws->sched_mutex);
															
 
																-	// XXX reuse ?
															
 
																-        //total_number_of_jobs++;
															
 
																+	deque_queue = ws->queue_array[workerid];
															
 
																-        STARPU_TRACE_JOB_PUSH(task, 0);
															
 
																-        starpu_job_list_push_front(deque_queue->jobq, j);
															
 
																-        deque_queue->njobs++;
															
 
																-        deque_queue->nprocessed++;
															
 
																+	_STARPU_TRACE_JOB_PUSH(task, 0);
															
 
																+	_starpu_job_list_push_back(deque_queue->jobq, j);
															
 
																+	deque_queue->njobs++;
															
 
																-        PTHREAD_COND_SIGNAL(&ws->sched_cond);
															
 
																-        PTHREAD_MUTEX_UNLOCK(&ws->sched_mutex);
															
 
																+	_STARPU_PTHREAD_COND_SIGNAL(&ws->sched_cond);
															
 
																+	_STARPU_PTHREAD_MUTEX_UNLOCK(&ws->sched_mutex);
															
 
																-        return 0;
															
 
																+	return 0;
															
 
																 }
															
 
																 static void initialize_ws_policy_for_workers(unsigned sched_ctx_id, int *workerids,unsigned nnew_workers) 
															
@@ -213,6 +315,12 @@ static void initialize_ws_policy_for_workers(unsigned sched_ctx_id, int *workeri
 
																 	{
															
 
																 		workerid = workerids[i];
															
 
																 		ws->queue_array[workerid] = _starpu_create_deque();
															
 
																+		/**
															
 
																+		 * The first WS_POP_TASK will increase NPROCESSED though no task was actually performed yet,
															
 
																+		 * we need to initialize it at -1.
															
 
																+		 */
															
 
																+		ws->queue_array[workerid]->nprocessed = -1;
															
 
																+		ws->queue_array[workerid]->njobs = 0;
															
 
																 		sched_ctx->sched_mutex[workerid] = &ws->sched_mutex;
															
 
																 		sched_ctx->sched_cond[workerid] = &ws->sched_cond;
															
@@ -226,11 +334,19 @@ static void initialize_ws_policy(unsigned sched_ctx_id)
 
																 	sched_ctx->policy_data = (void*)ws;
															
 
																 	unsigned nworkers = sched_ctx->nworkers;
															
 
																-	ws->rr_worker = 0;
															
 
																-	ws->queue_array = (struct starpu_deque_jobq_s**)malloc(STARPU_NMAXWORKERS*sizeof(struct starpu_deque_jobq_s*));
															
 
																+	ws->last_pop_worker = 0;
															
 
																+	ws->last_push_worker = 0;
															
 
																+
															
 
																+	/**
															
 
																+	 * The first WS_POP_TASK will increase PERFORMED_TOTAL though no task was actually performed yet,
															
 
																+	 * we need to initialize it at -1.
															
 
																+	 */
															
 
																+	ws->performed_total = -1;
															
 
																-	PTHREAD_MUTEX_INIT(&ws->sched_mutex, NULL);
															
 
																-	PTHREAD_COND_INIT(&ws->sched_cond, NULL);
															
 
																+	ws->queue_array = (struct starpu_deque_jobq_s**)malloc(STARPU_NMAXWORKERS*sizeof(struct _starpu_deque_jobq*));
															
 
																+
															
 
																+	_STARPU_PTHREAD_MUTEX_INIT(&ws->sched_mutex, NULL);
															
 
																+	_STARPU_PTHREAD_COND_INIT(&ws->sched_cond, NULL);
															
 
																 	unsigned workerid_ctx;
															
 
																 	int workerid;
															
@@ -238,9 +354,21 @@ static void initialize_ws_policy(unsigned sched_ctx_id)
 
																 	{
															
 
																 		workerid = sched_ctx->workerids[workerid_ctx];
															
 
																 		ws->queue_array[workerid] = _starpu_create_deque();
															
 
																+		/**
															
 
																+		 * The first WS_POP_TASK will increase NPROCESSED though no task was actually performed yet,
															
 
																+		 * we need to initialize it at -1.
															
 
																+		 */
															
 
																+		ws->queue_array[workerid]->nprocessed = -1;
															
 
																+		ws->queue_array[workerid]->njobs = 0;
															
 
																 		sched_ctx->sched_mutex[workerid] = &ws->sched_mutex;
															
 
																 		sched_ctx->sched_cond[workerid] = &ws->sched_cond;
															
 
																+
															
 
																+#ifdef USE_OVERLOAD
															
 
																+		enum starpu_perf_archtype perf_arch;
															
 
																+		perf_arch = starpu_worker_get_perf_archtype(workerid);
															
 
																+		calibration_value += (unsigned int) starpu_worker_get_relative_speedup(perf_arch);
															
 
																+#endif /* USE_OVERLOAD */
															
 
																 	}
															
 
																 }
															
@@ -266,11 +394,13 @@ static void deinit_ws_policy(unsigned sched_ctx_id)
 
																 	}
															
 
																 }
															
 
																-struct starpu_sched_policy_s _starpu_sched_ws_policy = {
															
 
																+struct starpu_sched_policy _starpu_sched_ws_policy =
															
 
																+{
															
 
																 	.init_sched = initialize_ws_policy,
															
 
																 	.deinit_sched = deinit_ws_policy,
															
 
																 	.push_task = ws_push_task,
															
 
																 	.pop_task = ws_pop_task,
															
 
																+	.pre_exec_hook = NULL,
															
 
																 	.post_exec_hook = NULL,
															
 
																 	.pop_every_task = NULL,
															
 
																 	.policy_name = "ws",