Browse Source

merge policies

Andra Hugo 13 years ago
parent
commit
d3db74a216

+ 126 - 113
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  INRIA
  * Copyright (C) 2011  INRIA
  *
  *
@@ -26,11 +26,6 @@
 #include <core/perfmodel/perfmodel.h>
 #include <core/perfmodel/perfmodel.h>
 #include <starpu_parameters.h>
 #include <starpu_parameters.h>
 
 
-/* #ifdef STARPU_VERBOSE */
-/* static long int total_task_cnt = 0; */
-/* static long int ready_task_cnt = 0; */
-/* #endif */
-
 typedef struct {
 typedef struct {
 	double alpha;
 	double alpha;
 	double beta;
 	double beta;
@@ -47,19 +42,15 @@ typedef struct {
 static int count_non_ready_buffers(struct starpu_task *task, uint32_t node)
 static int count_non_ready_buffers(struct starpu_task *task, uint32_t node)
 {
 {
 	int cnt = 0;
 	int cnt = 0;
-
-	starpu_buffer_descr *descrs = task->buffers;
 	unsigned nbuffers = task->cl->nbuffers;
 	unsigned nbuffers = task->cl->nbuffers;
-
 	unsigned index;
 	unsigned index;
+
 	for (index = 0; index < nbuffers; index++)
 	for (index = 0; index < nbuffers; index++)
 	{
 	{
-		starpu_buffer_descr *descr;
-		starpu_data_handle handle;
+		starpu_data_handle_t handle;
+
+		handle = task->handles[index];
 
 
-		descr = &descrs[index];
-		handle = descr->handle;
-		
 		int is_valid;
 		int is_valid;
 		starpu_data_query_status(handle, node, NULL, &is_valid, NULL);
 		starpu_data_query_status(handle, node, NULL, &is_valid, NULL);
 
 
@@ -70,14 +61,14 @@ static int count_non_ready_buffers(struct starpu_task *task, uint32_t node)
 	return cnt;
 	return cnt;
 }
 }
 
 
-static struct starpu_task *_starpu_fifo_pop_first_ready_task(struct starpu_fifo_taskq_s *fifo_queue, unsigned node)
+static struct starpu_task *_starpu_fifo_pop_first_ready_task(struct _starpu_fifo_taskq *fifo_queue, unsigned node)
 {
 {
 	struct starpu_task *task = NULL, *current;
 	struct starpu_task *task = NULL, *current;
 
 
 	if (fifo_queue->ntasks == 0)
 	if (fifo_queue->ntasks == 0)
 		return NULL;
 		return NULL;
 
 
-	if (fifo_queue->ntasks > 0) 
+	if (fifo_queue->ntasks > 0)
 	{
 	{
 		fifo_queue->ntasks--;
 		fifo_queue->ntasks--;
 
 
@@ -108,12 +99,12 @@ static struct starpu_task *_starpu_fifo_pop_first_ready_task(struct starpu_fifo_
 
 
 			current = current->prev;
 			current = current->prev;
 		}
 		}
-		
+
 		starpu_task_list_erase(&fifo_queue->taskq, task);
 		starpu_task_list_erase(&fifo_queue->taskq, task);
 
 
-		STARPU_TRACE_JOB_POP(task, 0);
+		_STARPU_TRACE_JOB_POP(task, 0);
 	}
 	}
-	
+
 	return task;
 	return task;
 }
 }
 
 
@@ -124,15 +115,15 @@ static struct starpu_task *dmda_pop_ready_task(unsigned sched_ctx_id)
 	struct starpu_task *task;
 	struct starpu_task *task;
 
 
 	int workerid = starpu_worker_get_id();
 	int workerid = starpu_worker_get_id();
-
-	struct starpu_fifo_taskq_s *fifo = dt->queue_array[workerid];
+	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
 
 
 	unsigned node = starpu_worker_get_memory_node(workerid);
 	unsigned node = starpu_worker_get_memory_node(workerid);
 
 
 	task = _starpu_fifo_pop_first_ready_task(fifo, node);
 	task = _starpu_fifo_pop_first_ready_task(fifo, node);
-	if (task) {
+	if (task)
+	{
 		double model = task->predicted;
 		double model = task->predicted;
-	
+
 		fifo->exp_len -= model;
 		fifo->exp_len -= model;
 		fifo->exp_start = starpu_timing_now() + model;
 		fifo->exp_start = starpu_timing_now() + model;
 		fifo->exp_end = fifo->exp_start + fifo->exp_len;
 		fifo->exp_end = fifo->exp_start + fifo->exp_len;
@@ -159,12 +150,13 @@ static struct starpu_task *dmda_pop_task(unsigned sched_ctx_id)
 	struct starpu_task *task;
 	struct starpu_task *task;
 
 
 	int workerid = starpu_worker_get_id();
 	int workerid = starpu_worker_get_id();
-	struct starpu_fifo_taskq_s *fifo = dt->queue_array[workerid];
+	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
 
 
-	task = _starpu_fifo_pop_task(fifo, -1);
-	if (task) {
+	task = _starpu_fifo_pop_task(fifo, workerid);
+	if (task)
+	{
 		double model = task->predicted;
 		double model = task->predicted;
-	
+
 		fifo->exp_len -= model;
 		fifo->exp_len -= model;
 		fifo->exp_start = starpu_timing_now() + model;
 		fifo->exp_start = starpu_timing_now() + model;
 		fifo->exp_end = fifo->exp_start + fifo->exp_len;
 		fifo->exp_end = fifo->exp_start + fifo->exp_len;
@@ -193,7 +185,7 @@ static struct starpu_task *dmda_pop_every_task(unsigned sched_ctx_id)
 	struct starpu_task *new_list;
 	struct starpu_task *new_list;
 
 
 	int workerid = starpu_worker_get_id();
 	int workerid = starpu_worker_get_id();
-	struct starpu_fifo_taskq_s *fifo = dt->queue_array[workerid];
+	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
 
 
 	pthread_mutex_t *sched_mutex;
 	pthread_mutex_t *sched_mutex;
 	pthread_cond_t *sched_cond;
 	pthread_cond_t *sched_cond;
@@ -207,7 +199,7 @@ static struct starpu_task *dmda_pop_every_task(unsigned sched_ctx_id)
 		fifo->exp_len -= model;
 		fifo->exp_len -= model;
 		fifo->exp_start = starpu_timing_now() + model;
 		fifo->exp_start = starpu_timing_now() + model;
 		fifo->exp_end = fifo->exp_start + fifo->exp_len;
 		fifo->exp_end = fifo->exp_start + fifo->exp_len;
-	
+
 		new_list = new_list->next;
 		new_list = new_list->next;
 	}
 	}
 
 
@@ -215,13 +207,13 @@ static struct starpu_task *dmda_pop_every_task(unsigned sched_ctx_id)
 }
 }
 
 
 static
 static
-int _starpu_fifo_push_sorted_task(struct starpu_fifo_taskq_s *fifo_queue, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task)
+int _starpu_fifo_push_sorted_task(struct _starpu_fifo_taskq *fifo_queue, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task)
 {
 {
 	struct starpu_task_list *list = &fifo_queue->taskq;
 	struct starpu_task_list *list = &fifo_queue->taskq;
 
 
-	PTHREAD_MUTEX_LOCK(sched_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
 
 
-	STARPU_TRACE_JOB_PUSH(task, 0);
+	_STARPU_TRACE_JOB_PUSH(task, 0);
 
 
 	if (list->head == NULL)
 	if (list->head == NULL)
 	{
 	{
@@ -230,7 +222,8 @@ int _starpu_fifo_push_sorted_task(struct starpu_fifo_taskq_s *fifo_queue, pthrea
 		task->prev = NULL;
 		task->prev = NULL;
 		task->next = NULL;
 		task->next = NULL;
 	}
 	}
-	else {
+	else
+	{
 		struct starpu_task *current = list->head;
 		struct starpu_task *current = list->head;
 		struct starpu_task *prev = NULL;
 		struct starpu_task *prev = NULL;
 
 
@@ -251,7 +244,8 @@ int _starpu_fifo_push_sorted_task(struct starpu_fifo_taskq_s *fifo_queue, pthrea
 			task->next = list->head;
 			task->next = list->head;
 			list->head = task;
 			list->head = task;
 		}
 		}
-		else {
+		else
+		{
 			if (current)
 			if (current)
 			{
 			{
 				/* Insert between prev and current */
 				/* Insert between prev and current */
@@ -260,7 +254,8 @@ int _starpu_fifo_push_sorted_task(struct starpu_fifo_taskq_s *fifo_queue, pthrea
 				task->next = current;
 				task->next = current;
 				current->prev = task;
 				current->prev = task;
 			}
 			}
-			else {
+			else
+			{
 				/* Insert at the tail of the list */
 				/* Insert at the tail of the list */
 				list->tail->next = task;
 				list->tail->next = task;
 				task->next = NULL;
 				task->next = NULL;
@@ -273,8 +268,8 @@ int _starpu_fifo_push_sorted_task(struct starpu_fifo_taskq_s *fifo_queue, pthrea
 	fifo_queue->ntasks++;
 	fifo_queue->ntasks++;
 	fifo_queue->nprocessed++;
 	fifo_queue->nprocessed++;
 
 
-	PTHREAD_COND_SIGNAL(sched_cond);
-	PTHREAD_MUTEX_UNLOCK(sched_mutex);
+	_STARPU_PTHREAD_COND_SIGNAL(sched_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
 
 
 	return 0;
 	return 0;
 }
 }
@@ -287,7 +282,7 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 	/* make sure someone could execute that task ! */
 	/* make sure someone could execute that task ! */
 	STARPU_ASSERT(best_workerid != -1);
 	STARPU_ASSERT(best_workerid != -1);
 
 
-	struct starpu_fifo_taskq_s *fifo;
+	struct _starpu_fifo_taskq *fifo;
 	fifo = dt->queue_array[best_workerid];
 	fifo = dt->queue_array[best_workerid];
 
 
 	fifo->exp_end += predicted;
 	fifo->exp_end += predicted;
@@ -295,6 +290,8 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
 
 	task->predicted = predicted;
 	task->predicted = predicted;
 
 
+	/* TODO predicted_transfer */
+
 	unsigned memory_node = starpu_worker_get_memory_node(best_workerid);
 	unsigned memory_node = starpu_worker_get_memory_node(best_workerid);
 
 
 	if (starpu_get_prefetch_flag())
 	if (starpu_get_prefetch_flag())
@@ -311,11 +308,12 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 			sched_mutex, sched_cond, task);
 			sched_mutex, sched_cond, task);
 }
 }
 
 
+/* TODO: factorize with dmda!! */
 static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched_ctx_id)
 static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched_ctx_id)
 {
 {
 	dmda_data *dt = (dmda_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
 	dmda_data *dt = (dmda_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
 	/* find the queue */
 	/* find the queue */
-	struct starpu_fifo_taskq_s *fifo;
+	struct _starpu_fifo_taskq *fifo;
 	unsigned worker, worker_ctx;
 	unsigned worker, worker_ctx;
 	int best = -1;
 	int best = -1;
 
 
@@ -346,7 +344,7 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 			fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
 			fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
 			fifo->exp_end = fifo->exp_start + fifo->exp_len;
 			fifo->exp_end = fifo->exp_start + fifo->exp_len;
 
 
-			if (!starpu_worker_may_execute_task(worker, task, nimpl))
+			if (!starpu_worker_can_execute_task(worker, task, nimpl))
 			{
 			{
 				/* no one on that queue may execute this task */
 				/* no one on that queue may execute this task */
 				continue;
 				continue;
@@ -359,21 +357,23 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 			//_STARPU_DEBUG("Scheduler dm: task length (%lf) worker (%u) kernel (%u) \n", local_length,worker,nimpl);
 			//_STARPU_DEBUG("Scheduler dm: task length (%lf) worker (%u) kernel (%u) \n", local_length,worker,nimpl);
 
 
 			if (ntasks_best == -1
 			if (ntasks_best == -1
-					|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
-					|| (!calibrating && local_length == -1.0) /* Not calibrating but this worker is being calibrated */
-					|| (calibrating && local_length == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
-					) {
+			    || (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
+			    || (!calibrating && isnan(local_length)) /* Not calibrating but this worker is being calibrated */
+			    || (calibrating && isnan(local_length) && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
+				)
+			{
 				ntasks_best_end = ntasks_end;
 				ntasks_best_end = ntasks_end;
 				ntasks_best = worker;
 				ntasks_best = worker;
+				best_impl = nimpl;
 			}
 			}
 
 
-			if (local_length == -1.0)
+			if (isnan(local_length))
 				/* we are calibrating, we want to speed-up calibration time
 				/* we are calibrating, we want to speed-up calibration time
 				 * so we privilege non-calibrated tasks (but still
 				 * so we privilege non-calibrated tasks (but still
 				 * greedily distribute them to avoid dumb schedules) */
 				 * greedily distribute them to avoid dumb schedules) */
 				calibrating = 1;
 				calibrating = 1;
 
 
-			if (local_length <= 0.0)
+			if (isnan(local_length) || _STARPU_IS_ZERO(local_length))
 				/* there is no prediction available for that task
 				/* there is no prediction available for that task
 				 * with that arch yet, so switch to a greedy strategy */
 				 * with that arch yet, so switch to a greedy strategy */
 				unknown = 1;
 				unknown = 1;
@@ -394,7 +394,8 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 		}
 		}
 	}
 	}
 
 
-	if (unknown) {
+	if (unknown)
+	{
 		best = ntasks_best;
 		best = ntasks_best;
 		model_best = 0.0;
 		model_best = 0.0;
 	}
 	}
@@ -403,7 +404,7 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 
 
 	//_STARPU_DEBUG("Scheduler dm: kernel (%u)\n", best_impl);
 	//_STARPU_DEBUG("Scheduler dm: kernel (%u)\n", best_impl);
 
 
-	 _starpu_get_job_associated_to_task(task)->nimpl = 0;//best_impl;
+	 _starpu_get_job_associated_to_task(task)->nimpl = best_impl;
 
 
 	/* we should now have the best worker in variable "best" */
 	/* we should now have the best worker in variable "best" */
 	return push_task_on_best_worker(task, best, model_best, prio, sched_ctx_id);
 	return push_task_on_best_worker(task, best, model_best, prio, sched_ctx_id);
@@ -413,7 +414,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 {
 {
 	dmda_data *dt = (dmda_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
 	dmda_data *dt = (dmda_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
 	/* find the queue */
 	/* find the queue */
-	struct starpu_fifo_taskq_s *fifo;
+	struct _starpu_fifo_taskq *fifo;
 	unsigned worker, worker_ctx;
 	unsigned worker, worker_ctx;
 	int best = -1, best_ctx = -1;
 	int best = -1, best_ctx = -1;
 	
 	
@@ -423,13 +424,13 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 
 
 	unsigned nworkers_ctx = starpu_get_nworkers_of_ctx(sched_ctx_id);
 	unsigned nworkers_ctx = starpu_get_nworkers_of_ctx(sched_ctx_id);
 	int *workerids = starpu_get_workers_of_ctx(sched_ctx_id);
 	int *workerids = starpu_get_workers_of_ctx(sched_ctx_id);
-	double local_task_length[nworkers_ctx];
-	double local_data_penalty[nworkers_ctx];
-	double local_power[nworkers_ctx];
-	double exp_end[nworkers_ctx];
+	double local_task_length[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
+	double local_data_penalty[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
+	double local_power[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
+	double exp_end[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
 	double max_exp_end = 0.0;
 	double max_exp_end = 0.0;
 
 
-	double fitness[nworkers_ctx];
+	double fitness[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
 
 
 	double best_exp_end = 10e240;
 	double best_exp_end = 10e240;
 	double model_best = 0.0;
 	double model_best = 0.0;
@@ -438,7 +439,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 	int ntasks_best = -1;
 	int ntasks_best = -1;
 	double ntasks_best_end = 0.0;
 	double ntasks_best_end = 0.0;
 	int calibrating = 0;
 	int calibrating = 0;
-	
+
 	/* A priori, we know all estimations */
 	/* A priori, we know all estimations */
 	int unknown = 0;
 	int unknown = 0;
 
 
@@ -457,105 +458,109 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 			if (fifo->exp_end > max_exp_end)
 			if (fifo->exp_end > max_exp_end)
 				max_exp_end = fifo->exp_end;
 				max_exp_end = fifo->exp_end;
 
 
-			if (!starpu_worker_may_execute_task(worker, task, nimpl))
+			if (!starpu_worker_can_execute_task(worker, task, nimpl))
 			{
 			{
 				/* no one on that queue may execute this task */
 				/* no one on that queue may execute this task */
 				continue;
 				continue;
 			}
 			}
 
 
 			enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
 			enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
-			local_task_length[worker_ctx] = starpu_task_expected_length(task, perf_arch, nimpl);
+			local_task_length[worker_ctx][nimpl] = starpu_task_expected_length(task, perf_arch, nimpl);
 
 
 			//_STARPU_DEBUG("Scheduler dmda: task length (%lf) worker (%u) kernel (%u) \n", local_task_length[worker],worker,nimpl);
 			//_STARPU_DEBUG("Scheduler dmda: task length (%lf) worker (%u) kernel (%u) \n", local_task_length[worker],worker,nimpl);
 
 
 			unsigned memory_node = starpu_worker_get_memory_node(worker);
 			unsigned memory_node = starpu_worker_get_memory_node(worker);
-			local_data_penalty[worker_ctx] = starpu_task_expected_data_transfer_time(memory_node, task);
+			local_data_penalty[worker_ctx][nimpl] = starpu_task_expected_data_transfer_time(memory_node, task);
 
 
 			double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
 			double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
 
 
 			if (ntasks_best == -1
 			if (ntasks_best == -1
-					|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
-					|| (!calibrating && local_task_length[worker] == -1.0) /* Not calibrating but this worker is being calibrated */
-					|| (calibrating && local_task_length[worker] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
-					) {
+			    || (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
+			    || (!calibrating && isnan(local_task_length[worker_ctx][nimpl])) /* Not calibrating but this worker is being calibrated */
+			    || (calibrating && isnan(local_task_length[worker_ctx][nimpl]) && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
+				)
+			{
 				ntasks_best_end = ntasks_end;
 				ntasks_best_end = ntasks_end;
 				ntasks_best = worker;
 				ntasks_best = worker;
-
+				best_impl = nimpl;
 			}
 			}
 
 
-			if (local_task_length[worker_ctx] == -1.0)
+			if (isnan(local_task_length[worker_ctx][nimpl]))
 				/* we are calibrating, we want to speed-up calibration time
 				/* we are calibrating, we want to speed-up calibration time
-			 	* so we privilege non-calibrated tasks (but still
-			 	* greedily distribute them to avoid dumb schedules) */
+				 * so we privilege non-calibrated tasks (but still
+				 * greedily distribute them to avoid dumb schedules) */
 				calibrating = 1;
 				calibrating = 1;
 
 
-			if (local_task_length[worker_ctx] <= 0.0)
+			if (isnan(local_task_length[worker_ctx][nimpl])
+					|| _STARPU_IS_ZERO(local_task_length[worker_ctx][nimpl]))
 				/* there is no prediction available for that task
 				/* there is no prediction available for that task
-			 	* with that arch yet, so switch to a greedy strategy */
+				 * with that arch yet, so switch to a greedy strategy */
 				unknown = 1;
 				unknown = 1;
 
 
 			if (unknown)
 			if (unknown)
 					continue;
 					continue;
 
 
-			exp_end[worker_ctx] = fifo->exp_start + fifo->exp_len + local_task_length[worker_ctx];
+			exp_end[worker_ctx][nimpl] = fifo->exp_start + fifo->exp_len + local_task_length[worker_ctx][nimpl];
 
 
-			if (exp_end[worker_ctx] < best_exp_end)
+			if (exp_end[worker_ctx][nimpl] < best_exp_end)
 			{
 			{
 				/* a better solution was found */
 				/* a better solution was found */
-				best_exp_end = exp_end[worker_ctx];
+				best_exp_end = exp_end[worker_ctx][nimpl];
 				best_impl = nimpl;
 				best_impl = nimpl;
-
 			}
 			}
 
 
-			local_power[worker_ctx] = starpu_task_expected_power(task, perf_arch, nimpl);
-			if (local_power[worker_ctx] == -1.0)
-				local_power[worker_ctx] = 0.;
-			}	
-		}
+			local_power[worker_ctx][nimpl] = starpu_task_expected_power(task, perf_arch, nimpl);
+			if (isnan(local_power[worker_ctx][nimpl]))
+				local_power[worker_ctx][nimpl] = 0.;
 
 
-		if (unknown)
-			forced_best = ntasks_best;
+		 }
+	}
+
+	if (unknown)
+		forced_best = ntasks_best;
 
 
-		double best_fitness = -1;
+	double best_fitness = -1;
 	
 	
-		if (forced_best == -1)
+	if (forced_best == -1)
+	{
+		for (worker_ctx = 0; worker_ctx < nworkers_ctx; worker_ctx++)
 		{
 		{
-	        for (worker_ctx = 0; worker_ctx < nworkers_ctx; worker_ctx++)
-	        {
-		        worker = workerids[worker_ctx];
+			worker = workerids[worker_ctx];
+			for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
+			{	
+				if (!starpu_worker_can_execute_task(worker, task, nimpl))
+				{
+					/* no one on that queue may execute this task */
+					continue;
+				}
 
 
-			fifo = dt->queue_array[worker];
+					fifo = dt->queue_array[worker];
 	
 	
-			if (!starpu_worker_may_execute_task(worker, task, 0))
-			{
-				/* no one on that queue may execute this task */
-				continue;
-			}
-	
-			fitness[worker_ctx] = dt->alpha*(exp_end[worker_ctx] - best_exp_end) 
-					+ dt->beta*(local_data_penalty[worker_ctx])
-					+ dt->_gamma*(local_power[worker_ctx]);
+					fitness[worker_ctx][nimpl] = dt->alpha*(exp_end[worker_ctx][nimpl] - best_exp_end) 
+					+ dt->beta*(local_data_penalty[worker_ctx][nimpl])
+					+ dt->_gamma*(local_power[worker_ctx][nimpl]);
 
 
-			if (exp_end[worker_ctx] > max_exp_end)
+			if (exp_end[worker_ctx][nimpl] > max_exp_end)
 				/* This placement will make the computation
 				/* This placement will make the computation
 				 * longer, take into account the idle
 				 * longer, take into account the idle
 				 * consumption of other cpus */
 				 * consumption of other cpus */
-				fitness[worker_ctx] += dt->_gamma * dt->idle_power * (exp_end[worker_ctx] - max_exp_end) / 1000000.0;
+				fitness[worker_ctx][nimpl] += dt->_gamma * dt->idle_power * (exp_end[worker_ctx][nimpl] - max_exp_end) / 1000000.0;
 
 
-			if (best == -1 || fitness[worker_ctx] < best_fitness)
+			if (best == -1 || fitness[worker_ctx][nimpl] < best_fitness)
 			{
 			{
 				/* we found a better solution */
 				/* we found a better solution */
-				best_fitness = fitness[worker_ctx];
+				best_fitness = fitness[worker_ctx][nimpl];
 				best = worker;
 				best = worker;
 				best_ctx = worker_ctx;
 				best_ctx = worker_ctx;
+				best_impl = nimpl;
 
 
-	//			_STARPU_DEBUG("best fitness (worker %d) %e = alpha*(%e) + beta(%e) +gamma(%e)\n", worker, best_fitness, exp_end[worker] - best_exp_end, local_data_penalty[worker], local_power[worker]);
+				//			_STARPU_DEBUG("best fitness (worker %d) %e = alpha*(%e) + beta(%e) +gamma(%e)\n", worker, best_fitness, exp_end[worker][nimpl] - best_exp_end, local_data_penalty[worker][nimpl], local_power[worker][nimpl]);
 			}
 			}
 		}
 		}
 	}
 	}
 
 
 	STARPU_ASSERT(forced_best != -1 || best != -1);
 	STARPU_ASSERT(forced_best != -1 || best != -1);
-	
+
 	if (forced_best != -1)
 	if (forced_best != -1)
 	{
 	{
 		/* there is no prediction available for that task
 		/* there is no prediction available for that task
@@ -565,10 +570,10 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 		model_best = 0.0;
 		model_best = 0.0;
 		//penality_best = 0.0;
 		//penality_best = 0.0;
 	}
 	}
-	else 
+	else
 	{
 	{
-		model_best = local_task_length[best];
-		//penality_best = local_data_penalty[best];
+		model_best = local_task_length[best_ctx][best_impl];
+		//penality_best = local_data_penalty[best_ctx][best_impl];
 	}
 	}
 
 
 
 
@@ -581,7 +586,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 
 
 static int dmda_push_sorted_task(struct starpu_task *task, unsigned sched_ctx_id)
 static int dmda_push_sorted_task(struct starpu_task *task, unsigned sched_ctx_id)
 {
 {
-	return _dmda_push_task(task, 2, sched_ctx_id);
+	return _dmda_push_task(task, 1, sched_ctx_id);
 }
 }
 
 
 static int dm_push_task(struct starpu_task *task, unsigned sched_ctx_id)
 static int dm_push_task(struct starpu_task *task, unsigned sched_ctx_id)
@@ -680,11 +685,13 @@ static void deinitialize_dmda_policy(unsigned sched_ctx_id)
 }
 }
 
 
 /* TODO: use post_exec_hook to fix the expected start */
 /* TODO: use post_exec_hook to fix the expected start */
-struct starpu_sched_policy_s _starpu_sched_dm_policy = {
+struct starpu_sched_policy _starpu_sched_dm_policy =
+{
 	.init_sched = initialize_dmda_policy,
 	.init_sched = initialize_dmda_policy,
 	.deinit_sched = deinitialize_dmda_policy,
 	.deinit_sched = deinitialize_dmda_policy,
-	.push_task = dm_push_task, 
+	.push_task = dm_push_task,
 	.pop_task = dmda_pop_task,
 	.pop_task = dmda_pop_task,
+	.pre_exec_hook = NULL,
 	.post_exec_hook = NULL,
 	.post_exec_hook = NULL,
 	.pop_every_task = dmda_pop_every_task,
 	.pop_every_task = dmda_pop_every_task,
 	.policy_name = "dm",
 	.policy_name = "dm",
@@ -692,11 +699,13 @@ struct starpu_sched_policy_s _starpu_sched_dm_policy = {
 	.init_sched_for_workers = initialize_dmda_policy_for_workers
 	.init_sched_for_workers = initialize_dmda_policy_for_workers
 };
 };
 
 
-struct starpu_sched_policy_s _starpu_sched_dmda_policy = {
+struct starpu_sched_policy _starpu_sched_dmda_policy =
+{
 	.init_sched = initialize_dmda_policy,
 	.init_sched = initialize_dmda_policy,
 	.deinit_sched = deinitialize_dmda_policy,
 	.deinit_sched = deinitialize_dmda_policy,
-	.push_task = dmda_push_task, 
+	.push_task = dmda_push_task,
 	.pop_task = dmda_pop_task,
 	.pop_task = dmda_pop_task,
+	.pre_exec_hook = NULL,
 	.post_exec_hook = NULL,
 	.post_exec_hook = NULL,
 	.pop_every_task = dmda_pop_every_task,
 	.pop_every_task = dmda_pop_every_task,
 	.policy_name = "dmda",
 	.policy_name = "dmda",
@@ -704,11 +713,13 @@ struct starpu_sched_policy_s _starpu_sched_dmda_policy = {
 	.init_sched_for_workers = initialize_dmda_policy_for_workers
 	.init_sched_for_workers = initialize_dmda_policy_for_workers
 };
 };
 
 
-struct starpu_sched_policy_s _starpu_sched_dmda_sorted_policy = {
+struct starpu_sched_policy _starpu_sched_dmda_sorted_policy =
+{
 	.init_sched = initialize_dmda_sorted_policy,
 	.init_sched = initialize_dmda_sorted_policy,
 	.deinit_sched = deinitialize_dmda_policy,
 	.deinit_sched = deinitialize_dmda_policy,
-	.push_task = dmda_push_sorted_task, 
+	.push_task = dmda_push_sorted_task,
 	.pop_task = dmda_pop_ready_task,
 	.pop_task = dmda_pop_ready_task,
+	.pre_exec_hook = NULL,
 	.post_exec_hook = NULL,
 	.post_exec_hook = NULL,
 	.pop_every_task = dmda_pop_every_task,
 	.pop_every_task = dmda_pop_every_task,
 	.policy_name = "dmdas",
 	.policy_name = "dmdas",
@@ -716,11 +727,13 @@ struct starpu_sched_policy_s _starpu_sched_dmda_sorted_policy = {
 	.init_sched_for_workers = initialize_dmda_policy_for_workers
 	.init_sched_for_workers = initialize_dmda_policy_for_workers
 };
 };
 
 
-struct starpu_sched_policy_s _starpu_sched_dmda_ready_policy = {
+struct starpu_sched_policy _starpu_sched_dmda_ready_policy =
+{
 	.init_sched = initialize_dmda_policy,
 	.init_sched = initialize_dmda_policy,
 	.deinit_sched = deinitialize_dmda_policy,
 	.deinit_sched = deinitialize_dmda_policy,
-	.push_task = dmda_push_task, 
+	.push_task = dmda_push_task,
 	.pop_task = dmda_pop_ready_task,
 	.pop_task = dmda_pop_ready_task,
+	.pre_exec_hook = NULL,
 	.post_exec_hook = NULL,
 	.post_exec_hook = NULL,
 	.pop_every_task = dmda_pop_every_task,
 	.pop_every_task = dmda_pop_every_task,
 	.policy_name = "dmdar",
 	.policy_name = "dmdar",

+ 48 - 42
src/sched_policies/deque_queues.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  Télécom-SudParis
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -25,13 +25,13 @@
 #include <errno.h>
 #include <errno.h>
 #include <common/utils.h>
 #include <common/utils.h>
 
 
-struct starpu_deque_jobq_s *_starpu_create_deque(void)
+struct _starpu_deque_jobq *_starpu_create_deque(void)
 {
 {
-	struct starpu_deque_jobq_s *deque;
-	deque = (struct starpu_deque_jobq_s *) malloc(sizeof(struct starpu_deque_jobq_s));
+	struct _starpu_deque_jobq *deque;
+	deque = (struct _starpu_deque_jobq *) malloc(sizeof(struct _starpu_deque_jobq));
 
 
 	/* note that not all mechanisms (eg. the semaphore) have to be used */
 	/* note that not all mechanisms (eg. the semaphore) have to be used */
-	deque->jobq = starpu_job_list_new();
+	deque->jobq = _starpu_job_list_new();
 	deque->njobs = 0;
 	deque->njobs = 0;
 	deque->nprocessed = 0;
 	deque->nprocessed = 0;
 
 
@@ -42,25 +42,25 @@ struct starpu_deque_jobq_s *_starpu_create_deque(void)
 	return deque;
 	return deque;
 }
 }
 
 
-void _starpu_destroy_deque(struct starpu_deque_jobq_s *deque)
+void _starpu_destroy_deque(struct _starpu_deque_jobq *deque)
 {
 {
-	starpu_job_list_delete(deque->jobq);
+	_starpu_job_list_delete(deque->jobq);
 	free(deque);
 	free(deque);
 }
 }
 
 
-unsigned _starpu_get_deque_njobs(struct starpu_deque_jobq_s *deque_queue)
+unsigned _starpu_get_deque_njobs(struct _starpu_deque_jobq *deque_queue)
 {
 {
 	return deque_queue->njobs;
 	return deque_queue->njobs;
 }
 }
 
 
-unsigned _starpu_get_deque_nprocessed(struct starpu_deque_jobq_s *deque_queue)
+int _starpu_get_deque_nprocessed(struct _starpu_deque_jobq *deque_queue)
 {
 {
 	return deque_queue->nprocessed;
 	return deque_queue->nprocessed;
 }
 }
 
 
-struct starpu_task *_starpu_deque_pop_task(struct starpu_deque_jobq_s *deque_queue, int workerid __attribute__ ((unused)))
+struct starpu_task *_starpu_deque_pop_task(struct _starpu_deque_jobq *deque_queue, int workerid)
 {
 {
-	starpu_job_t j = NULL;
+	struct _starpu_job *j = NULL;
 
 
 	if ((deque_queue->njobs == 0) && _starpu_machine_is_running())
 	if ((deque_queue->njobs == 0) && _starpu_machine_is_running())
 	{
 	{
@@ -68,66 +68,72 @@ struct starpu_task *_starpu_deque_pop_task(struct starpu_deque_jobq_s *deque_que
 	}
 	}
 
 
 	/* TODO find a task that suits workerid */
 	/* TODO find a task that suits workerid */
-	if (deque_queue->njobs > 0) 
+	for (j  = _starpu_job_list_begin(deque_queue->jobq);
+	     j != _starpu_job_list_end(deque_queue->jobq);
+	     j  = _starpu_job_list_next(j))
 	{
 	{
-		/* there is a task */
-		j = starpu_job_list_pop_front(deque_queue->jobq);
-	
+		unsigned nimpl;
 		STARPU_ASSERT(j);
 		STARPU_ASSERT(j);
-		deque_queue->njobs--;
-		
-		STARPU_TRACE_JOB_POP(j, 0);
+
+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
+			if (starpu_worker_can_execute_task(workerid, j->task, nimpl))
+			{
+				j->nimpl = nimpl;
+				j = _starpu_job_list_pop_front(deque_queue->jobq);
+				_STARPU_TRACE_JOB_POP(j, 0);
+				return j->task;
+			}
 	}
 	}
-	
-	return j->task;
+
+	return NULL;
 }
 }
 
 
-struct starpu_job_list_s *_starpu_deque_pop_every_task(struct starpu_deque_jobq_s *deque_queue, pthread_mutex_t *sched_mutex, int workerid)
+struct _starpu_job_list *_starpu_deque_pop_every_task(struct _starpu_deque_jobq *deque_queue, pthread_mutex_t *sched_mutex, int workerid)
 {
 {
-	struct starpu_job_list_s *new_list, *old_list;
+	struct _starpu_job_list *new_list, *old_list;
 
 
 	/* block until some task is available in that queue */
 	/* block until some task is available in that queue */
-	PTHREAD_MUTEX_LOCK(sched_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
 
 
 	if (deque_queue->njobs == 0)
 	if (deque_queue->njobs == 0)
 	{
 	{
 		new_list = NULL;
 		new_list = NULL;
 	}
 	}
-	else {
+	else
+	{
 		/* there is a task */
 		/* there is a task */
 		old_list = deque_queue->jobq;
 		old_list = deque_queue->jobq;
-		new_list = starpu_job_list_new();
+		new_list = _starpu_job_list_new();
 
 
 		unsigned new_list_size = 0;
 		unsigned new_list_size = 0;
 
 
-		starpu_job_itor_t i;
-		starpu_job_t next_job;
+		struct _starpu_job *i;
+		struct _starpu_job *next_job;
 		/* note that this starts at the _head_ of the list, so we put
 		/* note that this starts at the _head_ of the list, so we put
  		 * elements at the back of the new list */
  		 * elements at the back of the new list */
-		for(i = starpu_job_list_begin(old_list);
-			i != starpu_job_list_end(old_list);
+		for(i = _starpu_job_list_begin(old_list);
+			i != _starpu_job_list_end(old_list);
 			i  = next_job)
 			i  = next_job)
 		{
 		{
-			next_job = starpu_job_list_next(i);
+			unsigned nimpl;
+			next_job = _starpu_job_list_next(i);
 
 
-			/* In case there are multiples implementations of the
- 			 * codelet for a single device, We dont really care
-			 * about the implementation used, so let's try the 
-			 * first one. */
-			if (starpu_worker_may_execute_task(workerid, i->task, 0))
+			for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
+			if (starpu_worker_can_execute_task(workerid, i->task, nimpl))
 			{
 			{
 				/* this element can be moved into the new list */
 				/* this element can be moved into the new list */
 				new_list_size++;
 				new_list_size++;
-				
-				starpu_job_list_erase(old_list, i);
-				starpu_job_list_push_back(new_list, i);
+
+				_starpu_job_list_erase(old_list, i);
+				_starpu_job_list_push_back(new_list, i);
+				i->nimpl = nimpl;
 			}
 			}
 		}
 		}
 
 
 		if (new_list_size == 0)
 		if (new_list_size == 0)
 		{
 		{
 			/* the new list is empty ... */
 			/* the new list is empty ... */
-			starpu_job_list_delete(new_list);
+			_starpu_job_list_delete(new_list);
 			new_list = NULL;
 			new_list = NULL;
 		}
 		}
 		else
 		else
@@ -135,8 +141,8 @@ struct starpu_job_list_s *_starpu_deque_pop_every_task(struct starpu_deque_jobq_
 			deque_queue->njobs -= new_list_size;
 			deque_queue->njobs -= new_list_size;
 		}
 		}
 	}
 	}
-	
-	PTHREAD_MUTEX_UNLOCK(sched_mutex);
+
+	_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
 
 
 	return new_list;
 	return new_list;
 }
 }

+ 10 - 9
src/sched_policies/deque_queues.h

@@ -23,15 +23,16 @@
 #include <common/config.h>
 #include <common/config.h>
 #include <core/jobs.h>
 #include <core/jobs.h>
 
 
-struct starpu_deque_jobq_s {
+struct _starpu_deque_jobq
+{
 	/* the actual list */
 	/* the actual list */
-	starpu_job_list_t jobq;
+	struct _starpu_job_list *jobq;
 
 
 	/* the number of tasks currently in the queue */
 	/* the number of tasks currently in the queue */
 	unsigned njobs;
 	unsigned njobs;
 
 
 	/* the number of tasks that were processed */
 	/* the number of tasks that were processed */
-	unsigned nprocessed;
+	int nprocessed;
 
 
 	/* only meaningful if the queue is only used by a single worker */
 	/* only meaningful if the queue is only used by a single worker */
 	double exp_start; /* Expected start date of first task in the queue */
 	double exp_start; /* Expected start date of first task in the queue */
@@ -39,14 +40,14 @@ struct starpu_deque_jobq_s {
 	double exp_len; /* Expected duration of the set of tasks in the queue */
 	double exp_len; /* Expected duration of the set of tasks in the queue */
 };
 };
 
 
-struct starpu_deque_jobq_s *_starpu_create_deque(void);
-void _starpu_destroy_deque(struct starpu_deque_jobq_s *deque);
+struct _starpu_deque_jobq *_starpu_create_deque(void);
+void _starpu_destroy_deque(struct _starpu_deque_jobq *deque);
 
 
-struct starpu_task *_starpu_deque_pop_task(struct starpu_deque_jobq_s *deque_queue, int workerid);
-struct starpu_job_list_s *_starpu_deque_pop_every_task(struct starpu_deque_jobq_s *deque_queue, pthread_mutex_t *sched_mutex, int workerid);
+struct starpu_task *_starpu_deque_pop_task(struct _starpu_deque_jobq *deque_queue, int workerid);
+struct _starpu_job_list *_starpu_deque_pop_every_task(struct _starpu_deque_jobq *deque_queue, pthread_mutex_t *sched_mutex, int workerid);
 
 
-unsigned _starpu_get_deque_njobs(struct starpu_deque_jobq_s *deque_queue);
-unsigned _starpu_get_deque_nprocessed(struct starpu_deque_jobq_s *deque_queue);
+unsigned _starpu_get_deque_njobs(struct _starpu_deque_jobq *deque_queue);
+int _starpu_get_deque_nprocessed(struct _starpu_deque_jobq *deque_queue);
 
 
 
 
 #endif // __DEQUE_QUEUES_H__
 #endif // __DEQUE_QUEUES_H__

+ 37 - 33
src/sched_policies/detect_combined_workers.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2011, 2012       Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -18,12 +19,14 @@
 #include <starpu.h>
 #include <starpu.h>
 #include <common/utils.h>
 #include <common/utils.h>
 #include <core/workers.h>
 #include <core/workers.h>
+#include <math.h>
+#include <sched_policies/detect_combined_workers.h>
 
 
 #ifdef STARPU_HAVE_HWLOC
 #ifdef STARPU_HAVE_HWLOC
 #include <hwloc.h>
 #include <hwloc.h>
 
 
-/* tree_t
- * ======
+/* struct _starpu_tree
+ * ==================
  * Purpose
  * Purpose
  * =======
  * =======
  * Structure representing a tree (which can be a sub-tree itself) whose root is an hwloc
  * Structure representing a tree (which can be a sub-tree itself) whose root is an hwloc
@@ -39,12 +42,12 @@
  * workers		CPU-workers found by recursion in all the sub-trees and in this very one, represented as leaves in hwloc.
  * workers		CPU-workers found by recursion in all the sub-trees and in this very one, represented as leaves in hwloc.
  */
  */
 
 
-typedef struct tree_s{
+struct _starpu_tree
+{
     hwloc_obj_t obj;
     hwloc_obj_t obj;
     unsigned nb_workers;
     unsigned nb_workers;
     int *workers;
     int *workers;
-} tree_t;
-
+};
 
 
 /* gather_trees
 /* gather_trees
  * ============
  * ============
@@ -65,7 +68,7 @@ typedef struct tree_s{
  *			Number of trees we want to combine (size of the array).
  *			Number of trees we want to combine (size of the array).
  */
  */
 
 
-static void gather_trees(tree_t *target_tree, tree_t *source_trees, unsigned nb_source_trees)
+static void gather_trees(struct _starpu_tree *target_tree, struct _starpu_tree *source_trees, unsigned nb_source_trees)
 {
 {
     unsigned tree_id, worker_id, index = 0;
     unsigned tree_id, worker_id, index = 0;
     for(tree_id = 0; tree_id < nb_source_trees; ++tree_id)
     for(tree_id = 0; tree_id < nb_source_trees; ++tree_id)
@@ -99,7 +102,7 @@ static void gather_trees(tree_t *target_tree, tree_t *source_trees, unsigned nb_
  *			Maximum size of a combined worker.
  *			Maximum size of a combined worker.
  */
  */
 
 
-static unsigned assign_multiple_trees(tree_t *trees, unsigned nb_trees, int min_size, int max_size)
+static unsigned assign_multiple_trees(struct _starpu_tree *trees, unsigned nb_trees, unsigned int min_size, unsigned int max_size)
 {
 {
     unsigned short complete = 0;
     unsigned short complete = 0;
     unsigned tree_id, tree_id2, nb_workers_tree, nb_workers_tree2, worker_id, nb_workers_total = 0, nb_workers_assigned = 0;
     unsigned tree_id, tree_id2, nb_workers_tree, nb_workers_tree2, worker_id, nb_workers_total = 0, nb_workers_assigned = 0;
@@ -197,19 +200,19 @@ static unsigned assign_multiple_trees(tree_t *trees, unsigned nb_trees, int min_
  *			Maximum size of a combined worker.
  *			Maximum size of a combined worker.
  */
  */
 
 
-static unsigned find_and_assign_combinations_with_hwloc_recursive(tree_t *tree, int min_size, int max_size)
+static unsigned find_and_assign_combinations_with_hwloc_recursive(struct _starpu_tree *tree, unsigned int min_size, unsigned int max_size)
 {
 {
     unsigned subtree_id, nb_workers = 0;
     unsigned subtree_id, nb_workers = 0;
 
 
     hwloc_obj_t obj = tree->obj;
     hwloc_obj_t obj = tree->obj;
     int *workers = tree->workers;
     int *workers = tree->workers;
 
 
-    struct starpu_machine_config_s *config = _starpu_get_machine_config();
+    struct _starpu_machine_config *config = _starpu_get_machine_config();
 
 
     /* Is this a leaf ? (eg. a PU for hwloc) */
     /* Is this a leaf ? (eg. a PU for hwloc) */
     if (!hwloc_compare_types(config->cpu_depth, obj->depth))
     if (!hwloc_compare_types(config->cpu_depth, obj->depth))
     {
     {
-	struct starpu_worker_s *worker = obj->userdata;
+	struct _starpu_worker *worker = obj->userdata;
 
 
 	/* If this is a CPU worker add it at the beginning
 	/* If this is a CPU worker add it at the beginning
 	 * of the array , write 1 in the field nb_workers and
 	 * of the array , write 1 in the field nb_workers and
@@ -229,7 +232,7 @@ static unsigned find_and_assign_combinations_with_hwloc_recursive(tree_t *tree,
     /* If there is only one child, we go to the next level right away */
     /* If there is only one child, we go to the next level right away */
     if (obj->arity == 1)
     if (obj->arity == 1)
     {
     {
-	tree_t subtree = *tree;
+	struct _starpu_tree subtree = *tree;
 	subtree.obj = obj->children[0];
 	subtree.obj = obj->children[0];
 	nb_workers = find_and_assign_combinations_with_hwloc_recursive(&subtree, min_size, max_size);
 	nb_workers = find_and_assign_combinations_with_hwloc_recursive(&subtree, min_size, max_size);
 	tree->nb_workers = nb_workers;
 	tree->nb_workers = nb_workers;
@@ -240,12 +243,12 @@ static unsigned find_and_assign_combinations_with_hwloc_recursive(tree_t *tree,
      * CPU leaves that fits between min and max. */
      * CPU leaves that fits between min and max. */
 
 
     /* We allocate an array of tree structures which will contain the current node's subtrees data */
     /* We allocate an array of tree structures which will contain the current node's subtrees data */
-    tree_t *subtrees = (tree_t *) malloc(obj->arity * sizeof(tree_t));
+    struct _starpu_tree *subtrees = (struct _starpu_tree *) malloc(obj->arity * sizeof(struct _starpu_tree));
 
 
     /* We allocate the array containing the workers of each subtree and initialize the fields left */
     /* We allocate the array containing the workers of each subtree and initialize the fields left */
     for(subtree_id = 0; subtree_id < obj->arity; ++subtree_id)
     for(subtree_id = 0; subtree_id < obj->arity; ++subtree_id)
     {
     {
-	tree_t *subtree = subtrees + subtree_id;
+	struct _starpu_tree *subtree = subtrees + subtree_id;
 
 
 	subtree->obj = obj->children[subtree_id];
 	subtree->obj = obj->children[subtree_id];
 	subtree->nb_workers = 0;
 	subtree->nb_workers = 0;
@@ -317,7 +320,7 @@ static unsigned find_and_assign_combinations_with_hwloc_recursive(tree_t *tree,
  *			Topology of the machine : used to know the number of cpus.
  *			Topology of the machine : used to know the number of cpus.
  */
  */
 
 
-static void get_min_max_sizes(int *min_size, int *max_size, struct starpu_machine_topology_s *topology)
+static void get_min_max_sizes(unsigned int *min_size, unsigned int *max_size, struct starpu_machine_topology *topology)
 {
 {
     int _min_size, _max_size;
     int _min_size, _max_size;
 
 
@@ -330,8 +333,8 @@ static void get_min_max_sizes(int *min_size, int *max_size, struct starpu_machin
     {
     {
 
 
 	int nb_cpus = topology->nhwcpus;
 	int nb_cpus = topology->nhwcpus;
-	int sqrt_nb_cpus = sqrt(nb_cpus);
-	short exact = (sqrt_nb_cpus * sqrt_nb_cpus == nb_cpus);
+	int sqrt_nb_cpus = (int)sqrt((double)nb_cpus);
+	int exact = (sqrt_nb_cpus * sqrt_nb_cpus == nb_cpus);
 
 
 	    if(_min_size == -1)
 	    if(_min_size == -1)
 	    {
 	    {
@@ -373,19 +376,19 @@ static void get_min_max_sizes(int *min_size, int *max_size, struct starpu_machin
  *			to get the hwloc tree.
  *			to get the hwloc tree.
  */
  */
 
 
-static void find_and_assign_combinations_with_hwloc(struct starpu_machine_topology_s *topology)
+static void find_and_assign_combinations_with_hwloc(struct starpu_machine_topology *topology)
 {
 {
     unsigned nb_workers;
     unsigned nb_workers;
-    int min_size, max_size;
+    unsigned int min_size, max_size;
 
 
     get_min_max_sizes(&min_size, &max_size, topology);
     get_min_max_sizes(&min_size, &max_size, topology);
 
 
     STARPU_ASSERT(min_size <= max_size);
     STARPU_ASSERT(min_size <= max_size);
 
 
-    tree_t tree;
+    struct _starpu_tree tree;
 
 
     /* Of course we start from the root */
     /* Of course we start from the root */
-    tree.obj = hwloc_get_obj_by_depth(topology->hwtopology, HWLOC_OBJ_SYSTEM, 0); 
+    tree.obj = hwloc_get_obj_by_depth(topology->hwtopology, HWLOC_OBJ_SYSTEM, 0);
     tree.nb_workers = 0;
     tree.nb_workers = 0;
     tree.workers = (int *) malloc(topology->nhwcpus * sizeof(int));
     tree.workers = (int *) malloc(topology->nhwcpus * sizeof(int));
 
 
@@ -399,7 +402,7 @@ static void find_and_assign_combinations_with_hwloc(struct starpu_machine_topolo
     {
     {
 	/* find_and_assign_combinations_with_hwloc_recursive shouldn't return
 	/* find_and_assign_combinations_with_hwloc_recursive shouldn't return
 	 * while there are enough workers to assign regarding the min_size value */
 	 * while there are enough workers to assign regarding the min_size value */
-	STARPU_ASSERT(nb_workers < max_size);
+	STARPU_ASSERT(nb_workers <= max_size);
 
 
 	int ret = starpu_combined_worker_assign_workerid(nb_workers, tree.workers);
 	int ret = starpu_combined_worker_assign_workerid(nb_workers, tree.workers);
 	STARPU_ASSERT(ret >= 0);
 	STARPU_ASSERT(ret >= 0);
@@ -410,9 +413,9 @@ static void find_and_assign_combinations_with_hwloc(struct starpu_machine_topolo
 
 
 #else /* STARPU_HAVE_HWLOC */
 #else /* STARPU_HAVE_HWLOC */
 
 
-static void find_and_assign_combinations_without_hwloc(struct starpu_machine_topology_s *topology)
+static void find_and_assign_combinations_without_hwloc(struct starpu_machine_topology *topology)
 {
 {
-    struct starpu_machine_config_s *config = _starpu_get_machine_config();
+    struct _starpu_machine_config *config = _starpu_get_machine_config();
 
 
     /* We put the id of all CPU workers in this array */
     /* We put the id of all CPU workers in this array */
     int cpu_workers[STARPU_NMAXWORKERS];
     int cpu_workers[STARPU_NMAXWORKERS];
@@ -440,7 +443,7 @@ static void find_and_assign_combinations_without_hwloc(struct starpu_machine_top
 
 
 		/* We register this combination */
 		/* We register this combination */
 		int ret;
 		int ret;
-		ret = starpu_combined_worker_assign_workerid(size, workerids); 
+		ret = starpu_combined_worker_assign_workerid(size, workerids);
 		STARPU_ASSERT(ret >= 0);
 		STARPU_ASSERT(ret >= 0);
 	    }
 	    }
 	}
 	}
@@ -449,9 +452,9 @@ static void find_and_assign_combinations_without_hwloc(struct starpu_machine_top
 
 
 #endif /* STARPU_HAVE_HWLOC */
 #endif /* STARPU_HAVE_HWLOC */
 
 
-static void combine_all_cpu_workers(struct starpu_machine_topology_s *topology)
+static void combine_all_cpu_workers(struct starpu_machine_topology *topology)
 {
 {
-    struct starpu_machine_config_s *config = _starpu_get_machine_config();
+    struct _starpu_machine_config *config = _starpu_get_machine_config();
 
 
     int cpu_workers[STARPU_NMAXWORKERS];
     int cpu_workers[STARPU_NMAXWORKERS];
     unsigned ncpus = 0;
     unsigned ncpus = 0;
@@ -463,21 +466,22 @@ static void combine_all_cpu_workers(struct starpu_machine_topology_s *topology)
 	    cpu_workers[ncpus++] = i;
 	    cpu_workers[ncpus++] = i;
     }
     }
 
 
-    if (ncpus > 0)
+    for (i = 1; i <= ncpus; i++)
     {
     {
 	int ret;
 	int ret;
-	ret = starpu_combined_worker_assign_workerid(ncpus, cpu_workers);
+	ret = starpu_combined_worker_assign_workerid(i, cpu_workers);
 	STARPU_ASSERT(ret >= 0);
 	STARPU_ASSERT(ret >= 0);
     }
     }
 }
 }
 
 
-void _starpu_sched_find_worker_combinations(struct starpu_machine_topology_s *topology)
+void _starpu_sched_find_worker_combinations(struct starpu_machine_topology *topology)
 {
 {
-    struct starpu_machine_config_s *config = _starpu_get_machine_config();
+    struct _starpu_machine_config *config = _starpu_get_machine_config();
 
 
-    if (config->user_conf && config->user_conf->single_combined_worker > 0 || starpu_get_env_number("STARPU_SINGLE_COMBINED_WORKER") > 0)
+    if ((config->user_conf && config->user_conf->single_combined_worker > 0) || starpu_get_env_number("STARPU_SINGLE_COMBINED_WORKER") > 0)
 	combine_all_cpu_workers(topology);
 	combine_all_cpu_workers(topology);
-    else {
+    else
+    {
 #ifdef STARPU_HAVE_HWLOC
 #ifdef STARPU_HAVE_HWLOC
 	find_and_assign_combinations_with_hwloc(topology);
 	find_and_assign_combinations_with_hwloc(topology);
 #else
 #else

+ 21 - 0
src/sched_policies/detect_combined_workers.h

@@ -0,0 +1,21 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011 Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
+/* Initialize combined workers */
+void _starpu_sched_find_worker_combinations(struct starpu_machine_topology *topology);
+

+ 9 - 7
src/sched_policies/eager_central_policy.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  INRIA
  * Copyright (C) 2011  INRIA
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -25,7 +25,7 @@
 #include <sched_policies/fifo_queues.h>
 #include <sched_policies/fifo_queues.h>
 
 
 typedef struct eager_center_policy_data {
 typedef struct eager_center_policy_data {
-	struct starpu_fifo_taskq_s *fifo;
+	struct _starpu_fifo_taskq *fifo;
 	pthread_mutex_t sched_mutex;
 	pthread_mutex_t sched_mutex;
 	pthread_cond_t sched_cond;
 	pthread_cond_t sched_cond;
 } eager_center_policy_data;
 } eager_center_policy_data;
@@ -109,7 +109,7 @@ static int push_task_eager_policy(struct starpu_task *task, unsigned sched_ctx_i
 		_starpu_increment_nsubmitted_tasks_of_worker(workerid);
 		_starpu_increment_nsubmitted_tasks_of_worker(workerid);
 	}
 	}
 
 
-	struct starpu_fifo_taskq_s *fifo = data->fifo;
+	struct _starpu_fifo_taskq *fifo = data->fifo;
 	return _starpu_fifo_push_task(fifo, &data->sched_mutex, &data->sched_cond, task);
 	return _starpu_fifo_push_task(fifo, &data->sched_mutex, &data->sched_cond, task);
 }
 }
 
 
@@ -118,17 +118,17 @@ static struct starpu_task *pop_every_task_eager_policy(unsigned sched_ctx_id)
 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	struct eager_center_policy_data *data = (struct eager_center_policy_data*)sched_ctx->policy_data;
 	struct eager_center_policy_data *data = (struct eager_center_policy_data*)sched_ctx->policy_data;
 
 
-	struct starpu_fifo_taskq_s *fifo = data->fifo;
+	static struct _starpu_fifo_taskq *fifo = data->fifo;
 	return _starpu_fifo_pop_every_task(fifo, &data->sched_mutex, starpu_worker_get_id());
 	return _starpu_fifo_pop_every_task(fifo, &data->sched_mutex, starpu_worker_get_id());
 }
 }
 
 
 static struct starpu_task *pop_task_eager_policy(unsigned sched_ctx_id)
 static struct starpu_task *pop_task_eager_policy(unsigned sched_ctx_id)
 {
 {
-        unsigned workerid = starpu_worker_get_id();
+    unsigned workerid = starpu_worker_get_id();
 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	struct eager_center_policy_data *data = (struct eager_center_policy_data*)sched_ctx->policy_data;
 	struct eager_center_policy_data *data = (struct eager_center_policy_data*)sched_ctx->policy_data;
 
 
-	struct starpu_fifo_taskq_s *fifo = data->fifo;
+	static struct _starpu_fifo_taskq *fifo = data->fifo;
 	struct starpu_task *task =  _starpu_fifo_pop_task(fifo, workerid);
 	struct starpu_task *task =  _starpu_fifo_pop_task(fifo, workerid);
 
 
 	if(task)
 	if(task)
@@ -144,12 +144,14 @@ static struct starpu_task *pop_task_eager_policy(unsigned sched_ctx_id)
 	return task;
 	return task;
 }
 }
 
 
-struct starpu_sched_policy_s _starpu_sched_eager_policy = {
+struct starpu_sched_policy _starpu_sched_eager_policy =
+{
 	.init_sched = initialize_eager_center_policy,
 	.init_sched = initialize_eager_center_policy,
 	.init_sched_for_workers = initialize_eager_center_policy_for_workers,
 	.init_sched_for_workers = initialize_eager_center_policy_for_workers,
 	.deinit_sched = deinitialize_eager_center_policy,
 	.deinit_sched = deinitialize_eager_center_policy,
 	.push_task = push_task_eager_policy,
 	.push_task = push_task_eager_policy,
 	.pop_task = pop_task_eager_policy,
 	.pop_task = pop_task_eager_policy,
+	.pre_exec_hook = NULL,
 	.post_exec_hook = NULL,
 	.post_exec_hook = NULL,
 	.pop_every_task = pop_every_task_eager_policy,
 	.pop_every_task = pop_every_task_eager_policy,
 	.policy_name = "eager",
 	.policy_name = "eager",

+ 27 - 19
src/sched_policies/eager_central_priority_policy.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  INRIA
  * Copyright (C) 2011  INRIA
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -32,8 +32,9 @@
 
 
 #define NPRIO_LEVELS	(MAX_LEVEL - MIN_LEVEL + 1)
 #define NPRIO_LEVELS	(MAX_LEVEL - MIN_LEVEL + 1)
 
 
-struct starpu_priority_taskq_s {
-	/* the actual lists 
+struct starpu_priority_taskq_s
+{
+	/* the actual lists
 	 *	taskq[p] is for priority [p - STARPU_MIN_PRIO] */
 	 *	taskq[p] is for priority [p - STARPU_MIN_PRIO] */
 	struct starpu_task_list taskq[NPRIO_LEVELS];
 	struct starpu_task_list taskq[NPRIO_LEVELS];
 	unsigned ntasks[NPRIO_LEVELS];
 	unsigned ntasks[NPRIO_LEVELS];
@@ -42,19 +43,19 @@ struct starpu_priority_taskq_s {
 };
 };
 
 
 typedef struct eager_central_prio_data{
 typedef struct eager_central_prio_data{
-	struct starpu_priority_taskq_s *taskq;
+	struct _starpu_priority_taskq *taskq;
 	pthread_mutex_t sched_mutex;
 	pthread_mutex_t sched_mutex;
 	pthread_cond_t sched_cond;
 	pthread_cond_t sched_cond;
 } eager_central_prio_data;
 } eager_central_prio_data;
 
 
 /*
 /*
- * Centralized queue with priorities 
+ * Centralized queue with priorities
  */
  */
 
 
 static struct starpu_priority_taskq_s *_starpu_create_priority_taskq(void)
 static struct starpu_priority_taskq_s *_starpu_create_priority_taskq(void)
 {
 {
 	struct starpu_priority_taskq_s *central_queue;
 	struct starpu_priority_taskq_s *central_queue;
-	
+
 	central_queue = (struct starpu_priority_taskq_s *) malloc(sizeof(struct starpu_priority_taskq_s));
 	central_queue = (struct starpu_priority_taskq_s *) malloc(sizeof(struct starpu_priority_taskq_s));
 	central_queue->total_ntasks = 0;
 	central_queue->total_ntasks = 0;
 
 
@@ -152,24 +153,25 @@ static int _starpu_priority_push_task(struct starpu_task *task, unsigned sched_c
 	struct starpu_priority_taskq_s *taskq = data->taskq;
 	struct starpu_priority_taskq_s *taskq = data->taskq;
 
 
 	/* wake people waiting for a task */
 	/* wake people waiting for a task */
-	PTHREAD_MUTEX_LOCK(&data->sched_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&data->sched_mutex);
+
+	_STARPU_TRACE_JOB_PUSH(task, 1);
 
 
-	STARPU_TRACE_JOB_PUSH(task, 1);
-	
 	unsigned priolevel = task->priority - STARPU_MIN_PRIO;
 	unsigned priolevel = task->priority - STARPU_MIN_PRIO;
 
 
 	starpu_task_list_push_front(&taskq->taskq[priolevel], task);
 	starpu_task_list_push_front(&taskq->taskq[priolevel], task);
 	taskq->ntasks[priolevel]++;
 	taskq->ntasks[priolevel]++;
 	taskq->total_ntasks++;
 	taskq->total_ntasks++;
 
 
-	PTHREAD_COND_SIGNAL(&data->sched_cond);
-	PTHREAD_MUTEX_UNLOCK(&data->sched_mutex);
+	_STARPU_PTHREAD_COND_SIGNAL(&data->sched_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&data->sched_mutex);
 
 
 	return 0;
 	return 0;
 }
 }
 
 
 static struct starpu_task *_starpu_priority_pop_task(unsigned sched_ctx_id)
 static struct starpu_task *_starpu_priority_pop_task(unsigned sched_ctx_id)
 {
 {
+	/* XXX FIXME: should call starpu_worker_can_execute_task!! */
 	struct starpu_task *task = NULL;
 	struct starpu_task *task = NULL;
 
 
 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
@@ -183,39 +185,45 @@ static struct starpu_task *_starpu_priority_pop_task(unsigned sched_ctx_id)
 	if ((taskq->total_ntasks == 0) && _starpu_machine_is_running())
 	if ((taskq->total_ntasks == 0) && _starpu_machine_is_running())
 	{
 	{
 #ifdef STARPU_NON_BLOCKING_DRIVERS
 #ifdef STARPU_NON_BLOCKING_DRIVERS
-		PTHREAD_MUTEX_UNLOCK(&data->sched_mutex);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&data->sched_mutex);
 		return NULL;
 		return NULL;
 #else
 #else
-		PTHREAD_COND_WAIT(&data->sched_cond, &data->sched_mutex);
+		_STARPU_PTHREAD_COND_WAIT(&data->sched_cond, &data->sched_mutex);
 #endif
 #endif
 	}
 	}
 
 
 	if (taskq->total_ntasks > 0)
 	if (taskq->total_ntasks > 0)
 	{
 	{
 		unsigned priolevel = NPRIO_LEVELS - 1;
 		unsigned priolevel = NPRIO_LEVELS - 1;
-		do {
-			if (taskq->ntasks[priolevel] > 0) {
+		do
+		{
+			if (taskq->ntasks[priolevel] > 0)
+			{
 				/* there is some task that we can grab */
 				/* there is some task that we can grab */
 				task = starpu_task_list_pop_back(&taskq->taskq[priolevel]);
 				task = starpu_task_list_pop_back(&taskq->taskq[priolevel]);
 				taskq->ntasks[priolevel]--;
 				taskq->ntasks[priolevel]--;
 				taskq->total_ntasks--;
 				taskq->total_ntasks--;
-				STARPU_TRACE_JOB_POP(task, 0);
+				_STARPU_TRACE_JOB_POP(task, 0);
 			}
 			}
-		} while (!task && priolevel-- > 0);
+		}
+		while (!task && priolevel-- > 0);
 	}
 	}
+	STARPU_ASSERT_MSG(starpu_worker_can_execute_task(starpu_worker_get_id(), task, 0), "prio does not support \"can_execute\"");
 
 
-	PTHREAD_MUTEX_UNLOCK(&data->sched_mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&data->sched_mutex);
 
 
 	return task;
 	return task;
 }
 }
 
 
-struct starpu_sched_policy_s _starpu_sched_prio_policy = {
+struct starpu_sched_policy _starpu_sched_prio_policy =
+{
 	.init_sched = initialize_eager_center_priority_policy,
 	.init_sched = initialize_eager_center_priority_policy,
 	.init_sched_for_workers = initialize_eager_center_priority_policy_for_workers,
 	.init_sched_for_workers = initialize_eager_center_priority_policy_for_workers,
 	.deinit_sched = deinitialize_eager_center_priority_policy,
 	.deinit_sched = deinitialize_eager_center_priority_policy,
 	/* we always use priorities in that policy */
 	/* we always use priorities in that policy */
 	.push_task = _starpu_priority_push_task,
 	.push_task = _starpu_priority_push_task,
 	.pop_task = _starpu_priority_pop_task,
 	.pop_task = _starpu_priority_pop_task,
+	.pre_exec_hook = NULL,
 	.post_exec_hook = NULL,
 	.post_exec_hook = NULL,
 	.pop_every_task = NULL,
 	.pop_every_task = NULL,
 	.policy_name = "prio",
 	.policy_name = "prio",

+ 48 - 35
src/sched_policies/fifo_queues.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  Télécom-SudParis
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -25,10 +25,10 @@
 #include <core/task.h>
 #include <core/task.h>
 #include <core/workers.h>
 #include <core/workers.h>
 
 
-struct starpu_fifo_taskq_s *_starpu_create_fifo(void)
+struct _starpu_fifo_taskq *_starpu_create_fifo(void)
 {
 {
-	struct starpu_fifo_taskq_s *fifo;
-	fifo = (struct starpu_fifo_taskq_s *) malloc(sizeof(struct starpu_fifo_taskq_s));
+	struct _starpu_fifo_taskq *fifo;
+	fifo = (struct _starpu_fifo_taskq *) malloc(sizeof(struct _starpu_fifo_taskq));
 
 
 	/* note that not all mechanisms (eg. the semaphore) have to be used */
 	/* note that not all mechanisms (eg. the semaphore) have to be used */
 	starpu_task_list_init(&fifo->taskq);
 	starpu_task_list_init(&fifo->taskq);
@@ -42,65 +42,73 @@ struct starpu_fifo_taskq_s *_starpu_create_fifo(void)
 	return fifo;
 	return fifo;
 }
 }
 
 
-void _starpu_destroy_fifo(struct starpu_fifo_taskq_s *fifo)
+void _starpu_destroy_fifo(struct _starpu_fifo_taskq *fifo)
 {
 {
 	free(fifo);
 	free(fifo);
 }
 }
 
 
-/* TODO: revert front/back? */
+int _starpu_fifo_empty(struct _starpu_fifo_taskq *fifo)
+{
+	return fifo->ntasks == 0;
+}
 
 
-int _starpu_fifo_push_task(struct starpu_fifo_taskq_s *fifo_queue, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task)
+/* TODO: revert front/back? */
+int _starpu_fifo_push_task(struct _starpu_fifo_taskq *fifo_queue, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task)
 {
 {
-	PTHREAD_MUTEX_LOCK(sched_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
 
 
-	STARPU_TRACE_JOB_PUSH(task, 0);
+	_STARPU_TRACE_JOB_PUSH(task, 0);
 	/* TODO: if prio, put at back */
 	/* TODO: if prio, put at back */
 	starpu_task_list_push_front(&fifo_queue->taskq, task);
 	starpu_task_list_push_front(&fifo_queue->taskq, task);
 	fifo_queue->ntasks++;
 	fifo_queue->ntasks++;
 	fifo_queue->nprocessed++;
 	fifo_queue->nprocessed++;
 
 
-	PTHREAD_COND_SIGNAL(sched_cond);
-	PTHREAD_MUTEX_UNLOCK(sched_mutex);
+	_STARPU_PTHREAD_COND_SIGNAL(sched_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
 
 
 	return 0;
 	return 0;
 }
 }
 
 
-struct starpu_task *_starpu_fifo_pop_task(struct starpu_fifo_taskq_s *fifo_queue, int workerid __attribute__ ((unused)))
+struct starpu_task *_starpu_fifo_pop_task(struct _starpu_fifo_taskq *fifo_queue, int workerid)
 {
 {
-	struct starpu_task *task = NULL;
+	struct starpu_task *task;
 
 
-	if (fifo_queue->ntasks == 0)
-		return NULL;
-
-	/* TODO: find a task that suits workerid */
-	if (fifo_queue->ntasks > 0) 
+	for (task  = starpu_task_list_begin(&fifo_queue->taskq);
+	     task != starpu_task_list_end(&fifo_queue->taskq);
+	     task  = starpu_task_list_next(task))
 	{
 	{
-		/* there is a task */
-		task = starpu_task_list_pop_back(&fifo_queue->taskq);
-	
+		unsigned nimpl;
 		STARPU_ASSERT(task);
 		STARPU_ASSERT(task);
-		fifo_queue->ntasks--;
-		
-		STARPU_TRACE_JOB_POP(task, 0);
+
+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
+			if (starpu_worker_can_execute_task(workerid, task, nimpl))
+			{
+				_starpu_get_job_associated_to_task(task)->nimpl = nimpl;
+				starpu_task_list_erase(&fifo_queue->taskq, task);
+				fifo_queue->ntasks--;
+				_STARPU_TRACE_JOB_POP(task, 0);
+				return task;
+			}
 	}
 	}
-	
-	return task;
+
+	return NULL;
 }
 }
 
 
 /* pop every task that can be executed on the calling driver */
 /* pop every task that can be executed on the calling driver */
-struct starpu_task *_starpu_fifo_pop_every_task(struct starpu_fifo_taskq_s *fifo_queue, pthread_mutex_t *sched_mutex, int workerid)
+struct starpu_task *_starpu_fifo_pop_every_task(struct _starpu_fifo_taskq *fifo_queue, pthread_mutex_t *sched_mutex, int workerid)
 {
 {
 	struct starpu_task_list *old_list;
 	struct starpu_task_list *old_list;
 	unsigned size;
 	unsigned size;
 
 
 	struct starpu_task *new_list = NULL;
 	struct starpu_task *new_list = NULL;
 	struct starpu_task *new_list_tail = NULL;
 	struct starpu_task *new_list_tail = NULL;
-	
-	PTHREAD_MUTEX_LOCK(sched_mutex);
+
+	_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
 
 
 	size = fifo_queue->ntasks;
 	size = fifo_queue->ntasks;
 
 
-	if (size > 0) {
+	if (size > 0)
+	{
 		old_list = &fifo_queue->taskq;
 		old_list = &fifo_queue->taskq;
 		unsigned new_list_size = 0;
 		unsigned new_list_size = 0;
 
 
@@ -110,13 +118,15 @@ struct starpu_task *_starpu_fifo_pop_every_task(struct starpu_fifo_taskq_s *fifo
 		task = starpu_task_list_front(old_list);
 		task = starpu_task_list_front(old_list);
 		while (task)
 		while (task)
 		{
 		{
+			unsigned nimpl;
 			next_task = task->next;
 			next_task = task->next;
 
 
-			if (starpu_worker_may_execute_task(workerid, task, 0))
+			for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
+			if (starpu_worker_can_execute_task(workerid, task, nimpl))
 			{
 			{
 				/* this element can be moved into the new list */
 				/* this element can be moved into the new list */
 				new_list_size++;
 				new_list_size++;
-				
+
 				starpu_task_list_erase(old_list, task);
 				starpu_task_list_erase(old_list, task);
 
 
 				if (new_list_tail)
 				if (new_list_tail)
@@ -126,21 +136,24 @@ struct starpu_task *_starpu_fifo_pop_every_task(struct starpu_fifo_taskq_s *fifo
 					task->next = NULL;
 					task->next = NULL;
 					new_list_tail = task;
 					new_list_tail = task;
 				}
 				}
-				else {
+				else
+				{
 					new_list = task;
 					new_list = task;
 					new_list_tail = task;
 					new_list_tail = task;
 					task->prev = NULL;
 					task->prev = NULL;
 					task->next = NULL;
 					task->next = NULL;
 				}
 				}
+				_starpu_get_job_associated_to_task(task)->nimpl = nimpl;
+				break;
 			}
 			}
-		
+
 			task = next_task;
 			task = next_task;
 		}
 		}
 
 
 		fifo_queue->ntasks -= new_list_size;
 		fifo_queue->ntasks -= new_list_size;
 	}
 	}
 
 
-	PTHREAD_MUTEX_UNLOCK(sched_mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
 
 
 	return new_list;
 	return new_list;
 }
 }

+ 9 - 6
src/sched_policies/fifo_queues.h

@@ -22,7 +22,8 @@
 #include <starpu.h>
 #include <starpu.h>
 #include <common/config.h>
 #include <common/config.h>
 
 
-struct starpu_fifo_taskq_s {
+struct _starpu_fifo_taskq
+{
 	/* the actual list */
 	/* the actual list */
 	struct starpu_task_list taskq;
 	struct starpu_task_list taskq;
 
 
@@ -38,12 +39,14 @@ struct starpu_fifo_taskq_s {
 	double exp_len; /* Expected duration of the set of tasks in the queue */
 	double exp_len; /* Expected duration of the set of tasks in the queue */
 };
 };
 
 
-struct starpu_fifo_taskq_s*_starpu_create_fifo(void);
-void _starpu_destroy_fifo(struct starpu_fifo_taskq_s *fifo);
+struct _starpu_fifo_taskq*_starpu_create_fifo(void);
+void _starpu_destroy_fifo(struct _starpu_fifo_taskq *fifo);
 
 
-int _starpu_fifo_push_task(struct starpu_fifo_taskq_s *fifo, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task);
+int _starpu_fifo_empty(struct _starpu_fifo_taskq *fifo);
 
 
-struct starpu_task *_starpu_fifo_pop_task(struct starpu_fifo_taskq_s *fifo, int workerid);
-struct starpu_task *_starpu_fifo_pop_every_task(struct starpu_fifo_taskq_s *fifo, pthread_mutex_t *sched_mutex, int workerid);
+int _starpu_fifo_push_task(struct _starpu_fifo_taskq *fifo, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task);
+
+struct starpu_task *_starpu_fifo_pop_task(struct _starpu_fifo_taskq *fifo, int workerid);
+struct starpu_task *_starpu_fifo_pop_every_task(struct _starpu_fifo_taskq *fifo, pthread_mutex_t *sched_mutex, int workerid);
 
 
 #endif // __FIFO_QUEUES_H__
 #endif // __FIFO_QUEUES_H__

+ 244 - 116
src/sched_policies/heft.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  INRIA
  * Copyright (C) 2011  INRIA
  *
  *
@@ -23,10 +23,25 @@
 
 
 #include <core/workers.h>
 #include <core/workers.h>
 #include <core/perfmodel/perfmodel.h>
 #include <core/perfmodel/perfmodel.h>
+#include <core/task_bundle.h>
+#include <core/workers.h>
 #include <starpu_parameters.h>
 #include <starpu_parameters.h>
 #include <starpu_task_bundle.h>
 #include <starpu_task_bundle.h>
 #include <starpu_top.h>
 #include <starpu_top.h>
 
 
+#ifndef DBL_MIN
+#define DBL_MIN __DBL_MIN__
+#endif
+
+#ifndef DBL_MAX
+#define DBL_MAX __DBL_MAX__
+#endif
+
+static double exp_start[STARPU_NMAXWORKERS]; /* of the first queued task */
+static double exp_end[STARPU_NMAXWORKERS];   /* of the set of queued tasks */
+static double exp_len[STARPU_NMAXWORKERS];   /* of the last queued task */
+static double ntasks[STARPU_NMAXWORKERS];
+
 typedef struct {
 typedef struct {
 	double alpha;
 	double alpha;
 	double beta;
 	double beta;
@@ -34,12 +49,6 @@ typedef struct {
 	double idle_power;
 	double idle_power;
 } heft_data;
 } heft_data;
 
 
-static double exp_start[STARPU_NMAXWORKERS];	/* of the first queued task */
-static double exp_end[STARPU_NMAXWORKERS];	/* of the set of queued tasks */
-static double exp_len[STARPU_NMAXWORKERS];	/* of the last queued task */
-static double ntasks[STARPU_NMAXWORKERS];
-
-
 const float alpha_minimum=0;
 const float alpha_minimum=0;
 const float alpha_maximum=10.0;
 const float alpha_maximum=10.0;
 const float beta_minimum=0;
 const float beta_minimum=0;
@@ -49,7 +58,8 @@ const float gamma_maximum=10000.0;
 const float idle_power_minimum=0;
 const float idle_power_minimum=0;
 const float idle_power_maximum=10000.0;
 const float idle_power_maximum=10000.0;
 
 
-void param_modified(struct starputop_param_t* d){
+static void param_modified(struct starpu_top_param* d)
+{
 	//just to show parameter modification
 	//just to show parameter modification
 	fprintf(stderr,"%s has been modified : %f !\n", d->name, d->value);
 	fprintf(stderr,"%s has been modified : %f !\n", d->name, d->value);
 }
 }
@@ -125,13 +135,16 @@ static void heft_init(unsigned sched_ctx_id)
 	starputop_register_parameter_float("HEFT_IDLE_POWER", &hd->idle_power, idle_power_minimum,idle_power_maximum,param_modified);
 	starputop_register_parameter_float("HEFT_IDLE_POWER", &hd->idle_power, idle_power_minimum,idle_power_maximum,param_modified);
 }
 }
 
 
-static void heft_post_exec_hook(struct starpu_task *task)
+
+/* heft_pre_exec_hook is called right after the data transfer is done and right before
+ * the computation begins; it is useful for updating more precisely the values
+ * of the expected start, end, length, etc. */
+static void heft_pre_exec_hook(struct starpu_task *task)
 {
 {
 	unsigned sched_ctx_id = task->sched_ctx;
 	unsigned sched_ctx_id = task->sched_ctx;
 	int workerid = starpu_worker_get_id();
 	int workerid = starpu_worker_get_id();
-	STARPU_ASSERT(workerid >= 0);
-
 	double model = task->predicted;
 	double model = task->predicted;
+	double transfer_model = task->predicted_transfer;
 
 
 	pthread_mutex_t *sched_mutex;
 	pthread_mutex_t *sched_mutex;
 	pthread_cond_t *sched_cond;
 	pthread_cond_t *sched_cond;
@@ -144,18 +157,21 @@ static void heft_post_exec_hook(struct starpu_task *task)
 		sched_cond = &workerarg->sched_cond;
 		sched_cond = &workerarg->sched_cond;
 		starpu_worker_set_sched_condition(sched_ctx_id, workerid, sched_mutex, sched_cond);
 		starpu_worker_set_sched_condition(sched_ctx_id, workerid, sched_mutex, sched_cond);
 	}
 	}
-#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
-	starpu_call_poped_task_cb(workerid, sched_ctx_id, task->flops);
-#endif //STARPU_USE_SCHED_CTX_HYPERVISOR
-
-	/* Once we have executed the task, we can update the predicted amount
+	/* Once the task is executing, we can update the predicted amount
 	 * of work. */
 	 * of work. */
-	PTHREAD_MUTEX_LOCK(sched_mutex);
-	exp_len[workerid] -= model;
+	_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
+	exp_len[workerid] -= model + transfer_model;
 	exp_start[workerid] = starpu_timing_now() + model;
 	exp_start[workerid] = starpu_timing_now() + model;
 	exp_end[workerid] = exp_start[workerid] + exp_len[workerid];
 	exp_end[workerid] = exp_start[workerid] + exp_len[workerid];
 	ntasks[workerid]--;
 	ntasks[workerid]--;
-	PTHREAD_MUTEX_UNLOCK(sched_mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
+}
+
+static void heft_post_exec_hook(struct starpu_task *task)
+{
+#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
+	starpu_call_poped_task_cb(workerid, sched_ctx_id, task->flops);
+#endif //STARPU_USE_SCHED_CTX_HYPERVISOR
 }
 }
 
 
 static void heft_push_task_notify(struct starpu_task *task, int workerid)
 static void heft_push_task_notify(struct starpu_task *task, int workerid)
@@ -163,10 +179,12 @@ static void heft_push_task_notify(struct starpu_task *task, int workerid)
 	unsigned sched_ctx_id = task->sched_ctx;
 	unsigned sched_ctx_id = task->sched_ctx;
 	/* Compute the expected penalty */
 	/* Compute the expected penalty */
 	enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(workerid);
 	enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(workerid);
+	unsigned memory_node = starpu_worker_get_memory_node(workerid);
 
 
 	double predicted = starpu_task_expected_length(task, perf_arch,
 	double predicted = starpu_task_expected_length(task, perf_arch,
 			_starpu_get_job_associated_to_task(task)->nimpl);
 			_starpu_get_job_associated_to_task(task)->nimpl);
 
 
+	double predicted_transfer = starpu_task_expected_data_transfer_time(memory_node, task);
 	pthread_mutex_t *sched_mutex;
 	pthread_mutex_t *sched_mutex;
 	pthread_cond_t *sched_cond;
 	pthread_cond_t *sched_cond;
 	starpu_worker_get_sched_condition(sched_ctx_id, workerid, &sched_mutex, &sched_cond);
 	starpu_worker_get_sched_condition(sched_ctx_id, workerid, &sched_mutex, &sched_cond);
@@ -184,25 +202,45 @@ static void heft_push_task_notify(struct starpu_task *task, int workerid)
 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
 
 
 	/* Update the predictions */
 	/* Update the predictions */
-	PTHREAD_MUTEX_LOCK(sched_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
 	/* Sometimes workers didn't take the tasks as early as we expected */
 	/* Sometimes workers didn't take the tasks as early as we expected */
 	exp_start[workerid] = STARPU_MAX(exp_start[workerid], starpu_timing_now());
 	exp_start[workerid] = STARPU_MAX(exp_start[workerid], starpu_timing_now());
-	exp_end[workerid] = STARPU_MAX(exp_start[workerid], starpu_timing_now());
+	exp_end[workerid] = exp_start[workerid] + exp_len[workerid];
 
 
 	/* If there is no prediction available, we consider the task has a null length */
 	/* If there is no prediction available, we consider the task has a null length */
-	if (predicted != -1.0)
+	if (!isnan(predicted))
 	{
 	{
 		task->predicted = predicted;
 		task->predicted = predicted;
 		exp_end[workerid] += predicted;
 		exp_end[workerid] += predicted;
 		exp_len[workerid] += predicted;
 		exp_len[workerid] += predicted;
 	}
 	}
 
 
+	/* If there is no transfer time prediction available, we consider the transfer has a null length */
+	if (!isnan(predicted_transfer))
+	{
+		if (starpu_timing_now() + predicted_transfer < exp_end[workerid])
+		{
+			/* We may hope that the transfer will be finished by
+			 * the start of the task. */
+			predicted_transfer = 0;
+		}
+		else
+		{
+			/* The transfer will not be finished by then, take the
+			 * remainder into account */
+			predicted_transfer = (starpu_timing_now() + predicted_transfer) - exp_end[workerid];
+		}
+		task->predicted_transfer = predicted_transfer;
+		exp_end[workerid] += predicted_transfer;
+		exp_len[workerid] += predicted_transfer;
+	}
+
 	ntasks[workerid]++;
 	ntasks[workerid]++;
 
 
-	PTHREAD_MUTEX_UNLOCK(sched_mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
 }
 }
 
 
-static int push_task_on_best_worker(struct starpu_task *task, int best_workerid, double predicted, int prio, unsigned sched_ctx_id)
+static int push_task_on_best_worker(struct starpu_task *task, int best_workerid, double predicted, double predicted_transfer, int prio, unsigned sched_ctx_id)
  {
  {
 	/* make sure someone could execute that task ! */
 	/* make sure someone could execute that task ! */
 	STARPU_ASSERT(best_workerid != -1);
 	STARPU_ASSERT(best_workerid != -1);
@@ -225,16 +263,38 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 	starpu_call_pushed_task_cb(best_workerid, sched_ctx_id);
 	starpu_call_pushed_task_cb(best_workerid, sched_ctx_id);
 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
 
 
-	PTHREAD_MUTEX_LOCK(sched_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
+
+	/* Sometimes workers didn't take the tasks as early as we expected */
+	exp_start[best_workerid] = STARPU_MAX(exp_start[best_workerid], starpu_timing_now());
+	exp_end[best_workerid] = exp_start[best_workerid] + exp_len[best_workerid];
+
 	exp_end[best_workerid] += predicted;
 	exp_end[best_workerid] += predicted;
 	exp_len[best_workerid] += predicted;
 	exp_len[best_workerid] += predicted;
+
+	if (starpu_timing_now() + predicted_transfer < exp_end[best_workerid])
+	{
+		/* We may hope that the transfer will be finished by
+		 * the start of the task. */
+		predicted_transfer = 0;
+	}
+	else
+	{
+		/* The transfer will not be finished by then, take the
+		 * remainder into account */
+		predicted_transfer = (starpu_timing_now() + predicted_transfer) - exp_end[best_workerid];
+	}
+	exp_end[best_workerid] += predicted_transfer;
+	exp_len[best_workerid] += predicted_transfer;
+
 	ntasks[best_workerid]++;
 	ntasks[best_workerid]++;
-	PTHREAD_MUTEX_UNLOCK(sched_mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
 
 
 	task->predicted = predicted;
 	task->predicted = predicted;
+	task->predicted_transfer = predicted_transfer;
 
 
-	if (starpu_top_status_get())
-		starputop_task_prevision(task, best_workerid, 
+	if (_starpu_top_status_get())
+		_starpu_top_task_prevision(task, best_workerid,
 					(unsigned long long)(exp_end[best_workerid]-predicted)/1000,
 					(unsigned long long)(exp_end[best_workerid]-predicted)/1000,
 					(unsigned long long)exp_end[best_workerid]/1000);
 					(unsigned long long)exp_end[best_workerid]/1000);
 
 
@@ -244,29 +304,32 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 		starpu_prefetch_task_input_on_node(task, memory_node);
 		starpu_prefetch_task_input_on_node(task, memory_node);
 	}
 	}
 
 
+
+	//_STARPU_DEBUG("Heft : pushing local task\n");
 	return starpu_push_local_task(best_workerid, task, prio);
 	return starpu_push_local_task(best_workerid, task, prio);
 }
 }
 
 
+/* TODO: factorize with dmda!! */
 static void compute_all_performance_predictions(struct starpu_task *task,
 static void compute_all_performance_predictions(struct starpu_task *task,
 					double *local_task_length, double *exp_end,
 					double *local_task_length, double *exp_end,
 					double *max_exp_endp, double *best_exp_endp,
 					double *max_exp_endp, double *best_exp_endp,
 					double *local_data_penalty,
 					double *local_data_penalty,
-					double *local_power, int *forced_best,
-					struct starpu_task_bundle *bundle,
+					double *local_power, 
+					int *forced_worker, int *forced_impl,
+					starpu_task_bundle_t bundle,
 					unsigned sched_ctx_id)
 					unsigned sched_ctx_id)
 {
 {
 	int calibrating = 0;
 	int calibrating = 0;
 	double max_exp_end = DBL_MIN;
 	double max_exp_end = DBL_MIN;
 	double best_exp_end = DBL_MAX;
 	double best_exp_end = DBL_MAX;
 	int ntasks_best = -1;
 	int ntasks_best = -1;
+	int nimpl_best = 0;
 	double ntasks_best_end = 0.0;
 	double ntasks_best_end = 0.0;
-	
+
 	/* A priori, we know all estimations */
 	/* A priori, we know all estimations */
 	int unknown = 0;
 	int unknown = 0;
-	
-	unsigned nimpl;
-	unsigned best_impl = 0;
 	unsigned worker, worker_ctx = 0;
 	unsigned worker, worker_ctx = 0;
+	unsigned nimpl;
 
 
 	struct worker_collection *workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx_id);
 	struct worker_collection *workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx_id);
 
 
@@ -277,101 +340,142 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 		{
 		{
 			/* Sometimes workers didn't take the tasks as early as we expected */
 			/* Sometimes workers didn't take the tasks as early as we expected */
 			exp_start[worker] = STARPU_MAX(exp_start[worker], starpu_timing_now());
 			exp_start[worker] = STARPU_MAX(exp_start[worker], starpu_timing_now());
-			exp_end[worker_ctx] = exp_start[worker] + exp_len[worker];
-			if (exp_end[worker_ctx] > max_exp_end)
- 				max_exp_end = exp_end[worker_ctx];
+			exp_end[worker_ctx][nimpl] = exp_start[worker] + exp_len[worker];
+			if (exp_end[worker_ctx][nimpl] > max_exp_end)
+ 				max_exp_end = exp_end[worker_ctx][nimpl];
 			
 			
-			if (!starpu_worker_may_execute_task(worker, task, nimpl))
+			if (!starpu_worker_can_execute_task(worker, task, nimpl))
 			{
 			{
 				/* no one on that queue may execute this task */
 				/* no one on that queue may execute this task */
 				continue;
 				continue;
 			}
 			}
-			
+
 			enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
 			enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
 			unsigned memory_node = starpu_worker_get_memory_node(worker);
 			unsigned memory_node = starpu_worker_get_memory_node(worker);
-			
+
 			if (bundle)
 			if (bundle)
 			{
 			{
-				local_task_length[worker_ctx] = starpu_task_bundle_expected_length(bundle, perf_arch, nimpl);
-				local_data_penalty[worker_ctx] = starpu_task_bundle_expected_data_transfer_time(bundle, memory_node);
-				local_power[worker_ctx] = starpu_task_bundle_expected_power(bundle, perf_arch, nimpl);
+				/* TODO : conversion time */
+				local_task_length[worker_ctx][nimpl] = starpu_task_bundle_expected_length(bundle, perf_arch, nimpl);
+				local_data_penalty[worker_ctx][nimpl] = starpu_task_bundle_expected_data_transfer_time(bundle, memory_node);
+				local_power[worker_ctx][nimpl] = starpu_task_bundle_expected_power(bundle, perf_arch, nimpl);
 				//_STARPU_DEBUG("Scheduler heft bundle: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker_ctx],local_power[worker_ctx],worker,nimpl);
 				//_STARPU_DEBUG("Scheduler heft bundle: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker_ctx],local_power[worker_ctx],worker,nimpl);
 			}
 			}
 			else 
 			else 
 			{
 			{
-				local_task_length[worker_ctx] = starpu_task_expected_length(task, perf_arch, nimpl);
-				local_data_penalty[worker_ctx] = starpu_task_expected_data_transfer_time(memory_node, task);
-				local_power[worker_ctx] = starpu_task_expected_power(task, perf_arch, nimpl);
+				local_task_length[worker_ctx][nimpl] = starpu_task_expected_length(task, perf_arch, nimpl);
+				local_data_penalty[worker_ctx][nimpl] = starpu_task_expected_data_transfer_time(memory_node, task);
+				local_power[worker_ctx][nimpl] = starpu_task_expected_power(task, perf_arch, nimpl);
+				double conversion_time = starpu_task_expected_conversion_time(task, perf_arch, nimpl);
+				if (conversion_time > 0.0)
+					local_task_length[worker_ctx][nimpl] += conversion_time;
 				//_STARPU_DEBUG("Scheduler heft bundle: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker_ctx],local_power[worker_ctx],worker,nimpl);
 				//_STARPU_DEBUG("Scheduler heft bundle: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker_ctx],local_power[worker_ctx],worker,nimpl);
 			}
 			}
-			
+
 			double ntasks_end = ntasks[worker] / starpu_worker_get_relative_speedup(perf_arch);
 			double ntasks_end = ntasks[worker] / starpu_worker_get_relative_speedup(perf_arch);
-			
+
 			if (ntasks_best == -1
 			if (ntasks_best == -1
-			    || (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
-			    || (!calibrating && local_task_length[worker_ctx] == -1.0) /* Not calibrating but this worker is being calibrated */
-			    || (calibrating && local_task_length[worker_ctx] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
-				) 
+			    || (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better worker */
+			    || (!calibrating && isnan(local_task_length[worker_ctx][nimpl])) /* Not calibrating but this worker is being calibrated */
+			    || (calibrating && isnan(local_task_length[worker_ctx][nimpl]) && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
+				)
 			{
 			{
 				ntasks_best_end = ntasks_end;
 				ntasks_best_end = ntasks_end;
 				ntasks_best = worker;
 				ntasks_best = worker;
 			}
 			}
-			
-			if (local_task_length[worker_ctx] == -1.0)
+
+			if (isnan(local_task_length[worker_ctx][nimpl]))
 				/* we are calibrating, we want to speed-up calibration time
 				/* we are calibrating, we want to speed-up calibration time
 				 * so we privilege non-calibrated tasks (but still
 				 * so we privilege non-calibrated tasks (but still
 				 * greedily distribute them to avoid dumb schedules) */
 				 * greedily distribute them to avoid dumb schedules) */
 				calibrating = 1;
 				calibrating = 1;
-			
-			if (local_task_length[worker_ctx] <= 0.0)
+
+			if (isnan(local_task_length[worker_ctx][nimpl])
+				|| _STARPU_IS_ZERO(local_task_length[worker_ctx][nimpl]))
 				/* there is no prediction available for that task
 				/* there is no prediction available for that task
-				 * with that arch yet, so switch to a greedy strategy */
+				 * with that arch (yet or at all), so switch to a greedy strategy */
 				unknown = 1;
 				unknown = 1;
-			
+
 			if (unknown)
 			if (unknown)
 				continue;
 				continue;
 
 
-			exp_end[worker_ctx] = exp_start[worker] + exp_len[worker] + local_task_length[worker_ctx];
+			exp_end[worker_ctx][nimpl] = exp_start[worker] + exp_len[worker] + local_task_length[worker_ctx][nimpl];
 			
 			
-			if (exp_end[worker_ctx] < best_exp_end)
+			if (exp_end[worker_ctx][nimpl] < best_exp_end)
 			{
 			{
 				/* a better solution was found */
 				/* a better solution was found */
-				best_exp_end = exp_end[worker_ctx];
-				best_impl = nimpl;
+				best_exp_end = exp_end[worker_ctx][nimpl];
+				nimpl_best = nimpl;
 			}
 			}
-			
-			if (local_power[worker_ctx] == -1.0)
-				local_power[worker_ctx] = 0.;
+
+			if (isnan(local_power[worker_ctx][nimpl]))
+				local_power[worker_ctx][nimpl] = 0.;
+
 		}
 		}
 		worker_ctx++;
 		worker_ctx++;
 	}
 	}
 
 
-	*forced_best = unknown?ntasks_best:-1;
+	*forced_worker = unknown?ntasks_best:-1;
+	*forced_impl = unknown?nimpl_best:-1;
 
 
 	*best_exp_endp = best_exp_end;
 	*best_exp_endp = best_exp_end;
 	*max_exp_endp = max_exp_end;
 	*max_exp_endp = max_exp_end;
-	
-	/* save the best implementation */
-	//_STARPU_DEBUG("Scheduler heft: kernel (%u)\n", best_impl);
-	_starpu_get_job_associated_to_task(task)->nimpl = best_impl;
 }
 }
 
 
+static int push_conversion_tasks(struct starpu_task *task, unsigned int workerid)
+{
+	unsigned i;
+	int ret;
+	unsigned int node = starpu_worker_get_memory_node(workerid);
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&sched_mutex[workerid]);
+	for (i = 0; i < task->cl->nbuffers; i++)
+	{
+		struct starpu_task *conversion_task;
+		starpu_data_handle_t handle;
+
+		handle = task->handles[i];
+		if (!_starpu_handle_needs_conversion_task(handle, node))
+			continue;
+
+		conversion_task = _starpu_create_conversion_task(handle, node);
+		conversion_task->execute_on_a_specific_worker = 1;
+		conversion_task->workerid = workerid;
+		conversion_task->mf_skip = 1;
+		ret = _starpu_task_submit_conversion_task(conversion_task, workerid);
+		STARPU_ASSERT(ret == 0);
+	}
+
+	for (i = 0; i < task->cl->nbuffers; i++)
+		task->handles[i]->mf_node = node;
+
+	task->execute_on_a_specific_worker = 1;
+	task->workerid = workerid;
+	task->mf_skip= 1;
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&sched_mutex[workerid]);
+
+	return 0;
+}
+
+
 static int _heft_push_task(struct starpu_task *task, unsigned prio, unsigned sched_ctx_id)
 static int _heft_push_task(struct starpu_task *task, unsigned prio, unsigned sched_ctx_id)
 {
 {
 	heft_data *hd = (heft_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
 	heft_data *hd = (heft_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
-	unsigned worker, worker_ctx = 0;
+	unsigned worker, nimpl, worker_ctx = 0;
 	int best = -1, best_id_ctx = -1;
 	int best = -1, best_id_ctx = -1;
-	
+	int selected_impl= -1;
+
 	/* this flag is set if the corresponding worker is selected because
 	/* this flag is set if the corresponding worker is selected because
 	   there is no performance prediction available yet */
 	   there is no performance prediction available yet */
-	int forced_best;
+	int forced_worker;
+	int forced_impl;
 	struct worker_collection *workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx_id);
 	struct worker_collection *workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx_id);
 
 
 	unsigned nworkers_ctx = workers->nworkers;
 	unsigned nworkers_ctx = workers->nworkers;
-	double local_task_length[nworkers_ctx];
-	double local_data_penalty[nworkers_ctx];
-	double local_power[nworkers_ctx];
-	double exp_end[nworkers_ctx];
+	double local_task_length[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
+	double local_data_penalty[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
+	double local_power[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
+	double exp_end[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
 	double max_exp_end = 0.0;
 	double max_exp_end = 0.0;
 
 
 	double best_exp_end;
 	double best_exp_end;
@@ -381,93 +485,116 @@ static int _heft_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 	 *	and detect if there is some calibration that needs to be done.
 	 *	and detect if there is some calibration that needs to be done.
 	 */
 	 */
 
 
-	struct starpu_task_bundle *bundle = task->bundle;
-
-	if(workers->init_cursor)
-		workers->init_cursor(workers);
+	starpu_task_bundle_t bundle = task->bundle;
 
 
 	compute_all_performance_predictions(task, local_task_length, exp_end,
 	compute_all_performance_predictions(task, local_task_length, exp_end,
-					    &max_exp_end, &best_exp_end,
-					    local_data_penalty,
-					    local_power, &forced_best, bundle, sched_ctx_id);
+					&max_exp_end, &best_exp_end,
+					local_data_penalty,
+					local_power, &forced_worker, &forced_impl,
+					bundle, sched_ctx_id);
+
 	/* If there is no prediction available for that task with that arch we
 	/* If there is no prediction available for that task with that arch we
 	 * want to speed-up calibration time so we force this measurement */
 	 * want to speed-up calibration time so we force this measurement */
-	if (forced_best != -1){
-		return push_task_on_best_worker(task, forced_best, 0.0, prio, sched_ctx_id);
+	if (forced_worker != -1)
+	{
+		_starpu_get_job_associated_to_task(task)->nimpl = forced_impl;
+
+		if (_starpu_task_uses_multiformat_handles(task) && !task->mf_skip)
+		{
+			/*
+			 * Our task uses multiformat handles, which may need to be converted.
+			 */
+			push_conversion_tasks(task, forced_worker);
+			prio = 0;
+		}
+
+		return push_task_on_best_worker(task, forced_worker, 0.0, 0.0, prio, sched_ctx_id;
 	}
 	}
-	
+
 	/*
 	/*
 	 *	Determine which worker optimizes the fitness metric which is a
 	 *	Determine which worker optimizes the fitness metric which is a
 	 *	trade-off between load-balancing, data locality, and energy
 	 *	trade-off between load-balancing, data locality, and energy
 	 *	consumption.
 	 *	consumption.
 	 */
 	 */
 	
 	
-	double fitness[nworkers_ctx];
+	double fitness[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
 	double best_fitness = -1;
 	double best_fitness = -1;
 
 
 	while(workers->has_next(workers))
 	while(workers->has_next(workers))
 	{
 	{
 		worker = workers->get_next(workers);
 		worker = workers->get_next(workers);
-		if (!starpu_worker_may_execute_task(worker, task, 0))
+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 		{
 		{
-		        worker_ctx++;
-			/* no one on that queue may execute this task */
-			continue;
-		}
+			if (!starpu_worker_can_execute_task(worker, task, nimpl))
+			{
+				worker_ctx++;
+				/* no one on that queue may execute this task */
+				continue;
+			}
 
 
-		fitness[worker_ctx] = hd->alpha*(exp_end[worker_ctx] - best_exp_end) 
-				+ hd->beta*(local_data_penalty[worker_ctx])
-				+ hd->_gamma*(local_power[worker_ctx]);
 
 
-		if (exp_end[worker_ctx] > max_exp_end)
+			fitness[worker_ctx][nimpl] = hd->alpha*(exp_end[worker_ctx][nimpl] - best_exp_end) 
+						+ hd->beta*(local_data_penalty[worker_ctx][nimpl])
+						+ hd->_gamma*(local_power[worker_ctx][nimpl]);
+
+		if (exp_end[worker_ctx][nimpl] > max_exp_end)
 			/* This placement will make the computation
 			/* This placement will make the computation
 			 * longer, take into account the idle
 			 * longer, take into account the idle
 			 * consumption of other cpus */
 			 * consumption of other cpus */
-			fitness[worker_ctx] += hd->_gamma * hd->idle_power * (exp_end[worker_ctx] - max_exp_end) / 1000000.0;
+			fitness[worker_ctx][nimpl] += hd->_gamma * hd->idle_power * (exp_end[worker_ctx][nimpl] - max_exp_end) / 1000000.0;
 
 
-		if (best == -1 || fitness[worker_ctx] < best_fitness)
+		if (best == -1 || fitness[worker_ctx][nimpl] < best_fitness)
 		{
 		{
 			/* we found a better solution */
 			/* we found a better solution */
-			best_fitness = fitness[worker_ctx];
+			best_fitness = fitness[worker_ctx][nimpl];
 			best = worker;
 			best = worker;
 			best_id_ctx = worker_ctx;
 			best_id_ctx = worker_ctx;
+			selected_impl = nimpl;
 		}
 		}
 		worker_ctx++;
 		worker_ctx++;
 	}
 	}
 
 
 	/* By now, we must have found a solution */
 	/* By now, we must have found a solution */
 	STARPU_ASSERT(best != -1);
 	STARPU_ASSERT(best != -1);
-	
+
 	/* we should now have the best worker in variable "best" */
 	/* we should now have the best worker in variable "best" */
-	double model_best;
+	double model_best, transfer_model_best;
 
 
 	if (bundle)
 	if (bundle)
 	{
 	{
 		/* If we have a task bundle, we have computed the expected
 		/* If we have a task bundle, we have computed the expected
 		 * length for the entire bundle, but not for the task alone. */
 		 * length for the entire bundle, but not for the task alone. */
 		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(best);
 		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(best);
-		model_best = starpu_task_expected_length(task, perf_arch,
-				_starpu_get_job_associated_to_task(task)->nimpl);
+		unsigned memory_node = starpu_worker_get_memory_node(best);
+		model_best = starpu_task_expected_length(task, perf_arch, selected_impl);
+		transfer_model_best = starpu_task_expected_data_transfer_time(memory_node, task);
 
 
 		/* Remove the task from the bundle since we have made a
 		/* Remove the task from the bundle since we have made a
 		 * decision for it, and that other tasks should not consider it
 		 * decision for it, and that other tasks should not consider it
 		 * anymore. */
 		 * anymore. */
-		PTHREAD_MUTEX_LOCK(&bundle->mutex);
-		int ret = starpu_task_bundle_remove(bundle, task);
-		
-		/* Perhaps the bundle was destroyed when removing the last
-		 * entry */
-		if (ret != 1)
-			PTHREAD_MUTEX_UNLOCK(&bundle->mutex);
+		starpu_task_bundle_remove(bundle, task);
 	}
 	}
 	else 
 	else 
 	{
 	{
-		model_best = local_task_length[best_id_ctx];
+		model_best = local_task_length[best_id_ctx][selected_impl];
+		transfer_model_best = local_data_penalty[best_id_ctx][selected_impl];
 	}
 	}
 
 
 	if(workers->init_cursor)
 	if(workers->init_cursor)
 		workers->deinit_cursor(workers);
 		workers->deinit_cursor(workers);
-	return push_task_on_best_worker(task, best, model_best, prio, sched_ctx_id);
+
+	_starpu_get_job_associated_to_task(task)->nimpl = selected_impl;
+
+	if (_starpu_task_uses_multiformat_handles(task) && !task->mf_skip)
+	{
+		/*
+		 * Our task uses multiformat handles, which may need to be converted.
+		 */
+		push_conversion_tasks(task, forced_worker);
+		prio = 0;
+	}
+
+	return push_task_on_best_worker(task, best, model_best, transfer_model_best, prio, sched_ctx_id);
 }
 }
 
 
 static int heft_push_task(struct starpu_task *task)
 static int heft_push_task(struct starpu_task *task)
@@ -510,13 +637,14 @@ static void heft_deinit(unsigned sched_ctx_id)
 	free(ht);
 	free(ht);
 }
 }
 
 
-struct starpu_sched_policy_s heft_policy = {
+struct starpu_sched_policy heft_policy = {
 	.init_sched = heft_init,
 	.init_sched = heft_init,
 	.deinit_sched = heft_deinit,
 	.deinit_sched = heft_deinit,
-	.push_task = heft_push_task, 
+	.push_task = heft_push_task,
 	.push_task_notify = heft_push_task_notify,
 	.push_task_notify = heft_push_task_notify,
 	.pop_task = NULL,
 	.pop_task = NULL,
 	.pop_every_task = NULL,
 	.pop_every_task = NULL,
+	.pre_exec_hook = heft_pre_exec_hook,
 	.post_exec_hook = heft_post_exec_hook,
 	.post_exec_hook = heft_post_exec_hook,
 	.add_workers = heft_add_workers	,
 	.add_workers = heft_add_workers	,
 	.remove_workers = heft_remove_workers,
 	.remove_workers = heft_remove_workers,

+ 22 - 17
src/sched_policies/parallel_greedy.c

@@ -19,10 +19,11 @@
 #include <core/workers.h>
 #include <core/workers.h>
 #include <sched_policies/fifo_queues.h>
 #include <sched_policies/fifo_queues.h>
 #include <common/barrier.h>
 #include <common/barrier.h>
+#include <sched_policies/detect_combined_workers.h>
 
 
 typedef struct pgreedy_data {
 typedef struct pgreedy_data {
-	struct starpu_fifo_taskq_s *fifo;
-	struct starpu_fifo_taskq_s *local_fifo[STARPU_NMAXWORKERS];
+	struct _starpu_fifo_taskq *fifo;
+	struct _starpu_fifo_taskq *local_fifo[STARPU_NMAXWORKERS];
 
 
 	int master_id[STARPU_NMAXWORKERS];
 	int master_id[STARPU_NMAXWORKERS];
 
 
@@ -60,7 +61,7 @@ static void initialize_pgreedy_policy(unsigned sched_ctx_id)
 
 
 	for (workerid_ctx = 0; workerid_ctx < nworkers_ctx; workerid_ctx++)
 	for (workerid_ctx = 0; workerid_ctx < nworkers_ctx; workerid_ctx++)
 	{
 	{
-    	        workerid = sched_ctx->workerids[workerid_ctx];
+		workerid = sched_ctx->workerids[workerid_ctx];
 		
 		
 		int cnt = possible_combinations_cnt[workerid]++;
 		int cnt = possible_combinations_cnt[workerid]++;
 		possible_combinations[workerid][cnt] = workerid;
 		possible_combinations[workerid][cnt] = workerid;
@@ -94,15 +95,15 @@ static void initialize_pgreedy_policy(unsigned sched_ctx_id)
 		}
 		}
 	}
 	}
 
 
-	PTHREAD_MUTEX_INIT(&data->sched_mutex, NULL);
-	PTHREAD_COND_INIT(&data->sched_cond, NULL);
+	_STARPU_PTHREAD_MUTEX_INIT(&data->sched_mutex, NULL);
+	_STARPU_PTHREAD_COND_INIT(&data->sched_cond, NULL);
 
 
 	for (workerid_ctx = 0; workerid_ctx < nworkers_ctx; workerid_ctx++)
 	for (workerid_ctx = 0; workerid_ctx < nworkers_ctx; workerid_ctx++)
 	{
 	{
 		workerid = sched_ctx->workerids[workerid_ctx];
 		workerid = sched_ctx->workerids[workerid_ctx];
 
 
-		PTHREAD_MUTEX_INIT(sched_ctx->sched_mutex[workerid], NULL);
-		PTHREAD_COND_INIT(sched_ctx->sched_cond[workerid], NULL);
+		_STARPU_PTHREAD_MUTEX_INIT(sched_ctx->sched_mutex[workerid], NULL);
+		_STARPU_PTHREAD_COND_INIT(sched_ctx->sched_cond[workerid], NULL);
 	}
 	}
 
 
 	for (workerid_ctx = 0; workerid_ctx < nworkers_ctx; workerid_ctx++)
 	for (workerid_ctx = 0; workerid_ctx < nworkers_ctx; workerid_ctx++)
@@ -204,20 +205,20 @@ static struct starpu_task *pop_task_pgreedy_policy(unsigned sched_ctx_id)
 			if (possible_combinations_size[workerid][i] > best_size)
 			if (possible_combinations_size[workerid][i] > best_size)
 			{
 			{
 				int combined_worker = possible_combinations[workerid][i];
 				int combined_worker = possible_combinations[workerid][i];
-				if (starpu_combined_worker_may_execute_task(combined_worker, task, 0))
+				if (starpu_combined_worker_can_execute_task(combined_worker, task, 0))
 				{
 				{
 					best_size = possible_combinations_size[workerid][i];
 					best_size = possible_combinations_size[workerid][i];
 					best_workerid = combined_worker;
 					best_workerid = combined_worker;
 				}
 				}
 			}
 			}
-		} 
+		}
 
 
 		/* In case nobody can execute this task, we let the master
 		/* In case nobody can execute this task, we let the master
 		 * worker take it anyway, so that it can discard it afterward.
 		 * worker take it anyway, so that it can discard it afterward.
 		 * */
 		 * */
 		if (best_workerid == -1)
 		if (best_workerid == -1)
 			return task;
 			return task;
-		
+
 		/* Is this a basic worker or a combined worker ? */
 		/* Is this a basic worker or a combined worker ? */
 		int nbasic_workers = (int)starpu_worker_get_count();
 		int nbasic_workers = (int)starpu_worker_get_count();
 		int is_basic_worker = (best_workerid < nbasic_workers);
 		int is_basic_worker = (best_workerid < nbasic_workers);
@@ -227,23 +228,24 @@ static struct starpu_task *pop_task_pgreedy_policy(unsigned sched_ctx_id)
 			/* The master is alone */
 			/* The master is alone */
 			return task;
 			return task;
 		}
 		}
-		else {
+		else
+		{
 			/* The master needs to dispatch the task between the
 			/* The master needs to dispatch the task between the
 			 * different combined workers */
 			 * different combined workers */
-			struct starpu_combined_worker_s *combined_worker;
+			struct _starpu_combined_worker *combined_worker;
 			combined_worker = _starpu_get_combined_worker_struct(best_workerid);
 			combined_worker = _starpu_get_combined_worker_struct(best_workerid);
 			int worker_size = combined_worker->worker_size;
 			int worker_size = combined_worker->worker_size;
 			int *combined_workerid = combined_worker->combined_workerid;
 			int *combined_workerid = combined_worker->combined_workerid;
 
 
-			starpu_job_t j = _starpu_get_job_associated_to_task(task);
+			struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
 			j->task_size = worker_size;
 			j->task_size = worker_size;
 			j->combined_workerid = best_workerid;
 			j->combined_workerid = best_workerid;
 			j->active_task_alias_count = 0;
 			j->active_task_alias_count = 0;
 
 
 			//fprintf(stderr, "POP -> size %d best_size %d\n", worker_size, best_size);
 			//fprintf(stderr, "POP -> size %d best_size %d\n", worker_size, best_size);
 
 
-			PTHREAD_BARRIER_INIT(&j->before_work_barrier, NULL, worker_size);
-			PTHREAD_BARRIER_INIT(&j->after_work_barrier, NULL, worker_size);
+			_STARPU_PTHREAD_BARRIER_INIT(&j->before_work_barrier, NULL, worker_size);
+			_STARPU_PTHREAD_BARRIER_INIT(&j->after_work_barrier, NULL, worker_size);
 
 
 			/* Dispatch task aliases to the different slaves */
 			/* Dispatch task aliases to the different slaves */
 			for (i = 1; i < worker_size; i++)
 			for (i = 1; i < worker_size; i++)
@@ -261,17 +263,20 @@ static struct starpu_task *pop_task_pgreedy_policy(unsigned sched_ctx_id)
 			return master_alias;
 			return master_alias;
 		}
 		}
 	}
 	}
-	else {
+	else
+	{
 		/* The worker is a slave */
 		/* The worker is a slave */
 		return _starpu_fifo_pop_task(data->local_fifo[workerid], workerid);
 		return _starpu_fifo_pop_task(data->local_fifo[workerid], workerid);
 	}
 	}
 }
 }
 
 
-struct starpu_sched_policy_s _starpu_sched_pgreedy_policy = {
+struct starpu_sched_policy _starpu_sched_pgreedy_policy =
+{
 	.init_sched = initialize_pgreedy_policy,
 	.init_sched = initialize_pgreedy_policy,
 	.deinit_sched = deinitialize_pgreedy_policy,
 	.deinit_sched = deinitialize_pgreedy_policy,
 	.push_task = push_task_pgreedy_policy,
 	.push_task = push_task_pgreedy_policy,
 	.pop_task = pop_task_pgreedy_policy,
 	.pop_task = pop_task_pgreedy_policy,
+	.pre_exec_hook = NULL,
 	.post_exec_hook = NULL,
 	.post_exec_hook = NULL,
 	.pop_every_task = NULL,
 	.pop_every_task = NULL,
 	.policy_name = "pgreedy",
 	.policy_name = "pgreedy",

+ 120 - 91
src/sched_policies/parallel_heft.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2012 inria
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  Télécom-SudParis
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -23,10 +24,19 @@
 #include <core/perfmodel/perfmodel.h>
 #include <core/perfmodel/perfmodel.h>
 #include <starpu_parameters.h>
 #include <starpu_parameters.h>
 #include <common/barrier.h>
 #include <common/barrier.h>
+#include <sched_policies/detect_combined_workers.h>
+
+#ifndef DBL_MIN
+#define DBL_MIN __DBL_MIN__
+#endif
+
+#ifndef DBL_MAX
+#define DBL_MAX __DBL_MAX__
+#endif
 
 
 static pthread_mutex_t big_lock;
 static pthread_mutex_t big_lock;
 
 
-static unsigned  ncombinedworkers;
+static unsigned nworkers, ncombinedworkers;
 //static enum starpu_perf_archtype applicable_perf_archtypes[STARPU_NARCH_VARIATIONS];
 //static enum starpu_perf_archtype applicable_perf_archtypes[STARPU_NARCH_VARIATIONS];
 //static unsigned napplicable_perf_archtypes = 0;
 //static unsigned napplicable_perf_archtypes = 0;
 
 
@@ -50,17 +60,19 @@ static void parallel_heft_post_exec_hook(struct starpu_task *task, unsigned sche
 	int workerid = starpu_worker_get_id();
 	int workerid = starpu_worker_get_id();
 	double model = task->predicted;
 	double model = task->predicted;
 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
-	if (model < 0.0)
+	double transfer_model = task->predicted_transfer;
+
+	if (isnan(model))
 		model = 0.0;
 		model = 0.0;
-	
+
 	/* Once we have executed the task, we can update the predicted amount
 	/* Once we have executed the task, we can update the predicted amount
 	 * of work. */
 	 * of work. */
-	PTHREAD_MUTEX_LOCK(sched_ctx->sched_mutex[workerid]);
-	worker_exp_len[workerid] -= model;
+	_STARPU_PTHREAD_MUTEX_LOCK(sched_ctx->sched_mutex[workerid]);
+	worker_exp_len[workerid] -= model + transfer_model;
 	worker_exp_start[workerid] = starpu_timing_now();
 	worker_exp_start[workerid] = starpu_timing_now();
 	worker_exp_end[workerid] = worker_exp_start[workerid] + worker_exp_len[workerid];
 	worker_exp_end[workerid] = worker_exp_start[workerid] + worker_exp_len[workerid];
 	ntasks[workerid]--;
 	ntasks[workerid]--;
-	PTHREAD_MUTEX_UNLOCK(sched_ctx->sched_mutex[workerid]);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(sched_ctx->sched_mutex[workerid]);
 }
 }
 
 
 static int push_task_on_best_worker(struct starpu_task *task, int best_workerid, double exp_end_predicted, int prio, struct starpu_sched_ctx *sched_ctx)
 static int push_task_on_best_worker(struct starpu_task *task, int best_workerid, double exp_end_predicted, int prio, struct starpu_sched_ctx *sched_ctx)
@@ -73,7 +85,7 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 	int nbasic_workers = sched_ctx->nworkers;
 	int nbasic_workers = sched_ctx->nworkers;
 	int is_basic_worker = (best_workerid < nbasic_workers);
 	int is_basic_worker = (best_workerid < nbasic_workers);
 
 
-	unsigned memory_node; 
+	unsigned memory_node;
 	memory_node = starpu_worker_get_memory_node(best_workerid);
 	memory_node = starpu_worker_get_memory_node(best_workerid);
 
 
 	if (starpu_get_prefetch_flag())
 	if (starpu_get_prefetch_flag())
@@ -81,33 +93,37 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
 
 	int ret = 0;
 	int ret = 0;
 
 
-	PTHREAD_MUTEX_LOCK(&big_lock);
+	_STARPU_PTHREAD_MUTEX_LOCK(&big_lock);
 
 
 	if (is_basic_worker)
 	if (is_basic_worker)
 	{
 	{
 		task->predicted = exp_end_predicted - worker_exp_end[best_workerid];
 		task->predicted = exp_end_predicted - worker_exp_end[best_workerid];
-		worker_exp_len[best_workerid] += exp_end_predicted - worker_exp_end[best_workerid];
+		/* TODO */
+		task->predicted_transfer = 0;
+		worker_exp_len[best_workerid] += task->predicted;
 		worker_exp_end[best_workerid] = exp_end_predicted;
 		worker_exp_end[best_workerid] = exp_end_predicted;
 		worker_exp_start[best_workerid] = exp_end_predicted - worker_exp_len[best_workerid];
 		worker_exp_start[best_workerid] = exp_end_predicted - worker_exp_len[best_workerid];
-	
+
 		ntasks[best_workerid]++;
 		ntasks[best_workerid]++;
 
 
 		ret = starpu_push_local_task(best_workerid, task, prio);
 		ret = starpu_push_local_task(best_workerid, task, prio);
 	}
 	}
-	else {
+	else
+	{
 		/* This is a combined worker so we create task aliases */
 		/* This is a combined worker so we create task aliases */
-		struct starpu_combined_worker_s *combined_worker;
+		struct _starpu_combined_worker *combined_worker;
 		combined_worker = _starpu_get_combined_worker_struct(best_workerid);
 		combined_worker = _starpu_get_combined_worker_struct(best_workerid);
 		int worker_size = combined_worker->worker_size;
 		int worker_size = combined_worker->worker_size;
 		int *combined_workerid = combined_worker->combined_workerid;
 		int *combined_workerid = combined_worker->combined_workerid;
 
 
-		starpu_job_t j = _starpu_get_job_associated_to_task(task);
+		struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
 		j->task_size = worker_size;
 		j->task_size = worker_size;
 		j->combined_workerid = best_workerid;
 		j->combined_workerid = best_workerid;
 		j->active_task_alias_count = 0;
 		j->active_task_alias_count = 0;
+		task->predicted_transfer = 0;
 
 
-		PTHREAD_BARRIER_INIT(&j->before_work_barrier, NULL, worker_size);
-		PTHREAD_BARRIER_INIT(&j->after_work_barrier, NULL, worker_size);
+		_STARPU_PTHREAD_BARRIER_INIT(&j->before_work_barrier, NULL, worker_size);
+		_STARPU_PTHREAD_BARRIER_INIT(&j->after_work_barrier, NULL, worker_size);
 
 
 		int i;
 		int i;
 		for (i = 0; i < worker_size; i++)
 		for (i = 0; i < worker_size; i++)
@@ -116,31 +132,34 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 			int local_worker = combined_workerid[i];
 			int local_worker = combined_workerid[i];
 
 
 			alias->predicted = exp_end_predicted - worker_exp_end[local_worker];
 			alias->predicted = exp_end_predicted - worker_exp_end[local_worker];
-	
-			worker_exp_len[local_worker] += exp_end_predicted - worker_exp_end[local_worker];
+			/* TODO */
+			alias->predicted_transfer = 0;
+
+			worker_exp_len[local_worker] += alias->predicted;
 			worker_exp_end[local_worker] = exp_end_predicted;
 			worker_exp_end[local_worker] = exp_end_predicted;
 			worker_exp_start[local_worker] = exp_end_predicted - worker_exp_len[local_worker];
 			worker_exp_start[local_worker] = exp_end_predicted - worker_exp_len[local_worker];
-		
+
 			ntasks[local_worker]++;
 			ntasks[local_worker]++;
-	
+
 			ret |= starpu_push_local_task(local_worker, alias, prio);
 			ret |= starpu_push_local_task(local_worker, alias, prio);
 		}
 		}
 
 
 	}
 	}
 
 
-	PTHREAD_MUTEX_UNLOCK(&big_lock);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&big_lock);
 
 
 	return ret;
 	return ret;
 }
 }
 
 
-static double compute_expected_end(int workerid, double length, int nworkers)
+static double compute_expected_end(int workerid, double length)
 {
 {
 	if (workerid < (int)nworkers)
 	if (workerid < (int)nworkers)
 	{
 	{
 		/* This is a basic worker */
 		/* This is a basic worker */
 		return worker_exp_start[workerid] + worker_exp_len[workerid] + length;
 		return worker_exp_start[workerid] + worker_exp_len[workerid] + length;
 	}
 	}
-	else {
+	else
+	{
 		/* This is a combined worker, the expected end is the end for the latest worker */
 		/* This is a combined worker, the expected end is the end for the latest worker */
 		int worker_size;
 		int worker_size;
 		int *combined_workerid;
 		int *combined_workerid;
@@ -161,7 +180,7 @@ static double compute_expected_end(int workerid, double length, int nworkers)
 	}
 	}
 }
 }
 
 
-static double compute_ntasks_end(int workerid, int nworkers)
+static double compute_ntasks_end(int workerid)
 {
 {
 	enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(workerid);
 	enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(workerid);
 	if (workerid < (int)nworkers)
 	if (workerid < (int)nworkers)
@@ -169,7 +188,8 @@ static double compute_ntasks_end(int workerid, int nworkers)
 		/* This is a basic worker */
 		/* This is a basic worker */
 		return ntasks[workerid] / starpu_worker_get_relative_speedup(perf_arch);
 		return ntasks[workerid] / starpu_worker_get_relative_speedup(perf_arch);
 	}
 	}
-	else {
+	else
+	{
 		/* This is a combined worker, the expected end is the end for the latest worker */
 		/* This is a combined worker, the expected end is the end for the latest worker */
 		int worker_size;
 		int worker_size;
 		int *combined_workerid;
 		int *combined_workerid;
@@ -181,7 +201,7 @@ static double compute_ntasks_end(int workerid, int nworkers)
 		for (i = 0; i < worker_size; i++)
 		for (i = 0; i < worker_size; i++)
 		{
 		{
 			/* XXX: this is actually bogus: not all pushed tasks are necessarily parallel... */
 			/* XXX: this is actually bogus: not all pushed tasks are necessarily parallel... */
-			ntasks_end = STARPU_MAX(ntasks_end, ntasks[combined_workerid[i]] / starpu_worker_get_relative_speedup(perf_arch));
+			ntasks_end = STARPU_MAX(ntasks_end, (int) ((double) ntasks[combined_workerid[i]] / starpu_worker_get_relative_speedup(perf_arch)));
 		}
 		}
 
 
 		return ntasks_end;
 		return ntasks_end;
@@ -199,22 +219,22 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, uns
 	
 	
 	/* this flag is set if the corresponding worker is selected because
 	/* this flag is set if the corresponding worker is selected because
 	   there is no performance prediction available yet */
 	   there is no performance prediction available yet */
-	int forced_best = -1, forced_best_ctx = -1;
+	int forced_best = -1, forced_best_ctx = -1, forced_nimpl = -1;
 
 
-	double local_task_length[nworkers_ctx + ncombinedworkers];
-	double local_data_penalty[nworkers_ctx + ncombinedworkers];
-	double local_power[nworkers_ctx + ncombinedworkers];
-	double local_exp_end[nworkers_ctx + ncombinedworkers];
-	double fitness[nworkers_ctx + ncombinedworkers];
+	double local_task_length[nworkers_ctx + ncombinedworkers][STARPU_MAXIMPLEMENTATIONS];
+	double local_data_penalty[nworkers_ctx + ncombinedworkers][STARPU_MAXIMPLEMENTATIONS];
+	double local_power[nworkers_ctx + ncombinedworkers][STARPU_MAXIMPLEMENTATIONS];
+	double local_exp_end[nworkers_ctx + ncombinedworkers][STARPU_MAXIMPLEMENTATIONS];
+	double fitness[nworkers_ctx + ncombinedworkers][STARPU_MAXIMPLEMENTATIONS];
 
 
 	double max_exp_end = 0.0;
 	double max_exp_end = 0.0;
 
 
-	int skip_worker[nworkers_ctx + ncombinedworkers];
+	int skip_worker[nworkers_ctx + ncombinedworkers][STARPU_MAXIMPLEMENTATIONS];
 
 
 	double best_exp_end = DBL_MAX;
 	double best_exp_end = DBL_MAX;
 	//double penality_best = 0.0;
 	//double penality_best = 0.0;
 
 
-	int ntasks_best = -1, ntasks_best_ctx = -1;
+	int ntasks_best = -1, ntasks_best_ctx = -1, nimpl_best = -1;
 	double ntasks_best_end = 0.0;
 	double ntasks_best_end = 0.0;
 	int calibrating = 0;
 	int calibrating = 0;
 
 
@@ -232,48 +252,51 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, uns
 	}
 	}
 
 
 	unsigned nimpl;
 	unsigned nimpl;
-	unsigned best_impl = 0;
 	for (worker_ctx = 0; worker_ctx < (nworkers_ctx + ncombinedworkers); worker_ctx++)
 	for (worker_ctx = 0; worker_ctx < (nworkers_ctx + ncombinedworkers); worker_ctx++)
  	{
  	{
 		worker = sched_ctx->workerids[worker_ctx];
 		worker = sched_ctx->workerids[worker_ctx];
 		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 		{
 		{
-			if (!starpu_combined_worker_may_execute_task(worker, task, nimpl))
+			if (!starpu_combined_worker_can_execute_task(worker, task, nimpl))
 			{
 			{
 				/* no one on that queue may execute this task */
 				/* no one on that queue may execute this task */
-				skip_worker[worker] = 1;
+				skip_worker[worker][nimpl] = 1;
 				continue;
 				continue;
 			}
 			}
-			else {
-				skip_worker[worker] = 0;
+			else
+			{
+				skip_worker[worker][nimpl] = 0;
 			}
 			}
 
 
 			enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
 			enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
 
 
-			local_task_length[worker_ctx] = starpu_task_expected_length(task, perf_arch,nimpl);
+			local_task_length[worker_ctx][nimpl] = starpu_task_expected_length(task, perf_arch,nimpl);
 
 
 			unsigned memory_node = starpu_worker_get_memory_node(worker);
 			unsigned memory_node = starpu_worker_get_memory_node(worker);
-			local_data_penalty[worker_ctx] = starpu_task_expected_data_transfer_time(memory_node, task);
+			local_data_penalty[worker_ctx][nimpl] = starpu_task_expected_data_transfer_time(memory_node, task);
 
 
-			double ntasks_end = compute_ntasks_end(worker, nworkers_ctx);
+			double ntasks_end = compute_ntasks_end(worker);
 
 
 			if (ntasks_best == -1
 			if (ntasks_best == -1
-					|| (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
-					|| (!calibrating && local_task_length[worker] == -1.0) /* Not calibrating but this worker is being calibrated */
-					|| (calibrating && local_task_length[worker] == -1.0 && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
-					) {
+			    || (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
+			    || (!calibrating && isnan(local_task_length[worker_ctx][nimpl])) /* Not calibrating but this worker is being calibrated */
+			    || (calibrating && isnan(local_task_length[worker_ctx][nimpl]) && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
+					)
+			{
 				ntasks_best_end = ntasks_end;
 				ntasks_best_end = ntasks_end;
 				ntasks_best = worker;
 				ntasks_best = worker;
 				ntasks_best_ctx = worker_ctx;
 				ntasks_best_ctx = worker_ctx;
+				nimpl_best = nimpl;
 			}
 			}
 
 
-			if (local_task_length[worker_ctx] == -1.0)
+			if (isnan(local_task_length[worker_ctx][nimpl]))
 				/* we are calibrating, we want to speed-up calibration time
 				/* we are calibrating, we want to speed-up calibration time
 				 * so we privilege non-calibrated tasks (but still
 				 * so we privilege non-calibrated tasks (but still
 				 * greedily distribute them to avoid dumb schedules) */
 				 * greedily distribute them to avoid dumb schedules) */
 				calibrating = 1;
 				calibrating = 1;
 
 
-			if (local_task_length[worker_ctx] <= 0.0)
+			if (isnan(local_task_length[worker_ctx][nimpl])
+					|| _STARPU_IS_ZERO(local_task_length[worker_ctx][nimpl]))
 				/* there is no prediction available for that task
 				/* there is no prediction available for that task
 				 * with that arch yet, so switch to a greedy strategy */
 				 * with that arch yet, so switch to a greedy strategy */
 				unknown = 1;
 				unknown = 1;
@@ -281,23 +304,23 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, uns
 			if (unknown)
 			if (unknown)
 				continue;
 				continue;
 
 
-			local_exp_end[worker_ctx] = compute_expected_end(worker, local_task_length[worker], nworkers_ctx);
+			local_exp_end[worker_ctx][nimpl] = compute_expected_end(worker, local_task_length[worker_ctx][nimpl]);
 
 
-			//fprintf(stderr, "WORKER %d -> length %e end %e\n", worker, local_task_length[worker], local_exp_end[worker]);
+			//fprintf(stderr, "WORKER %d -> length %e end %e\n", worker, local_task_length[worker_ctx][nimpl], local_exp_end[worker][nimpl]);
 
 
-			if (local_exp_end[worker_ctx] < best_exp_end)
+			if (local_exp_end[worker_ctx][nimpl] < best_exp_end)
 			{
 			{
 				/* a better solution was found */
 				/* a better solution was found */
-				best_exp_end = local_exp_end[worker_ctx];
-				best_impl = nimpl;
+				best_exp_end = local_exp_end[worker_ctx][nimpl];
+				nimpl_best = nimpl;
 			}
 			}
 
 
 
 
-			local_power[worker_ctx] = starpu_task_expected_power(task, perf_arch,nimpl);
+			local_power[worker_ctx][nimpl] = starpu_task_expected_power(task, perf_arch,nimpl);
 			//_STARPU_DEBUG("Scheduler parallel heft: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker],local_power[worker],worker,nimpl);
 			//_STARPU_DEBUG("Scheduler parallel heft: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker],local_power[worker],worker,nimpl);
 
 
-			if (local_power[worker_ctx] == -1.0)
-				local_power[worker_ctx] = 0.;
+			if (isnan(local_power[worker_ctx][nimpl]))
+				local_power[worker_ctx][nimpl] = 0.;
 
 
 		} //end for
 		} //end for
 	}
 	}
@@ -306,9 +329,9 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, uns
 	{
 	{
 		forced_best = ntasks_best;
 		forced_best = ntasks_best;
 		forced_best_ctx = ntasks_best_ctx;
 		forced_best_ctx = ntasks_best_ctx;
+		forced_nimpl = nimpl_best;
 	}
 	}
 
 
-
 	double best_fitness = -1;
 	double best_fitness = -1;
 
 
 
 
@@ -320,32 +343,35 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, uns
 			worker = worker_ctx >= nworkers_ctx ? worker_ctx : 
 			worker = worker_ctx >= nworkers_ctx ? worker_ctx : 
 				sched_ctx->workerids[worker_ctx];
 				sched_ctx->workerids[worker_ctx];
 
 
-			if (skip_worker[worker_ctx])
+			for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 			{
 			{
-				/* no one on that queue may execute this task */
-				continue;
+				if (skip_worker[worker_ctx][nimpl])
+				{
+					/* no one on that queue may execute this task */
+					continue;
+				}
+
+				fitness[worker_ctx][nimpl] = hd->alpha*(local_exp_end[worker_ctx][nimpl] - best_exp_end) 
+						+ hd->beta*(local_data_penalty[worker_ctx][nimpl])
+						+ hd->_gamma*(local_power[worker_ctx][nimpl]);
+
+				if (local_exp_end[worker_ctx][nimpl] > max_exp_end)
+					/* This placement will make the computation
+					 * longer, take into account the idle
+					 * consumption of other cpus */
+					fitness[worker_ctx][nimpl] += hd->_gamma * hd->idle_power * (local_exp_end[worker_ctx][nimpl] - max_exp_end) / 1000000.0;
+
+				if (best == -1 || fitness[worker_ctx] < best_fitness)
+				{
+					/* we found a better solution */
+					best_fitness = fitness[worker_ctx][nimpl];
+					best = worker;
+					best_id_ctx = worker_ctx;
+					nimpl_best = nimpl;
+				}
+
+			//	fprintf(stderr, "FITNESS worker %d -> %e local_exp_end %e - local_data_penalty %e\n", worker, fitness[worker][nimpl], local_exp_end[worker][nimpl] - best_exp_end, local_data_penalty[worker][nimpl]);
 			}
 			}
-	
-			fitness[worker_ctx] = hd->alpha*(local_exp_end[worker_ctx] - best_exp_end) 
-					+ hd->beta*(local_data_penalty[worker_ctx])
-					+ hd->_gamma*(local_power[worker_ctx]);
-
-			if (local_exp_end[worker_ctx] > max_exp_end)
-				/* This placement will make the computation
-				 * longer, take into account the idle
-				 * consumption of other cpus */
-				fitness[worker_ctx] += hd->_gamma * hd->idle_power * (local_exp_end[worker_ctx] - max_exp_end) / 1000000.0;
-
-			if (best == -1 || fitness[worker_ctx] < best_fitness)
-			{
-				/* we found a better solution */
-				best_fitness = fitness[worker_ctx];
-				best = worker;
-				best_id_ctx = worker_ctx;
-			}
-
-		//	fprintf(stderr, "FITNESS worker %d -> %e local_exp_end %e - local_data_penalty %e\n", worker, fitness[worker], local_exp_end[worker] - best_exp_end, local_data_penalty[worker]);
-		}
 	}
 	}
 
 
 	STARPU_ASSERT(forced_best != -1 || best != -1);
 	STARPU_ASSERT(forced_best != -1 || best != -1);
@@ -357,18 +383,19 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, uns
 		 * so we force this measurement */
 		 * so we force this measurement */
 		best = forced_best;
 		best = forced_best;
 		best_id_ctx = forced_best_ctx;
 		best_id_ctx = forced_best_ctx;
+		nimpl_best = forced_nimpl;
 		//penality_best = 0.0;
 		//penality_best = 0.0;
-		best_exp_end = local_exp_end[best_id_ctx];
+		best_exp_end = compute_expected_end(best, 0);
 	}
 	}
-	else 
+	else
 	{
 	{
-                //penality_best = local_data_penalty[best];
-		best_exp_end = local_exp_end[best_id_ctx];
+		//penality_best = local_data_penalty[best_id_ctx][nimpl_best];
+		best_exp_end = local_exp_end[best_id_ctx][nimpl_best];
 	}
 	}
 
 
 
 
-	//_STARPU_DEBUG("Scheduler parallel heft: kernel (%u)\n", best_impl);
-	_starpu_get_job_associated_to_task(task)->nimpl = best_impl;
+	//_STARPU_DEBUG("Scheduler parallel heft: kernel (%u)\n", nimpl_best);
+	_starpu_get_job_associated_to_task(task)->nimpl = nimpl_best;
 	/* we should now have the best worker in variable "best" */
 	/* we should now have the best worker in variable "best" */
 	return push_task_on_best_worker(task, best, best_exp_end, prio, sched_ctx);
 	return push_task_on_best_worker(task, best, best_exp_end, prio, sched_ctx);
 }
 }
@@ -452,11 +479,11 @@ static void initialize_parallel_heft_policy(unsigned sched_ctx_id)
 			worker_exp_end[workerid] = worker_exp_start[workerid]; 
 			worker_exp_end[workerid] = worker_exp_start[workerid]; 
 			ntasks[workerid] = 0;
 			ntasks[workerid] = 0;
 		}
 		}
-		PTHREAD_MUTEX_INIT(sched_ctx->sched_mutex[workerid], NULL);
-		PTHREAD_COND_INIT(sched_ctx->sched_cond[workerid], NULL);
+		_STARPU_PTHREAD_MUTEX_INIT(sched_ctx->sched_mutex[workerid], NULL);
+		_STARPU_PTHREAD_COND_INIT(sched_ctx->sched_cond[workerid], NULL);
 	}
 	}
 
 
-	PTHREAD_MUTEX_INIT(&big_lock, NULL);
+	_STARPU_PTHREAD_MUTEX_INIT(&big_lock, NULL);
 
 
 	/* We pre-compute an array of all the perfmodel archs that are applicable */
 	/* We pre-compute an array of all the perfmodel archs that are applicable */
 	unsigned total_worker_count = nworkers_ctx + ncombinedworkers;
 	unsigned total_worker_count = nworkers_ctx + ncombinedworkers;
@@ -500,12 +527,14 @@ static void parallel_heft_deinit(unsigned sched_ctx_id)
 }
 }
 
 
 /* TODO: use post_exec_hook to fix the expected start */
 /* TODO: use post_exec_hook to fix the expected start */
-struct starpu_sched_policy_s _starpu_sched_parallel_heft_policy = {
+struct starpu_sched_policy _starpu_sched_parallel_heft_policy =
+{
 	.init_sched = initialize_parallel_heft_policy,
 	.init_sched = initialize_parallel_heft_policy,
 	.init_sched_for_workers = parallel_heft_init_for_workers,
 	.init_sched_for_workers = parallel_heft_init_for_workers,
 	.deinit_sched = parallel_heft_deinit,
 	.deinit_sched = parallel_heft_deinit,
 	.push_task = parallel_heft_push_task, 
 	.push_task = parallel_heft_push_task, 
 	.pop_task = NULL,
 	.pop_task = NULL,
+	.pre_exec_hook = NULL,
 	.post_exec_hook = parallel_heft_post_exec_hook,
 	.post_exec_hook = parallel_heft_post_exec_hook,
 	.pop_every_task = NULL,
 	.pop_every_task = NULL,
 	.policy_name = "pheft",
 	.policy_name = "pheft",

+ 7 - 4
src/sched_policies/random_policy.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  INRIA
  * Copyright (C) 2011  INRIA
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -52,7 +52,8 @@ static int _random_push_task(struct starpu_task *task, unsigned prio, struct sta
 		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
 		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
 		double worker_alpha = starpu_worker_get_relative_speedup(perf_arch);
 		double worker_alpha = starpu_worker_get_relative_speedup(perf_arch);
 
 
-		if (alpha + worker_alpha > random) {
+		if (alpha + worker_alpha > random && starpu_worker_can_execute_task(worker, task, 0))
+		{
 			/* we found the worker */
 			/* we found the worker */
 			selected = worker;
 			selected = worker;
 			break;
 			break;
@@ -72,7 +73,7 @@ static int random_push_task(struct starpu_task *task, unsigned sched_ctx_id)
 {
 {
 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 
 
-    return _random_push_task(task, 0, sched_ctx);
+    return _random_push_task(task, !!task->priority, sched_ctx);
 }
 }
 
 
 static void initialize_random_policy_for_workers(unsigned sched_ctx_id, int *workerids, unsigned nnew_workers) 
 static void initialize_random_policy_for_workers(unsigned sched_ctx_id, int *workerids, unsigned nnew_workers) 
@@ -109,12 +110,14 @@ static void initialize_random_policy(unsigned sched_ctx_id)
 	}
 	}
 }
 }
 
 
-struct starpu_sched_policy_s _starpu_sched_random_policy = {
+struct starpu_sched_policy _starpu_sched_random_policy =
+{
 	.init_sched = initialize_random_policy,
 	.init_sched = initialize_random_policy,
 	.init_sched_for_workers = initialize_random_policy_for_workers,
 	.init_sched_for_workers = initialize_random_policy_for_workers,
 	.deinit_sched = NULL,
 	.deinit_sched = NULL,
 	.push_task = random_push_task,
 	.push_task = random_push_task,
 	.pop_task = NULL,
 	.pop_task = NULL,
+	.pre_exec_hook = NULL,
 	.post_exec_hook = NULL,
 	.post_exec_hook = NULL,
 	.pop_every_task = NULL,
 	.pop_every_task = NULL,
 	.policy_name = "random",
 	.policy_name = "random",

+ 26 - 26
src/sched_policies/stack_queues.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -22,7 +22,7 @@
 #include <errno.h>
 #include <errno.h>
 #include <common/utils.h>
 #include <common/utils.h>
 
 
-/* keep track of the total number of jobs to be scheduled to avoid infinite 
+/* keep track of the total number of jobs to be scheduled to avoid infinite
  * polling when there are really few jobs in the overall queue */
  * polling when there are really few jobs in the overall queue */
 static unsigned total_number_of_jobs;
 static unsigned total_number_of_jobs;
 
 
@@ -31,12 +31,12 @@ void _starpu_init_stack_queues_mechanisms(void)
 	total_number_of_jobs = 0;
 	total_number_of_jobs = 0;
 }
 }
 
 
-struct starpu_stack_jobq_s *_starpu_create_stack(void)
+struct _starpu_stack_jobq *_starpu_create_stack(void)
 {
 {
-	struct starpu_stack_jobq_s *stack;
-	stack = (struct starpu_stack_jobq_s *) malloc(sizeof(struct starpu_stack_jobq_s));
+	struct _starpu_stack_jobq *stack;
+	stack = (struct _starpu_stack_jobq *) malloc(sizeof(struct _starpu_stack_jobq));
 
 
-	stack->jobq = starpu_job_list_new();
+	stack->jobq = _starpu_job_list_new();
 	stack->njobs = 0;
 	stack->njobs = 0;
 	stack->nprocessed = 0;
 	stack->nprocessed = 0;
 
 
@@ -47,58 +47,58 @@ struct starpu_stack_jobq_s *_starpu_create_stack(void)
 	return stack;
 	return stack;
 }
 }
 
 
-unsigned _starpu_get_stack_njobs(struct starpu_stack_jobq_s *stack_queue)
+unsigned _starpu_get_stack_njobs(struct _starpu_stack_jobq *stack_queue)
 {
 {
 	return stack_queue->njobs;
 	return stack_queue->njobs;
 }
 }
 
 
-unsigned _starpu_get_stack_nprocessed(struct starpu_stack_jobq_s *stack_queue)
+unsigned _starpu_get_stack_nprocessed(struct _starpu_stack_jobq *stack_queue)
 {
 {
 	return stack_queue->nprocessed;
 	return stack_queue->nprocessed;
 }
 }
 
 
-void _starpu_stack_push_task(struct starpu_stack_jobq_s *stack_queue, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, starpu_job_t task)
+void _starpu_stack_push_task(struct _starpu_stack_jobq *stack_queue, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct _starpu_job *task)
 {
 {
-	PTHREAD_MUTEX_LOCK(sched_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
 	total_number_of_jobs++;
 	total_number_of_jobs++;
 
 
-	STARPU_TRACE_JOB_PUSH(task, 0);
+	_STARPU_TRACE_JOB_PUSH(task, 0);
 	if (task->task->priority)
 	if (task->task->priority)
-		starpu_job_list_push_back(stack_queue->jobq, task);
+		_starpu_job_list_push_back(stack_queue->jobq, task);
 	else
 	else
-		starpu_job_list_push_front(stack_queue->jobq, task);
+		_starpu_job_list_push_front(stack_queue->jobq, task);
 	stack_queue->njobs++;
 	stack_queue->njobs++;
 	stack_queue->nprocessed++;
 	stack_queue->nprocessed++;
 
 
-	PTHREAD_COND_SIGNAL(sched_cond);
-	PTHREAD_MUTEX_UNLOCK(sched_mutex);
+	_STARPU_PTHREAD_COND_SIGNAL(sched_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
 }
 }
 
 
-starpu_job_t _starpu_stack_pop_task(struct starpu_stack_jobq_s *stack_queue, pthread_mutex_t *sched_mutex, int workerid __attribute__ ((unused)))
+struct _starpu_job *_starpu_stack_pop_task(struct _starpu_stack_jobq *stack_queue, pthread_mutex_t *sched_mutex, int workerid __attribute__ ((unused)))
 {
 {
-	starpu_job_t j = NULL;
+	struct _starpu_job *j = NULL;
 
 
 	if (stack_queue->njobs == 0)
 	if (stack_queue->njobs == 0)
 		return NULL;
 		return NULL;
 
 
 	/* TODO find a task that suits workerid */
 	/* TODO find a task that suits workerid */
-	if (stack_queue->njobs > 0) 
+	if (stack_queue->njobs > 0)
 	{
 	{
 		/* there is a task */
 		/* there is a task */
-		j = starpu_job_list_pop_back(stack_queue->jobq);
-	
+		j = _starpu_job_list_pop_back(stack_queue->jobq);
+
 		STARPU_ASSERT(j);
 		STARPU_ASSERT(j);
 		stack_queue->njobs--;
 		stack_queue->njobs--;
-		
-		STARPU_TRACE_JOB_POP(j, 0);
 
 
-		/* we are sure that we got it now, so at worst, some people thought 
+		_STARPU_TRACE_JOB_POP(j, 0);
+
+		/* we are sure that we got it now, so at worst, some people thought
 		 * there remained some work and will soon discover it is not true */
 		 * there remained some work and will soon discover it is not true */
-		PTHREAD_MUTEX_LOCK(sched_mutex);
+		_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
 		total_number_of_jobs--;
 		total_number_of_jobs--;
-		PTHREAD_MUTEX_UNLOCK(sched_mutex);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
 	}
 	}
-	
+
 	return j;
 	return j;
 
 
 }
 }

+ 8 - 7
src/sched_policies/stack_queues.h

@@ -23,9 +23,10 @@
 #include <common/config.h>
 #include <common/config.h>
 #include <core/jobs.h>
 #include <core/jobs.h>
 
 
-struct starpu_stack_jobq_s {
+struct _starpu_stack_jobq
+{
 	/* the actual list */
 	/* the actual list */
-	starpu_job_list_t jobq;
+	struct _starpu_job_list *jobq;
 
 
 	/* the number of tasks currently in the queue */
 	/* the number of tasks currently in the queue */
 	unsigned njobs;
 	unsigned njobs;
@@ -39,17 +40,17 @@ struct starpu_stack_jobq_s {
 	double exp_len; /* Expected duration of the set of tasks in the queue */
 	double exp_len; /* Expected duration of the set of tasks in the queue */
 };
 };
 
 
-struct starpu_stack_jobq_s *_starpu_create_stack(void);
+struct _starpu_stack_jobq *_starpu_create_stack(void);
 
 
-void _starpu_stack_push_task(struct starpu_stack_jobq_s *stack, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, starpu_job_t task);
+void _starpu_stack_push_task(struct _starpu_stack_jobq *stack, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct _starpu_job *task);
 
 
-starpu_job_t _starpu_stack_pop_task(struct starpu_stack_jobq_s *stack, pthread_mutex_t *sched_mutex, int workerid);
+struct _starpu_job *_starpu_stack_pop_task(struct _starpu_stack_jobq *stack, pthread_mutex_t *sched_mutex, int workerid);
 
 
 void _starpu_init_stack_queues_mechanisms(void);
 void _starpu_init_stack_queues_mechanisms(void);
 
 
 
 
-unsigned _starpu_get_stack_njobs(struct starpu_stack_jobq_s *stack);
-unsigned _starpu_get_stack_nprocessed(struct starpu_stack_jobq_s *stack);
+unsigned _starpu_get_stack_njobs(struct _starpu_stack_jobq *stack);
+unsigned _starpu_get_stack_nprocessed(struct _starpu_stack_jobq *stack);
 
 
 
 
 #endif // __STACK_QUEUES_H__
 #endif // __STACK_QUEUES_H__

+ 227 - 97
src/sched_policies/work_stealing_policy.c

@@ -2,7 +2,7 @@
  *
  *
  * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
- * Copyright (C) 2011  INRIA
+ * Copyright (C) 2011, 2012  INRIA
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -18,11 +18,13 @@
 
 
 /* Work stealing policy */
 /* Work stealing policy */
 
 
+#include <float.h>
+
 #include <core/workers.h>
 #include <core/workers.h>
 #include <sched_policies/deque_queues.h>
 #include <sched_policies/deque_queues.h>
 
 
 typedef struct work_stealing_data{
 typedef struct work_stealing_data{
-	struct starpu_deque_jobq_s **queue_array;
+	struct _starpu_deque_jobq **queue_array;
 	unsigned rr_worker;
 	unsigned rr_worker;
 	/* keep track of the work performed from the beginning of the algorithm to make
 	/* keep track of the work performed from the beginning of the algorithm to make
 	 * better decisions about which queue to select when stealing or deferring work
 	 * better decisions about which queue to select when stealing or deferring work
@@ -30,107 +32,198 @@ typedef struct work_stealing_data{
 	unsigned performed_total;
 	unsigned performed_total;
 	pthread_mutex_t sched_mutex;
 	pthread_mutex_t sched_mutex;
 	pthread_cond_t sched_cond;
 	pthread_cond_t sched_cond;
+	unsigned last_pop_worker;
+static unsigned last_push_worker;
 } work_stealing_data;
 } work_stealing_data;
 
 
 #ifdef USE_OVERLOAD
 #ifdef USE_OVERLOAD
-static float overload_metric(struct starpu_deque_jobq_s *dequeue_queue, unsigned *performed_total)
+
+/**
+ * Minimum number of tasks that must have been processed before we start
+ * guessing on which worker the computation would be faster.
+ */
+static int calibration_value = 0;
+
+#endif /* USE_OVERLOAD */
+
+
+/**
+ * Return a worker from which a task can be stolen.
+ * Selecting a worker is done in a round-robin fashion, unless
+ * the worker previously selected doesn't own any task,
+ * then we return the first non-empty worker.
+ */
+static unsigned select_victim_round_robin(struct starpu_sched_ctx *sched_ctx)
 {
 {
-	float execution_ratio = 0.0f;
-	if (*performed_total > 0) {
-		execution_ratio = _starpu_get_deque_nprocessed(dequeue_queue)/ *performed_total;
+	work_stealing_data *ws = (work_stealing_data*)sched_ctx->policy_data;
+	unsigned worker = ws->last_pop_worker;
+
+	/* If the worker's queue is empty, let's try
+	 * the next ones */
+	while (!ws->queue_array[worker]->njobs)
+	{
+		worker = (worker + 1) % sched_ctx->nworkers;
+		if (worker == ws->last_pop_worker)
+		{
+			/* We got back to the first worker,
+			 * don't go in infinite loop */
+			break;
+		}
 	}
 	}
 
 
-	unsigned performed_queue;
-	performed_queue = _starpu_get_deque_nprocessed(dequeue_queue);
+	ws->last_pop_worker = (worker + 1) % sched_ctx->nworkers;
 
 
-	float current_ratio = 0.0f;
-	if (performed_queue > 0) {
-		current_ratio = _starpu_get_deque_njobs(dequeue_queue)/performed_queue;
-	}
-	
-	return (current_ratio - execution_ratio);
+	return worker;
 }
 }
 
 
-/* who to steal work to ? */
-static struct starpu_deque_jobq_s *select_victimq(work_stealing_data *ws, unsigned nworkers)
+/**
+ * Return a worker to which a task should be added.
+ * Selecting a worker is done in a round-robin fashion.
+ */
+static unsigned select_worker_round_robin(struct starpu_sched_ctx *sched_ctx)
 {
 {
-	struct starpu_deque_jobq_s *q;
+	work_stealing_data *ws = (work_stealing_data*)sched_ctx->policy_data;
+	unsigned worker = ws->last_push_worker;
 
 
-	unsigned attempts = nworkers;
+	last_push_worker = (last_push_worker + 1) % sched_ctx->nworkers;
 
 
-	unsigned worker = ws->rr_worker;
-	do {
-		if (overload_metric(worker) > 0.0f)
-		{
-			q = ws->queue_array[worker];
-			return q;
-		}
-		else {
-			worker = (worker + 1)%nworkers;
-		}
-	} while(attempts-- > 0);
+	return worker;
+}
+
+#ifdef USE_OVERLOAD
+
+/**
+ * Return a ratio helpful to determine whether a worker is suitable to steal
+ * tasks from or to put some tasks in its queue.
+ *
+ * \return	a ratio with a positive or negative value, describing the current state of the worker:
+ * 		a smaller value implies a faster worker with a relatively emptier queue: more suitable to put tasks in
+ * 		a bigger value implies a slower worker with a relatively fuller queue: more suitable to steal tasks from
+ */
+static float overload_metric(struct starpu_sched_ctx *sched_ctx, unsigned id)
+{
+	work_stealing_data *ws = (work_stealing_data*)sched_ctx->policy_data;
+	float execution_ratio = 0.0f;
+	float current_ratio = 0.0f;
+
+	int nprocessed = _starpu_get_deque_nprocessed(ws->queue_array[id]);
+	unsigned njobs = _starpu_get_deque_njobs(ws->queue_array[id]);
 
 
-	/* take one anyway ... */
-	q = ws->queue_array[ws->rr_worker];
-	ws->rr_worker = (ws->rr_worker + 1 )%nworkers;
+	/* Did we get enough information ? */
+	if (performed_total > 0 && nprocessed > 0)
+	{
+		/* How fast or slow is the worker compared to the other workers */
+		execution_ratio = (float) nprocessed / performed_total;
+		/* How replete is its queue */
+		current_ratio = (float) njobs / nprocessed;
+	}
+	else
+	{
+		return 0.0f;
+	}
 
 
-	return q;
+	return (current_ratio - execution_ratio);
 }
 }
 
 
-static struct starpu_deque_jobq_s *select_workerq(work_stealing_data *ws, unsigned nworkers)
+/**
+ * Return the most suitable worker from which a task can be stolen.
+ * The number of previously processed tasks, total and local,
+ * and the number of tasks currently waiting to be processed
+ * by each worker are taken into account to select the most suitable
+ * worker to steal a task from.
+ */
+static unsigned select_victim_overload(struct starpu_sched_ctx *sched_ctx)
 {
 {
-	struct starpu_deque_jobq_s *q;
+	unsigned worker, worker_ctx;
+	float  worker_ratio;
+	unsigned best_worker = 0;
+	float best_ratio = FLT_MIN;	
 
 
-	unsigned attempts = nworkers;
+	/* Don't try to play smart until we get
+	 * enough information. */
+	if (performed_total < calibration_value)
+		return select_victim_round_robin(sched_ctx);
 
 
-	unsigned worker = ws->rr_worker;
-	do {
-		if (overload_metric(worker) < 0.0f)
+	for (worker_ctx = 0; worker_ctx < sched_ctx->nworkers; worker_ctx++)
+	{
+		worker = sched_ctx->workerid[worker_ctx];
+		worker_ratio = overload_metric(worker);
+
+		if (worker_ratio > best_ratio)
 		{
 		{
-			q = ws->queue_array[worker];
-			return q;
-		}
-		else {
-			worker = (worker + 1)%nworkers;
+			best_worker = worker;
+			best_ratio = worker_ratio;
 		}
 		}
-	} while(attempts-- > 0);
-
-	/* take one anyway ... */
-	q = ws->queue_array[ws->rr_worker];
-	ws->rr_worker = (ws->rr_worker + 1 )%nworkers;
+	}
 
 
-	return q;
+	return best_worker;
 }
 }
 
 
-#else
-
-/* who to steal work to ? */
-static struct starpu_deque_jobq_s *select_victimq(work_stealing_data *ws, unsigned nworkers)
+/**
+ * Return the most suitable worker to which a task should be added.
+ * The number of previously processed tasks, total and local,
+ * and the number of tasks currently waiting to be processed
+ * by each worker are taken into account to select the most suitable
+ * worker to add a task to.
+ */
+static unsigned select_worker_overload(struct starpu_sched_ctx *sched_ctx)
 {
 {
-	struct starpu_deque_jobq_s *q;
+	unsigned worker, worker_ctx;
+	float  worker_ratio;
+	unsigned best_worker = 0;
+	float best_ratio = FLT_MAX;
 
 
-	q = ws->queue_array[ws->rr_worker];
+	/* Don't try to play smart until we get
+	 * enough information. */
+	if (performed_total < calibration_value)
+		return select_worker_round_robin(sched_ctx);
 
 
-	ws->rr_worker = (ws->rr_worker + 1 )%nworkers;
+	for (worker_ctx = 0; worker_ctx < sched_ctx->nworkers; worker_ctx++)
+	{
+		worker = sched_ctx->workerid[worker_ctx];
+		worker_ratio = overload_metric(sched_ctx,  worker);
 
 
-	return q;
-}
+		if (worker_ratio < best_ratio)
+		{
+			best_worker = worker;
+			best_ratio = worker_ratio;
+		}
+	}
 
 
+	return best_worker;
+}
 
 
-/* when anonymous threads submit tasks, 
- * we need to select a queue where to dispose them */
-static struct starpu_deque_jobq_s *select_workerq(work_stealing_data *ws, unsigned nworkers)
-{
-	struct starpu_deque_jobq_s *q;
+#endif /* USE_OVERLOAD */
 
 
-	q = ws->queue_array[ws->rr_worker];
 
 
-	ws->rr_worker = (ws->rr_worker + 1 )%nworkers;
+/**
+ * Return a worker from which a task can be stolen.
+ * This is a phony function used to call the right
+ * function depending on the value of USE_OVERLOAD.
+ */
+static inline unsigned select_victim(struct starpu_sched_ctx *sched_ctx)
+{
+#ifdef USE_OVERLOAD
+	return select_victim_overload(sched_ctx);
+#else
+	return select_victim_round_robin(sched_ctx);
+#endif /* USE_OVERLOAD */
+}
 
 
-	return q;
+/**
+ * Return a worker to which a task should be added.
+ * This is a phony function used to call the right
+ * function depending on the value of USE_OVERLOAD.
+ */
+static inline unsigned select_worker(struct starpu_sched_ctx *sched_ctx)
+{
+#ifdef USE_OVERLOAD
+	return select_worker_overload(sched_ctx);
+#else
+	return select_worker_round_robin(sched_ctx);
+#endif /* USE_OVERLOAD */
 }
 }
 
 
-#endif
 
 
 #ifdef STARPU_DEVEL
 #ifdef STARPU_DEVEL
 #warning TODO rewrite ... this will not scale at all now
 #warning TODO rewrite ... this will not scale at all now
@@ -141,64 +234,73 @@ static struct starpu_task *ws_pop_task(unsigned sched_ctx_id)
 	work_stealing_data *ws = (work_stealing_data*)sched_ctx->policy_data;
 	work_stealing_data *ws = (work_stealing_data*)sched_ctx->policy_data;
 
 
 	struct starpu_task *task;
 	struct starpu_task *task;
+	struct _starpu_deque_jobq *q;
 
 
 	int workerid = starpu_worker_get_id();
 	int workerid = starpu_worker_get_id();
 
 
-	struct starpu_deque_jobq_s *q;
+	STARPU_ASSERT(workerid != -1);
 
 
 	q = ws->queue_array[workerid];
 	q = ws->queue_array[workerid];
 
 
 	PTHREAD_MUTEX_LOCK(&ws->sched_mutex);
 	PTHREAD_MUTEX_LOCK(&ws->sched_mutex);
 
 
-	task = _starpu_deque_pop_task(q, -1);
-	if (task) {
+	task = _starpu_deque_pop_task(q, workerid);
+	if (task)
+	{
 		/* there was a local task */
 		/* there was a local task */
 		ws->performed_total++;
 		ws->performed_total++;
 		PTHREAD_MUTEX_UNLOCK(&ws->sched_mutex);
 		PTHREAD_MUTEX_UNLOCK(&ws->sched_mutex);
+		q->nprocessed++;
+		q->njobs--;
 		return task;
 		return task;
 	}
 	}
-	
+
 	/* we need to steal someone's job */
 	/* we need to steal someone's job */
-	struct starpu_deque_jobq_s *victimq;
-	victimq = select_victimq(ws, sched_ctx->nworkers);
+	unsigned victim = select_victim(sched_ctx);
+	struct _starpu_deque_jobq *victimq = ws->queue_array[victim];
 
 
 	task = _starpu_deque_pop_task(victimq, workerid);
 	task = _starpu_deque_pop_task(victimq, workerid);
-	if (task) {
-		STARPU_TRACE_WORK_STEALING(q, victimq);
+	if (task)
+	{
+		_STARPU_TRACE_WORK_STEALING(q, workerid);
 		ws->performed_total++;
 		ws->performed_total++;
-	}
 
 
-	PTHREAD_MUTEX_UNLOCK(&ws->sched_mutex);
+		/* Beware : we have to increase the number of processed tasks of
+		 * the stealer, not the victim ! */
+		q->nprocessed++;
+		victimq->njobs--;
+	}
 
 
 	return task;
 	return task;
 }
 }
 
 
 int ws_push_task(struct starpu_task *task, unsigned sched_ctx_id)
 int ws_push_task(struct starpu_task *task, unsigned sched_ctx_id)
 {
 {
-	starpu_job_t j = _starpu_get_job_associated_to_task(task);
-
 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	work_stealing_data *ws = (work_stealing_data*)sched_ctx->policy_data;
 	work_stealing_data *ws = (work_stealing_data*)sched_ctx->policy_data;
 
 
+	struct _starpu_deque_jobq *deque_queue;
+	struct _starpu_job *j = _starpu_get_job_associated_to_task(task); 
 	int workerid = starpu_worker_get_id();
 	int workerid = starpu_worker_get_id();
 
 
+	_STARPU_PTHREAD_MUTEX_LOCK(&ws->sched_mutex);
 
 
-        struct starpu_deque_jobq_s *deque_queue;
-	deque_queue = ws->queue_array[workerid];
+	/* If the current thread is not a worker but
+	 * the main thread (-1), we select the most suitable
+	 * worker and put the task on its queue */
+	if (workerid == -1)
+		workerid = select_worker(sched_ctx);
 
 
-        PTHREAD_MUTEX_LOCK(&ws->sched_mutex);
-	// XXX reuse ?
-        //total_number_of_jobs++;
+	deque_queue = ws->queue_array[workerid];
 
 
-        STARPU_TRACE_JOB_PUSH(task, 0);
-        starpu_job_list_push_front(deque_queue->jobq, j);
-        deque_queue->njobs++;
-        deque_queue->nprocessed++;
+	_STARPU_TRACE_JOB_PUSH(task, 0);
+	_starpu_job_list_push_back(deque_queue->jobq, j);
+	deque_queue->njobs++;
 
 
-        PTHREAD_COND_SIGNAL(&ws->sched_cond);
-        PTHREAD_MUTEX_UNLOCK(&ws->sched_mutex);
+	_STARPU_PTHREAD_COND_SIGNAL(&ws->sched_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&ws->sched_mutex);
 
 
-        return 0;
+	return 0;
 }
 }
 
 
 static void initialize_ws_policy_for_workers(unsigned sched_ctx_id, int *workerids,unsigned nnew_workers) 
 static void initialize_ws_policy_for_workers(unsigned sched_ctx_id, int *workerids,unsigned nnew_workers) 
@@ -213,6 +315,12 @@ static void initialize_ws_policy_for_workers(unsigned sched_ctx_id, int *workeri
 	{
 	{
 		workerid = workerids[i];
 		workerid = workerids[i];
 		ws->queue_array[workerid] = _starpu_create_deque();
 		ws->queue_array[workerid] = _starpu_create_deque();
+		/**
+		 * The first WS_POP_TASK will increase NPROCESSED though no task was actually performed yet,
+		 * we need to initialize it at -1.
+		 */
+		ws->queue_array[workerid]->nprocessed = -1;
+		ws->queue_array[workerid]->njobs = 0;
 
 
 		sched_ctx->sched_mutex[workerid] = &ws->sched_mutex;
 		sched_ctx->sched_mutex[workerid] = &ws->sched_mutex;
 		sched_ctx->sched_cond[workerid] = &ws->sched_cond;
 		sched_ctx->sched_cond[workerid] = &ws->sched_cond;
@@ -226,11 +334,19 @@ static void initialize_ws_policy(unsigned sched_ctx_id)
 	sched_ctx->policy_data = (void*)ws;
 	sched_ctx->policy_data = (void*)ws;
 	
 	
 	unsigned nworkers = sched_ctx->nworkers;
 	unsigned nworkers = sched_ctx->nworkers;
-	ws->rr_worker = 0;
-	ws->queue_array = (struct starpu_deque_jobq_s**)malloc(STARPU_NMAXWORKERS*sizeof(struct starpu_deque_jobq_s*));
+	ws->last_pop_worker = 0;
+	ws->last_push_worker = 0;
+
+	/**
+	 * The first WS_POP_TASK will increase PERFORMED_TOTAL though no task was actually performed yet,
+	 * we need to initialize it at -1.
+	 */
+	ws->performed_total = -1;
 
 
-	PTHREAD_MUTEX_INIT(&ws->sched_mutex, NULL);
-	PTHREAD_COND_INIT(&ws->sched_cond, NULL);
+	ws->queue_array = (struct starpu_deque_jobq_s**)malloc(STARPU_NMAXWORKERS*sizeof(struct _starpu_deque_jobq*));
+
+	_STARPU_PTHREAD_MUTEX_INIT(&ws->sched_mutex, NULL);
+	_STARPU_PTHREAD_COND_INIT(&ws->sched_cond, NULL);
 
 
 	unsigned workerid_ctx;
 	unsigned workerid_ctx;
 	int workerid;
 	int workerid;
@@ -238,9 +354,21 @@ static void initialize_ws_policy(unsigned sched_ctx_id)
 	{
 	{
 		workerid = sched_ctx->workerids[workerid_ctx];
 		workerid = sched_ctx->workerids[workerid_ctx];
 		ws->queue_array[workerid] = _starpu_create_deque();
 		ws->queue_array[workerid] = _starpu_create_deque();
+		/**
+		 * The first WS_POP_TASK will increase NPROCESSED though no task was actually performed yet,
+		 * we need to initialize it at -1.
+		 */
+		ws->queue_array[workerid]->nprocessed = -1;
+		ws->queue_array[workerid]->njobs = 0;
 
 
 		sched_ctx->sched_mutex[workerid] = &ws->sched_mutex;
 		sched_ctx->sched_mutex[workerid] = &ws->sched_mutex;
 		sched_ctx->sched_cond[workerid] = &ws->sched_cond;
 		sched_ctx->sched_cond[workerid] = &ws->sched_cond;
+
+#ifdef USE_OVERLOAD
+		enum starpu_perf_archtype perf_arch;
+		perf_arch = starpu_worker_get_perf_archtype(workerid);
+		calibration_value += (unsigned int) starpu_worker_get_relative_speedup(perf_arch);
+#endif /* USE_OVERLOAD */
 	}
 	}
 }
 }
 
 
@@ -266,11 +394,13 @@ static void deinit_ws_policy(unsigned sched_ctx_id)
 	}
 	}
 }
 }
 
 
-struct starpu_sched_policy_s _starpu_sched_ws_policy = {
+struct starpu_sched_policy _starpu_sched_ws_policy =
+{
 	.init_sched = initialize_ws_policy,
 	.init_sched = initialize_ws_policy,
 	.deinit_sched = deinit_ws_policy,
 	.deinit_sched = deinit_ws_policy,
 	.push_task = ws_push_task,
 	.push_task = ws_push_task,
 	.pop_task = ws_pop_task,
 	.pop_task = ws_pop_task,
+	.pre_exec_hook = NULL,
 	.post_exec_hook = NULL,
 	.post_exec_hook = NULL,
 	.pop_every_task = NULL,
 	.pop_every_task = NULL,
 	.policy_name = "ws",
 	.policy_name = "ws",