Parcourir la source

Gordon now makes it possible to retrieve the actual execution time of a task so
we can implement the automatic calibration of the performance models on Cell as
well. Still a quick and pretty dirty implementation.

Cédric Augonnet il y a 16 ans
Parent
commit
487d630b25

+ 1 - 1
include/starpu-perfmodel.h

@@ -33,7 +33,7 @@ struct starpu_buffer_descr_t;
 
 /* on most systems we will consider one or two architectures as all accelerators
    are likely to be identical */
-#define NARCH_VARIATIONS	2
+#define NARCH_VARIATIONS	3
 
 enum starpu_perf_archtype {
 	STARPU_CORE_DEFAULT = 0,

+ 1 - 0
include/starpu-task.h

@@ -17,6 +17,7 @@
 #ifndef __STARPU_TASK_H__
 #define __STARPU_TASK_H__
 
+#include <errno.h>
 #include <starpu_config.h>
 
 /* this is a randomly chosen value ... */

+ 64 - 0
src/core/mechanisms/deque_queues.c

@@ -155,6 +155,70 @@ job_t deque_pop_task(struct jobq_s *q)
 	return j;
 }
 
+struct job_list_s * deque_pop_every_task(struct jobq_s *q, uint32_t where)
+{
+	struct job_list_s *new_list, *old_list;
+	/* moves every queued job whose cl->where overlaps 'where' into a new list; NULL if none */
+	STARPU_ASSERT(q);
+	struct deque_jobq_s *deque_queue = q->queue;
+
+	/* take the queue lock; we do not wait for a task to arrive, an empty queue yields NULL */
+	pthread_mutex_lock(&q->activity_mutex);
+
+	if (deque_queue->njobs == 0)
+	{
+		new_list = NULL;
+	}
+	else {
+		/* there is at least one task in the queue */
+		old_list = deque_queue->jobq;
+		new_list = job_list_new();
+
+		unsigned new_list_size = 0;
+
+		job_itor_t i;
+		job_t next_job;
+		/* note that this starts at the _head_ of the list, so we put
+ 		 * elements at the back of the new list */
+		for(i = job_list_begin(old_list);
+			i != job_list_end(old_list);
+			i  = next_job)
+		{
+			next_job = job_list_next(i);
+			/* the successor is read first because i may be erased from old_list below */
+			if (i->task->cl->where & where)
+			{
+				/* this element can be moved into the new list */
+				new_list_size++;
+				
+				job_list_erase(old_list, i);
+				job_list_push_back(new_list, i);
+			}
+		}
+
+		if (new_list_size == 0)
+		{
+			/* nothing matched: delete the empty list and return NULL instead */
+			job_list_delete(new_list);
+			new_list = NULL;
+		}
+		else
+		{
+			deque_queue->njobs -= new_list_size;
+	
+			/* the jobs are ours now, so at worst, some other workers thought
+			 * there remained some work and will soon discover it is not true */
+			pthread_mutex_lock(sched_mutex);
+			total_number_of_jobs -= new_list_size;
+			pthread_mutex_unlock(sched_mutex);
+		}
+	}
+	
+	pthread_mutex_unlock(&q->activity_mutex);
+
+	return new_list;
+}
+
 job_t deque_non_blocking_pop_task(struct jobq_s *q)
 {
 	job_t j = NULL;

+ 8 - 0
src/core/perfmodel/perfmodel_history.c

@@ -118,6 +118,7 @@ static void parse_model_file(FILE *f, struct starpu_perfmodel_t *model, unsigned
 {
 	parse_per_arch_model_file(f, &model->per_arch[STARPU_CORE_DEFAULT], scan_history);
 	parse_per_arch_model_file(f, &model->per_arch[STARPU_CUDA_DEFAULT], scan_history);
+	parse_per_arch_model_file(f, &model->per_arch[STARPU_GORDON_DEFAULT], scan_history);
 }
 
 static void dump_per_arch_model_file(FILE *f, struct starpu_per_arch_perfmodel_t *per_arch_model)
@@ -153,6 +154,7 @@ static void dump_model_file(FILE *f, struct starpu_perfmodel_t *model)
 {
 	dump_per_arch_model_file(f, &model->per_arch[STARPU_CORE_DEFAULT]);
 	dump_per_arch_model_file(f, &model->per_arch[STARPU_CUDA_DEFAULT]);
+	dump_per_arch_model_file(f, &model->per_arch[STARPU_GORDON_DEFAULT]);
 }
 
 static void initialize_per_arch_model(struct starpu_per_arch_perfmodel_t *per_arch_model)
@@ -165,6 +167,7 @@ static void initialize_model(struct starpu_perfmodel_t *model)
 {
 	initialize_per_arch_model(&model->per_arch[STARPU_CORE_DEFAULT]);
 	initialize_per_arch_model(&model->per_arch[STARPU_CUDA_DEFAULT]);
+	initialize_per_arch_model(&model->per_arch[STARPU_GORDON_DEFAULT]);
 }
 
 static struct starpu_model_list_t *registered_models = NULL;
@@ -208,6 +211,10 @@ void register_model(struct starpu_perfmodel_t *model)
 	get_model_debug_path(model, "core", debugpath, 256);
 	model->per_arch[STARPU_CORE_DEFAULT].debug_file = fopen(debugpath, "a+");
 	STARPU_ASSERT(model->per_arch[STARPU_CORE_DEFAULT].debug_file);
+
+	get_model_debug_path(model, "gordon", debugpath, 256);
+	model->per_arch[STARPU_GORDON_DEFAULT].debug_file = fopen(debugpath, "a+");
+	STARPU_ASSERT(model->per_arch[STARPU_GORDON_DEFAULT].debug_file);
 #endif
 
 	return;
@@ -249,6 +256,7 @@ void save_history_based_model(struct starpu_perfmodel_t *model)
 	fclose(f);
 
 #ifdef DEBUG_MODEL
+	fclose(model->gordon_debug_file);
 	fclose(model->cuda_debug_file);
 	fclose(model->core_debug_file);
 #endif

+ 26 - 0
src/core/policies/deque-modeling-policy.c

@@ -37,6 +37,31 @@ static job_t dm_pop_task(struct jobq_s *q)
 	return j;
 }
 
+static struct job_list_s *dm_pop_every_task(struct jobq_s *q, uint32_t where)
+{
+	struct job_list_s *new_list;
+	/* fetch the list from fifo_pop_every_task(), then update the queue's exp_* fields per job */
+	new_list = fifo_pop_every_task(q, where);
+	if (new_list) {
+		job_itor_t i;
+		for(i = job_list_begin(new_list);
+			i != job_list_end(new_list);
+			i = job_list_next(i))
+		{
+			struct fifo_jobq_s *fifo = q->queue;
+			double model = i->predicted;
+			/* NOTE(review): exp_start is overwritten on each iteration, only the last job's value remains - confirm */
+			fifo->exp_len -= model;
+			fifo->exp_start = timing_now()/1000000 + model;
+			fifo->exp_end = fifo->exp_start + fifo->exp_len;
+		}
+	}
+
+	return new_list;
+}
+
+
+
 static int _dm_push_task(struct jobq_s *q __attribute__ ((unused)), job_t j, unsigned prio)
 {
 	/* find the queue */
@@ -130,6 +155,7 @@ static struct jobq_s *init_dm_fifo(void)
 	q->push_task = dm_push_task; 
 	q->push_prio_task = dm_push_prio_task; 
 	q->pop_task = dm_pop_task;
+	q->pop_every_task = dm_pop_every_task;
 	q->who = 0;
 
 	queue_array[nworkers++] = q;

+ 15 - 0
src/drivers/gordon/driver_gordon.c

@@ -234,12 +234,24 @@ static void gordon_callback_list_func(void *arg)
 
 //	fprintf(stderr, "gordon callback : push job j %p\n", task_wrapper->j);
 
+	unsigned task_cnt = 0;
+
 	/* XXX 0 was hardcoded */
 	take_mutex(&terminated_list_mutexes[0]);
 	while (!job_list_empty(wrapper_list))
 	{
 		job_t j = job_list_pop_back(wrapper_list);
+
+		struct gordon_ppu_job_s * gordon_task = &task_wrapper->gordon_job[task_cnt];
+		
+		if (j->task->cl->model && j->task->cl->model->benchmarking)
+		{
+			//fprintf(stderr, "gordon_task -> execution time %lx\n", gordon_task->measured);
+			update_perfmodel_history(j, STARPU_GORDON_DEFAULT, gordon_task->measured);
+		}
+
 		job_list_push_back(terminated_list, j);
+		task_cnt++;
 	}
 
 	/* the job list was allocated by the gordon driver itself */
@@ -338,6 +350,9 @@ int inject_task_list(struct job_list_s *list, struct worker_s *worker)
 
 		gordon_jobs[index].index = task->cl->gordon_func;
 
+		if (j->task->cl->model && j->task->cl->model->benchmarking)
+			gordon_jobs[index].sampling = 1;
+
 		/* we should not hardcore the memory node ... XXX */
 		unsigned memory_node = 0;
 		starpu_to_gordon_buffers(j, &gordon_jobs[index], memory_node);