瀏覽代碼

merge src

Andra Hugo 13 年之前
父節點
當前提交
8de7758807

+ 5 - 4
src/core/combined_workers.c

@@ -57,7 +57,7 @@ int starpu_combined_worker_assign_workerid(int nworkers, int workerid_array[])
 	int new_workerid;
 	int new_workerid;
 
 
 	/* Return the number of actual workers. */
 	/* Return the number of actual workers. */
-	struct starpu_machine_config_s *config = _starpu_get_machine_config();
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
 
 
 	int basic_worker_count = (int)config->topology.nworkers;
 	int basic_worker_count = (int)config->topology.nworkers;
 	int combined_worker_id = (int)config->topology.ncombinedworkers;
 	int combined_worker_id = (int)config->topology.ncombinedworkers;
@@ -95,7 +95,7 @@ int starpu_combined_worker_assign_workerid(int nworkers, int workerid_array[])
 	fprintf(stderr, "into worker %d\n", new_workerid);
 	fprintf(stderr, "into worker %d\n", new_workerid);
 #endif
 #endif
 
 
-	struct starpu_combined_worker_s *combined_worker =
+	struct _starpu_combined_worker *combined_worker =
 		&config->combined_workers[combined_worker_id];
 		&config->combined_workers[combined_worker_id];
 
 
 	combined_worker->worker_size = nworkers;
 	combined_worker->worker_size = nworkers;
@@ -132,7 +132,8 @@ int starpu_combined_worker_assign_workerid(int nworkers, int workerid_array[])
 			&config->workers[id].initial_cpu_set);
 			&config->workers[id].initial_cpu_set);
 #else
 #else
 		int j;
 		int j;
-		for (j = 0; j < CPU_SETSIZE; j++) {
+		for (j = 0; j < CPU_SETSIZE; j++)
+		{
 			if (CPU_ISSET(j, &config->workers[id].initial_cpu_set))
 			if (CPU_ISSET(j, &config->workers[id].initial_cpu_set))
 				CPU_SET(j, &combined_worker->cpu_set);
 				CPU_SET(j, &combined_worker->cpu_set);
 		}
 		}
@@ -153,7 +154,7 @@ int starpu_combined_worker_assign_workerid(int nworkers, int workerid_array[])
 int starpu_combined_worker_get_description(int workerid, int *worker_size, int **combined_workerid)
 int starpu_combined_worker_get_description(int workerid, int *worker_size, int **combined_workerid)
 {
 {
 	/* Check that this is the id of a combined worker */
 	/* Check that this is the id of a combined worker */
-	struct starpu_combined_worker_s *worker;
+	struct _starpu_combined_worker *worker;
 	worker = _starpu_get_combined_worker_struct(workerid);
 	worker = _starpu_get_combined_worker_struct(workerid);
 	STARPU_ASSERT(worker);
 	STARPU_ASSERT(worker);
 
 

+ 4 - 3
src/core/debug.c

@@ -25,6 +25,7 @@ static pthread_mutex_t logfile_mutex = PTHREAD_MUTEX_INITIALIZER;
 static FILE *logfile;
 static FILE *logfile;
 #endif
 #endif
 
 
+/* Tell gdb whether FXT is compiled in or not */
 int _starpu_use_fxt
 int _starpu_use_fxt
 #ifdef STARPU_USE_FXT
 #ifdef STARPU_USE_FXT
 	= 1
 	= 1
@@ -36,7 +37,7 @@ void _starpu_open_debug_logfile(void)
 #ifdef STARPU_VERBOSE
 #ifdef STARPU_VERBOSE
 	/* what is  the name of the file ? default = "starpu.log" */
 	/* what is  the name of the file ? default = "starpu.log" */
 	char *logfile_name;
 	char *logfile_name;
-	
+
 	logfile_name = getenv("STARPU_LOGFILENAME");
 	logfile_name = getenv("STARPU_LOGFILENAME");
 	if (!logfile_name)
 	if (!logfile_name)
 	{
 	{
@@ -60,9 +61,9 @@ void _starpu_print_to_logfile(const char *format STARPU_ATTRIBUTE_UNUSED, ...)
 #ifdef STARPU_VERBOSE
 #ifdef STARPU_VERBOSE
 	va_list args;
 	va_list args;
 	va_start(args, format);
 	va_start(args, format);
-	PTHREAD_MUTEX_LOCK(&logfile_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&logfile_mutex);
 	vfprintf(logfile, format, args);
 	vfprintf(logfile, format, args);
-	PTHREAD_MUTEX_UNLOCK(&logfile_mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&logfile_mutex);
 	va_end( args );
 	va_end( args );
 #endif
 #endif
 }
 }

+ 1 - 1
src/core/debug.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009-2011  Université de Bordeaux 1
  * Copyright (C) 2009-2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by

+ 83 - 75
src/core/dependencies/cg.c

@@ -1,7 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2012 inria
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -19,32 +20,35 @@
 #include <common/config.h>
 #include <common/config.h>
 #include <common/utils.h>
 #include <common/utils.h>
 #include <core/jobs.h>
 #include <core/jobs.h>
+#include <core/task.h>
 #include <core/dependencies/cg.h>
 #include <core/dependencies/cg.h>
 #include <core/dependencies/tags.h>
 #include <core/dependencies/tags.h>
 
 
-void _starpu_cg_list_init(struct starpu_cg_list_s *list)
+void _starpu_cg_list_init(struct _starpu_cg_list *list)
 {
 {
-	list->nsuccs = 0;
+	_starpu_spin_init(&list->lock);
 	list->ndeps = 0;
 	list->ndeps = 0;
 	list->ndeps_completed = 0;
 	list->ndeps_completed = 0;
 
 
+	list->terminated = 0;
+
+	list->nsuccs = 0;
 #ifdef STARPU_DYNAMIC_DEPS_SIZE
 #ifdef STARPU_DYNAMIC_DEPS_SIZE
 	/* this is a small initial default value ... may be changed */
 	/* this is a small initial default value ... may be changed */
 	list->succ_list_size = 0;
 	list->succ_list_size = 0;
-	list->succ =
-		(struct starpu_cg_s **) realloc(NULL, list->succ_list_size*sizeof(struct starpu_cg_s *));
+	list->succ = NULL;
 #endif
 #endif
 }
 }
 
 
-void _starpu_cg_list_deinit(struct starpu_cg_list_s *list)
+void _starpu_cg_list_deinit(struct _starpu_cg_list *list)
 {
 {
 	unsigned id;
 	unsigned id;
 	for (id = 0; id < list->nsuccs; id++)
 	for (id = 0; id < list->nsuccs; id++)
 	{
 	{
-		starpu_cg_t *cg = list->succ[id];
+		struct _starpu_cg *cg = list->succ[id];
 
 
 		/* We remove the reference on the completion group, and free it
 		/* We remove the reference on the completion group, and free it
-		 * if there is no more reference. */		
+		 * if there is no more reference. */
 		unsigned ntags = STARPU_ATOMIC_ADD(&cg->ntags, -1);
 		unsigned ntags = STARPU_ATOMIC_ADD(&cg->ntags, -1);
 		if (ntags == 0)
 		if (ntags == 0)
 			free(list->succ[id]);
 			free(list->succ[id]);
@@ -53,12 +57,19 @@ void _starpu_cg_list_deinit(struct starpu_cg_list_s *list)
 #ifdef STARPU_DYNAMIC_DEPS_SIZE
 #ifdef STARPU_DYNAMIC_DEPS_SIZE
 	free(list->succ);
 	free(list->succ);
 #endif
 #endif
+	_starpu_spin_destroy(&list->lock);
 }
 }
 
 
-void _starpu_add_successor_to_cg_list(struct starpu_cg_list_s *successors, starpu_cg_t *cg)
+/* Returns whether the completion was already terminated, and caller should
+ * thus immediately proceed. */
+int _starpu_add_successor_to_cg_list(struct _starpu_cg_list *successors, struct _starpu_cg *cg)
 {
 {
+	int ret;
 	STARPU_ASSERT(cg);
 	STARPU_ASSERT(cg);
 
 
+	_starpu_spin_lock(&successors->lock);
+	ret = successors->terminated;
+
 	/* where should that cg be put in the array ? */
 	/* where should that cg be put in the array ? */
 	unsigned index = STARPU_ATOMIC_ADD(&successors->nsuccs, 1) - 1;
 	unsigned index = STARPU_ATOMIC_ADD(&successors->nsuccs, 1) - 1;
 
 
@@ -72,50 +83,58 @@ void _starpu_add_successor_to_cg_list(struct starpu_cg_list_s *successors, starp
 			successors->succ_list_size = 4;
 			successors->succ_list_size = 4;
 
 
 		/* NB: this is thread safe as the tag->lock is taken */
 		/* NB: this is thread safe as the tag->lock is taken */
-		successors->succ = (struct starpu_cg_s **) realloc(successors->succ, 
-			successors->succ_list_size*sizeof(struct starpu_cg_s *));
+		successors->succ = (struct _starpu_cg **) realloc(successors->succ,
+			successors->succ_list_size*sizeof(struct _starpu_cg *));
 	}
 	}
 #else
 #else
 	STARPU_ASSERT(index < STARPU_NMAXDEPS);
 	STARPU_ASSERT(index < STARPU_NMAXDEPS);
 #endif
 #endif
 	successors->succ[index] = cg;
 	successors->succ[index] = cg;
+	_starpu_spin_unlock(&successors->lock);
+
+	return ret;
 }
 }
 
 
-void _starpu_notify_cg(starpu_cg_t *cg)
+/* Note: in case of a tag, it must be already locked */
+void _starpu_notify_cg(struct _starpu_cg *cg)
 {
 {
 	STARPU_ASSERT(cg);
 	STARPU_ASSERT(cg);
 	unsigned remaining = STARPU_ATOMIC_ADD(&cg->remaining, -1);
 	unsigned remaining = STARPU_ATOMIC_ADD(&cg->remaining, -1);
 
 
-	if (remaining == 0) {
+	if (remaining == 0)
+	{
 		cg->remaining = cg->ntags;
 		cg->remaining = cg->ntags;
 
 
-		struct starpu_tag_s *tag;
-		struct starpu_cg_list_s *tag_successors, *job_successors;
-		starpu_job_t j;
+		struct _starpu_tag *tag;
+		struct _starpu_cg_list *tag_successors, *job_successors;
+		struct _starpu_job *j;
 
 
 		/* the group is now completed */
 		/* the group is now completed */
-		switch (cg->cg_type) {
-			case STARPU_CG_APPS: {
+		switch (cg->cg_type)
+		{
+			case STARPU_CG_APPS:
+			{
 				/* this is a cg for an application waiting on a set of
 				/* this is a cg for an application waiting on a set of
-	 			 * tags, wake the thread */
-				PTHREAD_MUTEX_LOCK(&cg->succ.succ_apps.cg_mutex);
+				 * tags, wake the thread */
+				_STARPU_PTHREAD_MUTEX_LOCK(&cg->succ.succ_apps.cg_mutex);
 				cg->succ.succ_apps.completed = 1;
 				cg->succ.succ_apps.completed = 1;
-				PTHREAD_COND_SIGNAL(&cg->succ.succ_apps.cg_cond);
-				PTHREAD_MUTEX_UNLOCK(&cg->succ.succ_apps.cg_mutex);
+				_STARPU_PTHREAD_COND_SIGNAL(&cg->succ.succ_apps.cg_cond);
+				_STARPU_PTHREAD_MUTEX_UNLOCK(&cg->succ.succ_apps.cg_mutex);
 				break;
 				break;
 			}
 			}
 
 
-			case STARPU_CG_TAG: {
+			case STARPU_CG_TAG:
+			{
 				tag = cg->succ.tag;
 				tag = cg->succ.tag;
 				tag_successors = &tag->tag_successors;
 				tag_successors = &tag->tag_successors;
-	
+
 				tag_successors->ndeps_completed++;
 				tag_successors->ndeps_completed++;
 
 
-#ifdef STARPU_DEVEL
-#warning FIXME: who locks this?
-#endif
+				/* Note: the tag is already locked by the
+				 * caller. */
 				if ((tag->state == STARPU_BLOCKED) &&
 				if ((tag->state == STARPU_BLOCKED) &&
-					(tag_successors->ndeps == tag_successors->ndeps_completed)) {
+					(tag_successors->ndeps == tag_successors->ndeps_completed))
+				{
 					/* reset the counter so that we can reuse the completion group */
 					/* reset the counter so that we can reuse the completion group */
 					tag_successors->ndeps_completed = 0;
 					tag_successors->ndeps_completed = 0;
 					_starpu_tag_set_ready(tag);
 					_starpu_tag_set_ready(tag);
@@ -123,21 +142,29 @@ void _starpu_notify_cg(starpu_cg_t *cg)
 				break;
 				break;
 			}
 			}
 
 
- 		        case STARPU_CG_TASK: {
+ 		        case STARPU_CG_TASK:
+			{
 				j = cg->succ.job;
 				j = cg->succ.job;
 
 
+				_STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
+
 				job_successors = &j->job_successors;
 				job_successors = &j->job_successors;
 
 
 				unsigned ndeps_completed =
 				unsigned ndeps_completed =
 					STARPU_ATOMIC_ADD(&job_successors->ndeps_completed, 1);
 					STARPU_ATOMIC_ADD(&job_successors->ndeps_completed, 1);
 
 
-				if (job_successors->ndeps == ndeps_completed)
+				/* Need to atomically test submitted and check
+				 * dependencies, since this is concurrent with
+				 * _starpu_submit_job */
+				if (j->submitted && job_successors->ndeps == ndeps_completed)
 				{
 				{
 					/* Note that this also ensures that tag deps are
 					/* Note that this also ensures that tag deps are
 					 * fulfilled. This counter is reset only when the
 					 * fulfilled. This counter is reset only when the
 					 * dependencies are all fulfilled. */
 					 * dependencies are all fulfilled. */
-					_starpu_enforce_deps_and_schedule(j, 1);
-				}
+					_starpu_enforce_deps_and_schedule(j);
+				} else
+					_STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
+
 
 
 				break;
 				break;
 			}
 			}
@@ -148,21 +175,33 @@ void _starpu_notify_cg(starpu_cg_t *cg)
 	}
 	}
 }
 }
 
 
-void _starpu_notify_cg_list(struct starpu_cg_list_s *successors)
+/* Caller just has to promise that the list will not disappear.
+ * _starpu_notify_cg_list protects the list itself.
+ * No job lock should be held, since we might want to immediately call the callback of an empty task.
+ */
+void _starpu_notify_cg_list(struct _starpu_cg_list *successors)
 {
 {
-	unsigned nsuccs;
 	unsigned succ;
 	unsigned succ;
 
 
-	nsuccs = successors->nsuccs;
-
-	for (succ = 0; succ < nsuccs; succ++)
+	_starpu_spin_lock(&successors->lock);
+	successors->terminated = 1;
+	/* Note: some thread might be concurrently adding other items */
+	for (succ = 0; succ < successors->nsuccs; succ++)
 	{
 	{
-		struct starpu_cg_s *cg = successors->succ[succ];
+		struct _starpu_cg *cg = successors->succ[succ];
 		STARPU_ASSERT(cg);
 		STARPU_ASSERT(cg);
+		unsigned cg_type = cg->cg_type;
 
 
-		struct starpu_tag_s *cgtag = NULL;
+		if (cg_type == STARPU_CG_APPS)
+		{
+			/* Remove the temporary ref to the cg */
+			memmove(&successors->succ[succ], &successors->succ[succ+1], (successors->nsuccs-(succ+1)) * sizeof(successors->succ[succ]));
+			succ--;
+			successors->nsuccs--;
+		}
+		_starpu_spin_unlock(&successors->lock);
 
 
-		unsigned cg_type = cg->cg_type;
+		struct _starpu_tag *cgtag = NULL;
 
 
 		if (cg_type == STARPU_CG_TAG)
 		if (cg_type == STARPU_CG_TAG)
 		{
 		{
@@ -171,43 +210,12 @@ void _starpu_notify_cg_list(struct starpu_cg_list_s *successors)
 			_starpu_spin_lock(&cgtag->lock);
 			_starpu_spin_lock(&cgtag->lock);
 		}
 		}
 
 
-		if (cg_type == STARPU_CG_TASK)
-		{
-			starpu_job_t j = cg->succ.job;
-			PTHREAD_MUTEX_LOCK(&j->sync_mutex);
-		}			
-
 		_starpu_notify_cg(cg);
 		_starpu_notify_cg(cg);
 
 
-		if (cg_type == STARPU_CG_TASK)
-		{
-			starpu_job_t j = cg->succ.job;
-			
-			/* In case this task was immediately terminated, since
-			 * _starpu_notify_cg_list already hold the sync_mutex
-			 * lock, it is its reponsability to destroy the task if
-			 * needed. */
-			unsigned must_destroy_task = 0;
-			struct starpu_task *task = j->task;
-
-			if (j->submitted && (j->terminated > 0) && task->destroy && task->detach)
-				must_destroy_task = 1;
-
-			PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
-
-			if (must_destroy_task)
-				starpu_task_destroy(task);
-		}			
-
-		if (cg_type == STARPU_CG_APPS) {
-			/* Remove the temporary ref to the cg */
-			memmove(&successors->succ[succ], &successors->succ[succ+1], (nsuccs-(succ+1)) * sizeof(successors->succ[succ]));
-			succ--;
-			nsuccs--;
-			successors->nsuccs--;
-		}
-
 		if (cg_type == STARPU_CG_TAG)
 		if (cg_type == STARPU_CG_TAG)
 			_starpu_spin_unlock(&cgtag->lock);
 			_starpu_spin_unlock(&cgtag->lock);
+
+		_starpu_spin_lock(&successors->lock);
 	}
 	}
+	_starpu_spin_unlock(&successors->lock);
 }
 }

+ 44 - 24
src/core/dependencies/cg.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -31,54 +31,74 @@
 #define STARPU_NMAXDEPS	256
 #define STARPU_NMAXDEPS	256
 #endif
 #endif
 
 
-/* Completion Group list */
-struct starpu_cg_list_s {
-	unsigned nsuccs; /* how many successors ? */
+struct _starpu_job;
+
+/* Completion Group list, records both the number of expected notifications
+ * before the completion can start, and the list of successors when the
+ * completion is finished. */
+struct _starpu_cg_list
+{
+	/* Protects atomicity of the list and the terminated flag */
+	struct _starpu_spinlock lock;
+
+	/* Number of notifications to be waited for */
 	unsigned ndeps; /* how many deps ? */
 	unsigned ndeps; /* how many deps ? */
 	unsigned ndeps_completed; /* how many deps are done ? */
 	unsigned ndeps_completed; /* how many deps are done ? */
+
+	/* Whether the completion is finished. */
+	unsigned terminated;
+
+	/* List of successors */
+	unsigned nsuccs; /* how many successors ? */
 #ifdef STARPU_DYNAMIC_DEPS_SIZE
 #ifdef STARPU_DYNAMIC_DEPS_SIZE
 	unsigned succ_list_size;
 	unsigned succ_list_size;
-	struct starpu_cg_s **succ;
+	struct _starpu_cg **succ;
 #else
 #else
-	struct starpu_cg_s *succ[STARPU_NMAXDEPS];
+	struct _starpu_cg *succ[STARPU_NMAXDEPS];
 #endif
 #endif
 };
 };
 
 
-#define STARPU_CG_APPS	(1<<0)
-#define STARPU_CG_TAG	(1<<1)
-#define STARPU_CG_TASK	(1<<2)
+enum _starpu_cg_type
+{
+	STARPU_CG_APPS=(1<<0),
+	STARPU_CG_TAG=(1<<1),
+	STARPU_CG_TASK=(1<<2)
+};
 
 
 /* Completion Group */
 /* Completion Group */
-typedef struct starpu_cg_s {
+struct _starpu_cg
+{
 	unsigned ntags; /* number of tags depended on */
 	unsigned ntags; /* number of tags depended on */
 	unsigned remaining; /* number of remaining tags */
 	unsigned remaining; /* number of remaining tags */
 
 
-	unsigned cg_type; /* STARPU_CG_APPS or STARPU_CG_TAG or STARPU_CG_TASK */
+	enum _starpu_cg_type cg_type;
 
 
-	union {
+	union
+	{
 		/* STARPU_CG_TAG */
 		/* STARPU_CG_TAG */
-		struct starpu_tag_s *tag;
+		struct _starpu_tag *tag;
 
 
 		/* STARPU_CG_TASK */
 		/* STARPU_CG_TASK */
-		struct starpu_job_s *job;
+		struct _starpu_job *job;
 
 
 		/* STARPU_CG_APPS */
 		/* STARPU_CG_APPS */
 		/* in case this completion group is related to an application,
 		/* in case this completion group is related to an application,
 		 * we have to explicitly wake the waiting thread instead of
 		 * we have to explicitly wake the waiting thread instead of
 		 * rescheduling the corresponding task */
 		 * rescheduling the corresponding task */
-		struct {
+		struct
+		{
 			unsigned completed;
 			unsigned completed;
 			pthread_mutex_t cg_mutex;
 			pthread_mutex_t cg_mutex;
 			pthread_cond_t cg_cond;
 			pthread_cond_t cg_cond;
 		} succ_apps;
 		} succ_apps;
 	} succ;
 	} succ;
-} starpu_cg_t;
-
-void _starpu_cg_list_init(struct starpu_cg_list_s *list);
-void _starpu_cg_list_deinit(struct starpu_cg_list_s *list);
-void _starpu_add_successor_to_cg_list(struct starpu_cg_list_s *successors, starpu_cg_t *cg);
-void _starpu_notify_cg(starpu_cg_t *cg);
-void _starpu_notify_cg_list(struct starpu_cg_list_s *successors);
-void _starpu_notify_task_dependencies(struct starpu_job_s *j);
+};
+
+void _starpu_cg_list_init(struct _starpu_cg_list *list);
+void _starpu_cg_list_deinit(struct _starpu_cg_list *list);
+int _starpu_add_successor_to_cg_list(struct _starpu_cg_list *successors, struct _starpu_cg *cg);
+void _starpu_notify_cg(struct _starpu_cg *cg);
+void _starpu_notify_cg_list(struct _starpu_cg_list *successors);
+void _starpu_notify_task_dependencies(struct _starpu_job *j);
 
 
 #endif // __CG_H__
 #endif // __CG_H__

+ 88 - 64
src/core/dependencies/data_concurrency.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -31,44 +31,45 @@
  */
  */
 
 
 /* the header lock must be taken by the caller */
 /* the header lock must be taken by the caller */
-static starpu_data_requester_t may_unlock_data_req_list_head(starpu_data_handle handle)
+static struct _starpu_data_requester *may_unlock_data_req_list_head(starpu_data_handle_t handle)
 {
 {
-	starpu_data_requester_list_t req_list;
+	struct _starpu_data_requester_list *req_list;
 
 
 	if (handle->reduction_refcnt > 0)
 	if (handle->reduction_refcnt > 0)
 	{
 	{
 		req_list = handle->reduction_req_list;
 		req_list = handle->reduction_req_list;
 	}
 	}
-	else {
-		if (starpu_data_requester_list_empty(handle->reduction_req_list))
+	else
+	{
+		if (_starpu_data_requester_list_empty(handle->reduction_req_list))
 			req_list = handle->req_list;
 			req_list = handle->req_list;
 		else
 		else
 			req_list = handle->reduction_req_list;
 			req_list = handle->reduction_req_list;
 	}
 	}
 
 
 	/* if there is no one to unlock ... */
 	/* if there is no one to unlock ... */
-	if (starpu_data_requester_list_empty(req_list))
+	if (_starpu_data_requester_list_empty(req_list))
 		return NULL;
 		return NULL;
 
 
 	/* if there is no reference to the data anymore, we can use it */
 	/* if there is no reference to the data anymore, we can use it */
 	if (handle->refcnt == 0)
 	if (handle->refcnt == 0)
-		return starpu_data_requester_list_pop_front(req_list);
+		return _starpu_data_requester_list_pop_front(req_list);
 
 
 	if (handle->current_mode == STARPU_W)
 	if (handle->current_mode == STARPU_W)
 		return NULL;
 		return NULL;
 
 
 	/* data->current_mode == STARPU_R, so we can process more readers */
 	/* data->current_mode == STARPU_R, so we can process more readers */
-	starpu_data_requester_t r = starpu_data_requester_list_front(req_list);
+	struct _starpu_data_requester *r = _starpu_data_requester_list_front(req_list);
 
 
-	starpu_access_mode r_mode = r->mode;
+	enum starpu_access_mode r_mode = r->mode;
 	if (r_mode == STARPU_RW)
 	if (r_mode == STARPU_RW)
 		r_mode = STARPU_W;
 		r_mode = STARPU_W;
-	
+
 	/* If this is a STARPU_R, STARPU_SCRATCH or STARPU_REDUX type of
 	/* If this is a STARPU_R, STARPU_SCRATCH or STARPU_REDUX type of
 	 * access, we only proceed if the current mode is the same as the
 	 * access, we only proceed if the current mode is the same as the
 	 * requested mode. */
 	 * requested mode. */
 	if (r_mode == handle->current_mode)
 	if (r_mode == handle->current_mode)
-		return starpu_data_requester_list_pop_front(req_list);
+		return _starpu_data_requester_list_pop_front(req_list);
 	else
 	else
 		return NULL;
 		return NULL;
 }
 }
@@ -78,9 +79,9 @@ static starpu_data_requester_t may_unlock_data_req_list_head(starpu_data_handle
  * with the current mode, the request is put in the per-handle list of
  * with the current mode, the request is put in the per-handle list of
  * "requesters", and this function returns 1. */
  * "requesters", and this function returns 1. */
 static unsigned _starpu_attempt_to_submit_data_request(unsigned request_from_codelet,
 static unsigned _starpu_attempt_to_submit_data_request(unsigned request_from_codelet,
-					starpu_data_handle handle, starpu_access_mode mode,
-					void (*callback)(void *), void *argcb,
-					starpu_job_t j, unsigned buffer_index)
+						       starpu_data_handle_t handle, enum starpu_access_mode mode,
+						       void (*callback)(void *), void *argcb,
+						       struct _starpu_job *j, unsigned buffer_index)
 {
 {
 	if (mode == STARPU_RW)
 	if (mode == STARPU_RW)
 		mode = STARPU_W;
 		mode = STARPU_W;
@@ -93,7 +94,8 @@ static unsigned _starpu_attempt_to_submit_data_request(unsigned request_from_cod
 		while (_starpu_spin_trylock(&handle->header_lock))
 		while (_starpu_spin_trylock(&handle->header_lock))
 			_starpu_datawizard_progress(_starpu_get_local_memory_node(), 0);
 			_starpu_datawizard_progress(_starpu_get_local_memory_node(), 0);
 	}
 	}
-	else {
+	else
+	{
 		_starpu_spin_lock(&handle->header_lock);
 		_starpu_spin_lock(&handle->header_lock);
 	}
 	}
 
 
@@ -115,7 +117,7 @@ static unsigned _starpu_attempt_to_submit_data_request(unsigned request_from_cod
 	 * current one, we can proceed. */
 	 * current one, we can proceed. */
 	unsigned put_in_list = 1;
 	unsigned put_in_list = 1;
 
 
-	starpu_access_mode previous_mode = handle->current_mode;
+	enum starpu_access_mode previous_mode = handle->current_mode;
 
 
 	if (!frozen && ((handle->refcnt == 0) || (!(mode == STARPU_W) && (handle->current_mode == mode))))
 	if (!frozen && ((handle->refcnt == 0) || (!(mode == STARPU_W) && (handle->current_mode == mode))))
 	{
 	{
@@ -125,13 +127,14 @@ static unsigned _starpu_attempt_to_submit_data_request(unsigned request_from_cod
 
 
 		if ((handle->reduction_refcnt == 0) && (previous_mode == STARPU_REDUX) && (mode != STARPU_REDUX))
 		if ((handle->reduction_refcnt == 0) && (previous_mode == STARPU_REDUX) && (mode != STARPU_REDUX))
 		{
 		{
-			starpu_data_end_reduction_mode(handle);
+			_starpu_data_end_reduction_mode(handle);
 
 
 			/* Since we need to perform a mode change, we freeze
 			/* Since we need to perform a mode change, we freeze
 			 * the request if needed. */
 			 * the request if needed. */
 			put_in_list = (handle->reduction_refcnt > 0);
 			put_in_list = (handle->reduction_refcnt > 0);
 		}
 		}
-		else {
+		else
+		{
 			put_in_list = 0;
 			put_in_list = 0;
 		}
 		}
 	}
 	}
@@ -140,32 +143,35 @@ static unsigned _starpu_attempt_to_submit_data_request(unsigned request_from_cod
 	{
 	{
 		/* there cannot be multiple writers or a new writer
 		/* there cannot be multiple writers or a new writer
 		 * while the data is in read mode */
 		 * while the data is in read mode */
-		
+
+		handle->busy_count++;
 		/* enqueue the request */
 		/* enqueue the request */
-		starpu_data_requester_t r = starpu_data_requester_new();
-			r->mode = mode;
-			r->is_requested_by_codelet = request_from_codelet;
-			r->j = j;
-			r->buffer_index = buffer_index;
-			r->ready_data_callback = callback;
-			r->argcb = argcb;
+		struct _starpu_data_requester *r = _starpu_data_requester_new();
+		r->mode = mode;
+		r->is_requested_by_codelet = request_from_codelet;
+		r->j = j;
+		r->buffer_index = buffer_index;
+		r->ready_data_callback = callback;
+		r->argcb = argcb;
 
 
 		/* We put the requester in a specific list if this is a reduction task */
 		/* We put the requester in a specific list if this is a reduction task */
-		starpu_data_requester_list_t req_list =
+		struct _starpu_data_requester_list *req_list =
 			is_a_reduction_task?handle->reduction_req_list:handle->req_list;
 			is_a_reduction_task?handle->reduction_req_list:handle->req_list;
 
 
-		starpu_data_requester_list_push_back(req_list, r);
+		_starpu_data_requester_list_push_back(req_list, r);
 
 
 		/* failed */
 		/* failed */
 		put_in_list = 1;
 		put_in_list = 1;
 	}
 	}
-	else {
+	else
+	{
 		handle->refcnt++;
 		handle->refcnt++;
+		handle->busy_count++;
 
 
 		handle->current_mode = mode;
 		handle->current_mode = mode;
 
 
 		if ((mode == STARPU_REDUX) && (previous_mode != STARPU_REDUX))
 		if ((mode == STARPU_REDUX) && (previous_mode != STARPU_REDUX))
-			starpu_data_start_reduction_mode(handle);
+			_starpu_data_start_reduction_mode(handle);
 
 
 		/* success */
 		/* success */
 		put_in_list = 0;
 		put_in_list = 0;
@@ -176,32 +182,37 @@ static unsigned _starpu_attempt_to_submit_data_request(unsigned request_from_cod
 
 
 }
 }
 
 
-
-unsigned _starpu_attempt_to_submit_data_request_from_apps(starpu_data_handle handle, starpu_access_mode mode,
-						void (*callback)(void *), void *argcb)
+unsigned _starpu_attempt_to_submit_data_request_from_apps(starpu_data_handle_t handle, enum starpu_access_mode mode,
+							  void (*callback)(void *), void *argcb)
 {
 {
 	return _starpu_attempt_to_submit_data_request(0, handle, mode, callback, argcb, NULL, 0);
 	return _starpu_attempt_to_submit_data_request(0, handle, mode, callback, argcb, NULL, 0);
 }
 }
 
 
-static unsigned attempt_to_submit_data_request_from_job(starpu_job_t j, unsigned buffer_index)
+static unsigned attempt_to_submit_data_request_from_job(struct _starpu_job *j, unsigned buffer_index)
 {
 {
-	/* Note that we do not access j->task->buffers, but j->ordered_buffers
+	/* Note that we do not access j->task->handles, but j->ordered_buffers
 	 * which is a sorted copy of it. */
 	 * which is a sorted copy of it. */
-	starpu_data_handle handle = j->ordered_buffers[buffer_index].handle;
-	starpu_access_mode mode = j->ordered_buffers[buffer_index].mode;
+	starpu_data_handle_t handle = j->ordered_buffers[buffer_index].handle;
+	enum starpu_access_mode mode = j->ordered_buffers[buffer_index].mode;
 
 
 	return _starpu_attempt_to_submit_data_request(1, handle, mode, NULL, NULL, j, buffer_index);
 	return _starpu_attempt_to_submit_data_request(1, handle, mode, NULL, NULL, j, buffer_index);
-
 }
 }
 
 
-static unsigned _submit_job_enforce_data_deps(starpu_job_t j, unsigned start_buffer_index)
+static unsigned _submit_job_enforce_data_deps(struct _starpu_job *j, unsigned start_buffer_index)
 {
 {
 	unsigned buf;
 	unsigned buf;
 
 
 	unsigned nbuffers = j->task->cl->nbuffers;
 	unsigned nbuffers = j->task->cl->nbuffers;
 	for (buf = start_buffer_index; buf < nbuffers; buf++)
 	for (buf = start_buffer_index; buf < nbuffers; buf++)
 	{
 	{
-                if (attempt_to_submit_data_request_from_job(j, buf)) {
+		if (buf && j->ordered_buffers[buf-1].handle == j->ordered_buffers[buf].handle)
+			/* We have already requested this data, skip it. This
+			 * depends on ordering putting writes before reads, see
+			 * _starpu_compar_handles.  */
+			continue;
+
+                if (attempt_to_submit_data_request_from_job(j, buf))
+		{
                         j->task->status = STARPU_TASK_BLOCKED_ON_DATA;
                         j->task->status = STARPU_TASK_BLOCKED_ON_DATA;
 			return 1;
 			return 1;
                 }
                 }
@@ -214,9 +225,9 @@ static unsigned _submit_job_enforce_data_deps(starpu_job_t j, unsigned start_buf
    with concurrent data-access at the same time in the scheduling engine (eg.
    with concurrent data-access at the same time in the scheduling engine (eg.
    there can be 2 tasks reading a piece of data, but there cannot be one
    there can be 2 tasks reading a piece of data, but there cannot be one
    reading and another writing) */
    reading and another writing) */
-unsigned _starpu_submit_job_enforce_data_deps(starpu_job_t j)
+unsigned _starpu_submit_job_enforce_data_deps(struct _starpu_job *j)
 {
 {
-	struct starpu_codelet_t *cl = j->task->cl;
+	struct starpu_codelet *cl = j->task->cl;
 
 
 	if ((cl == NULL) || (cl->nbuffers == 0))
 	if ((cl == NULL) || (cl->nbuffers == 0))
 		return 0;
 		return 0;
@@ -224,41 +235,49 @@ unsigned _starpu_submit_job_enforce_data_deps(starpu_job_t j)
 	/* Compute an ordered list of the different pieces of data so that we
 	/* Compute an ordered list of the different pieces of data so that we
 	 * grab then according to a total order, thus avoiding a deadlock
 	 * grab then according to a total order, thus avoiding a deadlock
 	 * condition */
 	 * condition */
-	memcpy(j->ordered_buffers, j->task->buffers, cl->nbuffers*sizeof(starpu_buffer_descr));
+	unsigned i;
+	for (i=0 ; i<cl->nbuffers ; i++)
+	{
+		j->ordered_buffers[i].handle = j->task->handles[i];
+		j->ordered_buffers[i].mode = j->task->cl->modes[i];
+	}
+
 	_starpu_sort_task_handles(j->ordered_buffers, cl->nbuffers);
 	_starpu_sort_task_handles(j->ordered_buffers, cl->nbuffers);
 
 
 	return _submit_job_enforce_data_deps(j, 0);
 	return _submit_job_enforce_data_deps(j, 0);
 }
 }
 
 
-static unsigned unlock_one_requester(starpu_data_requester_t r)
+static unsigned unlock_one_requester(struct _starpu_data_requester *r)
 {
 {
-	starpu_job_t j = r->j;
+	struct _starpu_job *j = r->j;
 	unsigned nbuffers = j->task->cl->nbuffers;
 	unsigned nbuffers = j->task->cl->nbuffers;
 	unsigned buffer_index = r->buffer_index;
 	unsigned buffer_index = r->buffer_index;
 
 
 	if (buffer_index + 1 < nbuffers)
 	if (buffer_index + 1 < nbuffers)
-	{
 		/* not all buffers are protected yet */
 		/* not all buffers are protected yet */
 		return _submit_job_enforce_data_deps(j, buffer_index + 1);
 		return _submit_job_enforce_data_deps(j, buffer_index + 1);
-	}
 	else
 	else
 		return 0;
 		return 0;
 }
 }
 
 
 /* The header lock must already be taken by the caller */
 /* The header lock must already be taken by the caller */
-void _starpu_notify_data_dependencies(starpu_data_handle handle)
+void _starpu_notify_data_dependencies(starpu_data_handle_t handle)
 {
 {
 	/* A data access has finished so we remove a reference. */
 	/* A data access has finished so we remove a reference. */
 	STARPU_ASSERT(handle->refcnt > 0);
 	STARPU_ASSERT(handle->refcnt > 0);
 	handle->refcnt--;
 	handle->refcnt--;
+	STARPU_ASSERT(handle->busy_count > 0);
+	handle->busy_count--;
+	_starpu_data_check_not_busy(handle);
 
 
 	/* The handle has been destroyed in between (eg. this was a temporary
 	/* The handle has been destroyed in between (eg. this was a temporary
 	 * handle created for a reduction.) */
 	 * handle created for a reduction.) */
 	if (handle->lazy_unregister && handle->refcnt == 0)
 	if (handle->lazy_unregister && handle->refcnt == 0)
 	{
 	{
+		_starpu_spin_unlock(&handle->header_lock);
 		starpu_data_unregister_no_coherency(handle);
 		starpu_data_unregister_no_coherency(handle);
 		/* Warning: in case we unregister the handle, we must be sure
 		/* Warning: in case we unregister the handle, we must be sure
-		 * that the application will not try to unlock the header after
+		 * that the caller will not try to unlock the header after
 		 * !*/
 		 * !*/
 		return;
 		return;
 	}
 	}
@@ -270,28 +289,28 @@ void _starpu_notify_data_dependencies(starpu_data_handle handle)
 		//fprintf(stderr, "NOTIFY REDUCTION TASK RED REFCNT %d\n", handle->reduction_refcnt);
 		//fprintf(stderr, "NOTIFY REDUCTION TASK RED REFCNT %d\n", handle->reduction_refcnt);
 		handle->reduction_refcnt--;
 		handle->reduction_refcnt--;
 		if (handle->reduction_refcnt == 0)
 		if (handle->reduction_refcnt == 0)
-			starpu_data_end_reduction_mode_terminate(handle);
+			_starpu_data_end_reduction_mode_terminate(handle);
 	}
 	}
 
 
-
-	starpu_data_requester_t r;
+	struct _starpu_data_requester *r;
 	while ((r = may_unlock_data_req_list_head(handle)))
 	while ((r = may_unlock_data_req_list_head(handle)))
 	{
 	{
 		/* STARPU_RW accesses are treated as STARPU_W */
 		/* STARPU_RW accesses are treated as STARPU_W */
-		starpu_access_mode r_mode = r->mode;
+		enum starpu_access_mode r_mode = r->mode;
 		if (r_mode == STARPU_RW)
 		if (r_mode == STARPU_RW)
 			r_mode = STARPU_W;
 			r_mode = STARPU_W;
 
 
 		int put_in_list = 1;
 		int put_in_list = 1;
 		if ((handle->reduction_refcnt == 0) && (handle->current_mode == STARPU_REDUX) && (r_mode != STARPU_REDUX))
 		if ((handle->reduction_refcnt == 0) && (handle->current_mode == STARPU_REDUX) && (r_mode != STARPU_REDUX))
 		{
 		{
-			starpu_data_end_reduction_mode(handle);
+			_starpu_data_end_reduction_mode(handle);
 
 
 			/* Since we need to perform a mode change, we freeze
 			/* Since we need to perform a mode change, we freeze
 			 * the request if needed. */
 			 * the request if needed. */
 			put_in_list = (handle->reduction_refcnt > 0);
 			put_in_list = (handle->reduction_refcnt > 0);
 		}
 		}
-		else {
+		else
+		{
 			put_in_list = 0;
 			put_in_list = 0;
 		}
 		}
 
 
@@ -299,14 +318,16 @@ void _starpu_notify_data_dependencies(starpu_data_handle handle)
 		{
 		{
 			/* We need to put the request back because we must
 			/* We need to put the request back because we must
 			 * perform a reduction before. */
 			 * perform a reduction before. */
-			starpu_data_requester_list_push_front(handle->req_list, r);
+			_starpu_data_requester_list_push_front(handle->req_list, r);
 		}
 		}
-		else {
+		else
+		{
 			/* The data is now attributed to that request so we put a
 			/* The data is now attributed to that request so we put a
 			 * reference on it. */
 			 * reference on it. */
 			handle->refcnt++;
 			handle->refcnt++;
-		
-			starpu_access_mode previous_mode = handle->current_mode;
+			handle->busy_count++;
+
+			enum starpu_access_mode previous_mode = handle->current_mode;
 			handle->current_mode = r_mode;
 			handle->current_mode = r_mode;
 
 
 			/* In case we enter in a reduction mode, we invalidate all per
 			/* In case we enter in a reduction mode, we invalidate all per
@@ -314,14 +335,14 @@ void _starpu_notify_data_dependencies(starpu_data_handle handle)
 			 * kept intact because we'll reduce a valid copy of the
 			 * kept intact because we'll reduce a valid copy of the
 			 * "per-node replicate" with the per-worker replicates .*/
 			 * "per-node replicate" with the per-worker replicates .*/
 			if ((r_mode == STARPU_REDUX) && (previous_mode != STARPU_REDUX))
 			if ((r_mode == STARPU_REDUX) && (previous_mode != STARPU_REDUX))
-				starpu_data_start_reduction_mode(handle);
+				_starpu_data_start_reduction_mode(handle);
 
 
 			_starpu_spin_unlock(&handle->header_lock);
 			_starpu_spin_unlock(&handle->header_lock);
 
 
 			if (r->is_requested_by_codelet)
 			if (r->is_requested_by_codelet)
 			{
 			{
 				if (!unlock_one_requester(r))
 				if (!unlock_one_requester(r))
-					_starpu_push_task(r->j, 0);
+					_starpu_push_task(r->j);
 			}
 			}
 			else
 			else
 			{
 			{
@@ -331,9 +352,12 @@ void _starpu_notify_data_dependencies(starpu_data_handle handle)
 				r->ready_data_callback(r->argcb);
 				r->ready_data_callback(r->argcb);
 			}
 			}
 
 
-			starpu_data_requester_delete(r);
-			
+			_starpu_data_requester_delete(r);
+
 			_starpu_spin_lock(&handle->header_lock);
 			_starpu_spin_lock(&handle->header_lock);
+			STARPU_ASSERT(handle->busy_count > 0);
+			handle->busy_count--;
+			_starpu_data_check_not_busy(handle);
 		}
 		}
 	}
 	}
 }
 }

+ 6 - 6
src/core/dependencies/data_concurrency.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010  Université de Bordeaux 1
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,13 +20,13 @@
 
 
 #include <core/jobs.h>
 #include <core/jobs.h>
 
 
-unsigned _starpu_submit_job_enforce_data_deps(starpu_job_t j);
+unsigned _starpu_submit_job_enforce_data_deps(struct _starpu_job *j);
 
 
-void _starpu_notify_data_dependencies(starpu_data_handle handle);
+void _starpu_notify_data_dependencies(starpu_data_handle_t handle);
 
 
-unsigned _starpu_attempt_to_submit_data_request_from_apps(starpu_data_handle handle,
-		starpu_access_mode mode,
-		void (*callback)(void *), void *argcb);
+unsigned _starpu_attempt_to_submit_data_request_from_apps(starpu_data_handle_t handle,
+							  enum starpu_access_mode mode,
+							  void (*callback)(void *), void *argcb);
 
 
 #endif // __DATA_CONCURRENCY_H__
 #endif // __DATA_CONCURRENCY_H__
 
 

+ 5 - 5
src/core/dependencies/dependencies.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -24,15 +24,15 @@
 #include <core/sched_policy.h>
 #include <core/sched_policy.h>
 #include <core/dependencies/data_concurrency.h>
 #include <core/dependencies/data_concurrency.h>
 
 
-/* We assume that j->sync_mutex is taken by the caller */
-void _starpu_notify_dependencies(struct starpu_job_s *j)
+/* We assume that the job will not disappear under our hands */
+void _starpu_notify_dependencies(struct _starpu_job *j)
 {
 {
 	STARPU_ASSERT(j);
 	STARPU_ASSERT(j);
 	STARPU_ASSERT(j->task);
 	STARPU_ASSERT(j->task);
 
 
 	/* unlock tasks depending on that task */
 	/* unlock tasks depending on that task */
 	_starpu_notify_task_dependencies(j);
 	_starpu_notify_task_dependencies(j);
-	
+
 	/* unlock tags depending on that task */
 	/* unlock tags depending on that task */
 	if (j->task->use_tag)
 	if (j->task->use_tag)
 		_starpu_notify_tag_dependencies(j->tag);
 		_starpu_notify_tag_dependencies(j->tag);

+ 69 - 47
src/core/dependencies/htable.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -18,31 +18,30 @@
 #include <core/dependencies/htable.h>
 #include <core/dependencies/htable.h>
 #include <string.h>
 #include <string.h>
 
 
-void *_starpu_htbl_search_tag(starpu_htbl_node_t *htbl, starpu_tag_t tag)
+void *_starpu_htbl_search_tag(struct _starpu_htbl_node *htbl, starpu_tag_t tag)
 {
 {
 	unsigned currentbit;
 	unsigned currentbit;
-	starpu_htbl_node_t *current_htbl = htbl;
+	struct _starpu_htbl_node *current_htbl = htbl;
 
 
-	/* 000000000001111 with STARPU_HTBL_NODE_SIZE 1's */
-	starpu_tag_t mask = (1<<STARPU_HTBL_NODE_SIZE)-1;
+	/* 000000000001111 with _STARPU_HTBL_NODE_SIZE 1's */
+	starpu_tag_t mask = (1<<_STARPU_HTBL_NODE_SIZE)-1;
 
 
-	for(currentbit = 0; currentbit < STARPU_TAG_SIZE; currentbit+=STARPU_HTBL_NODE_SIZE)
+	for(currentbit = 0; currentbit < _STARPU_TAG_SIZE; currentbit+=_STARPU_HTBL_NODE_SIZE)
 	{
 	{
-	
 	//	printf("search : current bit = %d \n", currentbit);
 	//	printf("search : current bit = %d \n", currentbit);
 		if (STARPU_UNLIKELY(current_htbl == NULL))
 		if (STARPU_UNLIKELY(current_htbl == NULL))
 			return NULL;
 			return NULL;
 
 
-		/* 0000000000001111 
+		/* 0000000000001111
 		 *     | currentbit
 		 *     | currentbit
 		 * 0000111100000000 = offloaded_mask
 		 * 0000111100000000 = offloaded_mask
 		 *         |last_currentbit
 		 *         |last_currentbit
 		 * */
 		 * */
 
 
-		unsigned last_currentbit = 
-			STARPU_TAG_SIZE - (currentbit + STARPU_HTBL_NODE_SIZE);
+		unsigned last_currentbit =
+			_STARPU_TAG_SIZE - (currentbit + _STARPU_HTBL_NODE_SIZE);
 		starpu_tag_t offloaded_mask = mask << last_currentbit;
 		starpu_tag_t offloaded_mask = mask << last_currentbit;
-		unsigned current_index = 
+		unsigned current_index =
 			(tag & (offloaded_mask)) >> (last_currentbit);
 			(tag & (offloaded_mask)) >> (last_currentbit);
 
 
 		current_htbl = current_htbl->children[current_index];
 		current_htbl = current_htbl->children[current_index];
@@ -55,49 +54,48 @@ void *_starpu_htbl_search_tag(starpu_htbl_node_t *htbl, starpu_tag_t tag)
  * returns the previous value of the tag, or NULL else
  * returns the previous value of the tag, or NULL else
  */
  */
 
 
-void *_starpu_htbl_insert_tag(starpu_htbl_node_t **htbl, starpu_tag_t tag, void *entry)
+void *_starpu_htbl_insert_tag(struct _starpu_htbl_node **htbl, starpu_tag_t tag, void *entry)
 {
 {
-
 	unsigned currentbit;
 	unsigned currentbit;
-	starpu_htbl_node_t **current_htbl_ptr = htbl;
-	starpu_htbl_node_t *previous_htbl_ptr = NULL;
+	struct _starpu_htbl_node **current_htbl_ptr = htbl;
+	struct _starpu_htbl_node *previous_htbl_ptr = NULL;
 
 
-	/* 000000000001111 with STARPU_HTBL_NODE_SIZE 1's */
-	starpu_tag_t mask = (1<<STARPU_HTBL_NODE_SIZE)-1;
+	/* 000000000001111 with _STARPU_HTBL_NODE_SIZE 1's */
+	starpu_tag_t mask = (1<<_STARPU_HTBL_NODE_SIZE)-1;
 
 
-	for(currentbit = 0; currentbit < STARPU_TAG_SIZE; currentbit+=STARPU_HTBL_NODE_SIZE)
+	for(currentbit = 0; currentbit < _STARPU_TAG_SIZE; currentbit+=_STARPU_HTBL_NODE_SIZE)
 	{
 	{
-		if (*current_htbl_ptr == NULL) {
+		if (*current_htbl_ptr == NULL)
+		{
 			/* TODO pad to change that 1 into 16 ? */
 			/* TODO pad to change that 1 into 16 ? */
-			*current_htbl_ptr = (starpu_htbl_node_t *) calloc(1, sizeof(starpu_htbl_node_t));
-			assert(*current_htbl_ptr);
+			*current_htbl_ptr = (struct _starpu_htbl_node *) calloc(1, sizeof(struct _starpu_htbl_node));
+			STARPU_ASSERT(*current_htbl_ptr);
 
 
 			if (previous_htbl_ptr)
 			if (previous_htbl_ptr)
 				previous_htbl_ptr->nentries++;
 				previous_htbl_ptr->nentries++;
 		}
 		}
 
 
-		/* 0000000000001111 
+		/* 0000000000001111
 		 *     | currentbit
 		 *     | currentbit
 		 * 0000111100000000 = offloaded_mask
 		 * 0000111100000000 = offloaded_mask
 		 *         |last_currentbit
 		 *         |last_currentbit
 		 * */
 		 * */
 
 
-		unsigned last_currentbit = 
-			STARPU_TAG_SIZE - (currentbit + STARPU_HTBL_NODE_SIZE);
+		unsigned last_currentbit =
+			_STARPU_TAG_SIZE - (currentbit + _STARPU_HTBL_NODE_SIZE);
 		starpu_tag_t offloaded_mask = mask << last_currentbit;
 		starpu_tag_t offloaded_mask = mask << last_currentbit;
-		unsigned current_index = 
+		unsigned current_index =
 			(tag & (offloaded_mask)) >> (last_currentbit);
 			(tag & (offloaded_mask)) >> (last_currentbit);
 
 
 		previous_htbl_ptr = *current_htbl_ptr;
 		previous_htbl_ptr = *current_htbl_ptr;
-		current_htbl_ptr = 
+		current_htbl_ptr =
 			&((*current_htbl_ptr)->children[current_index]);
 			&((*current_htbl_ptr)->children[current_index]);
-
 	}
 	}
 
 
-	/* current_htbl either contains NULL or a previous entry 
+	/* current_htbl either contains NULL or a previous entry
 	 * we overwrite it anyway */
 	 * we overwrite it anyway */
 	void *old_entry = *current_htbl_ptr;
 	void *old_entry = *current_htbl_ptr;
-	*current_htbl_ptr = (starpu_htbl_node_t *) entry;
+	*current_htbl_ptr = (struct _starpu_htbl_node *) entry;
 
 
 	if (!old_entry)
 	if (!old_entry)
 		previous_htbl_ptr->nentries++;
 		previous_htbl_ptr->nentries++;
@@ -106,43 +104,48 @@ void *_starpu_htbl_insert_tag(starpu_htbl_node_t **htbl, starpu_tag_t tag, void
 }
 }
 
 
 /* returns the entry corresponding to the tag and remove it from the htbl */
 /* returns the entry corresponding to the tag and remove it from the htbl */
-void *_starpu_htbl_remove_tag(starpu_htbl_node_t *htbl, starpu_tag_t tag)
+void *_starpu_htbl_remove_tag(struct _starpu_htbl_node **htbl, starpu_tag_t tag)
 {
 {
 	/* NB : if the entry is "NULL", we assume this means it is not present XXX */
 	/* NB : if the entry is "NULL", we assume this means it is not present XXX */
 	unsigned currentbit;
 	unsigned currentbit;
-	starpu_htbl_node_t *current_htbl_ptr = htbl;
+	struct _starpu_htbl_node **current_htbl_ptr_parent = htbl;
+	struct _starpu_htbl_node *current_htbl_ptr = *current_htbl_ptr_parent;
 
 
 	/* remember the path to the tag */
 	/* remember the path to the tag */
-	starpu_htbl_node_t *path[(STARPU_TAG_SIZE + STARPU_HTBL_NODE_SIZE - 1)/(STARPU_HTBL_NODE_SIZE)];
+	struct _starpu_htbl_node *path[(_STARPU_TAG_SIZE + _STARPU_HTBL_NODE_SIZE - 1)/(_STARPU_HTBL_NODE_SIZE)];
+	struct _starpu_htbl_node **path_parent[(_STARPU_TAG_SIZE + _STARPU_HTBL_NODE_SIZE - 1)/(_STARPU_HTBL_NODE_SIZE)];
 
 
-	/* 000000000001111 with STARPU_HTBL_NODE_SIZE 1's */
-	starpu_tag_t mask = (1<<STARPU_HTBL_NODE_SIZE)-1;
+	/* 000000000001111 with _STARPU_HTBL_NODE_SIZE 1's */
+	starpu_tag_t mask = (1<<_STARPU_HTBL_NODE_SIZE)-1;
 	int level, maxlevel;
 	int level, maxlevel;
 	unsigned tag_is_present = 1;
 	unsigned tag_is_present = 1;
 
 
-	for(currentbit = 0, level = 0; currentbit < STARPU_TAG_SIZE; currentbit+=STARPU_HTBL_NODE_SIZE, level++)
+	for(currentbit = 0, level = 0; currentbit < _STARPU_TAG_SIZE; currentbit+=_STARPU_HTBL_NODE_SIZE, level++)
 	{
 	{
+		path_parent[level] = current_htbl_ptr_parent;
 		path[level] = current_htbl_ptr;
 		path[level] = current_htbl_ptr;
 
 
-		if (STARPU_UNLIKELY(!current_htbl_ptr)) {
+		if (STARPU_UNLIKELY(!current_htbl_ptr))
+		{
 			tag_is_present = 0;
 			tag_is_present = 0;
 			break;
 			break;
 		}
 		}
 
 
-		/* 0000000000001111 
+		/* 0000000000001111
 		 *     | currentbit
 		 *     | currentbit
 		 * 0000111100000000 = offloaded_mask
 		 * 0000111100000000 = offloaded_mask
 		 *         |last_currentbit
 		 *         |last_currentbit
 		 * */
 		 * */
 
 
-		unsigned last_currentbit = 
-			STARPU_TAG_SIZE - (currentbit + STARPU_HTBL_NODE_SIZE);
+		unsigned last_currentbit =
+			_STARPU_TAG_SIZE - (currentbit + _STARPU_HTBL_NODE_SIZE);
 		starpu_tag_t offloaded_mask = mask << last_currentbit;
 		starpu_tag_t offloaded_mask = mask << last_currentbit;
-		unsigned current_index = 
+		unsigned current_index =
 			(tag & (offloaded_mask)) >> (last_currentbit);
 			(tag & (offloaded_mask)) >> (last_currentbit);
-		
-		current_htbl_ptr = 
-			current_htbl_ptr->children[current_index];
+
+		current_htbl_ptr_parent = 
+			&current_htbl_ptr->children[current_index];
+		current_htbl_ptr = *current_htbl_ptr_parent;
 	}
 	}
 
 
 	maxlevel = level;
 	maxlevel = level;
@@ -151,8 +154,9 @@ void *_starpu_htbl_remove_tag(starpu_htbl_node_t *htbl, starpu_tag_t tag)
 
 
 	void *old_entry = current_htbl_ptr;
 	void *old_entry = current_htbl_ptr;
 
 
-	if (tag_is_present) {
-		/* the tag was in the htbl, so we have to unroll the search 
+	if (tag_is_present)
+	{
+		/* the tag was in the htbl, so we have to unroll the search
  		 * to remove possibly useless htbl (internal) nodes */
  		 * to remove possibly useless htbl (internal) nodes */
 		for (level = maxlevel - 1; level >= 0; level--)
 		for (level = maxlevel - 1; level >= 0; level--)
 		{
 		{
@@ -166,10 +170,28 @@ void *_starpu_htbl_remove_tag(starpu_htbl_node_t *htbl, starpu_tag_t tag)
 				break;
 				break;
 
 
 			/* we remove this node */
 			/* we remove this node */
-			free(path[level]);
+			//free(path[level]);
+			*(path_parent[level]) = NULL;
 		}
 		}
 	}
 	}
 
 
 	/* we return the entry if there was one */
 	/* we return the entry if there was one */
 	return old_entry;
 	return old_entry;
 }
 }
+
+void _starpu_htbl_clear_tags(struct _starpu_htbl_node **htbl, unsigned level, void (*free_entry)(void *))
+{
+	unsigned i;
+	struct _starpu_htbl_node *tbl = *htbl;
+
+	if (!tbl)
+		return;
+
+	if (level * _STARPU_HTBL_NODE_SIZE < _STARPU_TAG_SIZE) {
+		for (i = 0; i < 1<<_STARPU_HTBL_NODE_SIZE; i++)
+			_starpu_htbl_clear_tags(&tbl->children[i], level + 1, free_entry);
+		free(tbl);
+	} else
+		free_entry(tbl);
+	*htbl = NULL;
+}

+ 11 - 9
src/core/dependencies/htable.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -28,16 +28,18 @@
 #include <assert.h>
 #include <assert.h>
 #include <core/dependencies/tags.h>
 #include <core/dependencies/tags.h>
 
 
-#define STARPU_HTBL_NODE_SIZE	16
+#define _STARPU_HTBL_NODE_SIZE	16
 
 
-typedef struct starpu_htbl_node_s {
+struct _starpu_htbl_node
+{
 	unsigned nentries;
 	unsigned nentries;
-	struct starpu_htbl_node_s *children[1<<STARPU_HTBL_NODE_SIZE];
-} starpu_htbl_node_t;
+	struct _starpu_htbl_node *children[1<<_STARPU_HTBL_NODE_SIZE];
+};
 
 
-void *_starpu_htbl_search_tag(starpu_htbl_node_t *htbl, starpu_tag_t tag);
-void *_starpu_htbl_insert_tag(starpu_htbl_node_t **htbl, starpu_tag_t tag, void *entry);
-void *_starpu_htbl_remove_tag(starpu_htbl_node_t *htbl, starpu_tag_t tag);
+void *_starpu_htbl_search_tag(struct _starpu_htbl_node *htbl, starpu_tag_t tag);
+void *_starpu_htbl_insert_tag(struct _starpu_htbl_node **htbl, starpu_tag_t tag, void *entry);
+void *_starpu_htbl_remove_tag(struct _starpu_htbl_node **htbl, starpu_tag_t tag);
+void _starpu_htbl_clear_tags(struct _starpu_htbl_node **htbl, unsigned level, void (*free_entry)(void*));
 
 
 
 
 #endif
 #endif

+ 125 - 106
src/core/dependencies/implicit_data_deps.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -28,21 +28,21 @@
 #endif
 #endif
 
 
 /* Read after Write (RAW) or Read after Read (RAR) */
 /* Read after Write (RAW) or Read after Read (RAR) */
-static void _starpu_add_reader_after_writer(starpu_data_handle handle, struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task)
+static void _starpu_add_reader_after_writer(starpu_data_handle_t handle, struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task)
 {
 {
 	/* Add this task to the list of readers */
 	/* Add this task to the list of readers */
-	struct starpu_task_wrapper_list *link = (struct starpu_task_wrapper_list *) malloc(sizeof(struct starpu_task_wrapper_list));
+	struct _starpu_task_wrapper_list *link = (struct _starpu_task_wrapper_list *) malloc(sizeof(struct _starpu_task_wrapper_list));
 	link->task = post_sync_task;
 	link->task = post_sync_task;
 	link->next = handle->last_submitted_readers;
 	link->next = handle->last_submitted_readers;
 	handle->last_submitted_readers = link;
 	handle->last_submitted_readers = link;
 
 
 	/* This task depends on the previous writer if any */
 	/* This task depends on the previous writer if any */
-	if (handle->last_submitted_writer)
+	if (handle->last_submitted_writer && handle->last_submitted_writer != post_sync_task)
 	{
 	{
 		_STARPU_DEP_DEBUG("RAW %p\n", handle);
 		_STARPU_DEP_DEBUG("RAW %p\n", handle);
 		struct starpu_task *task_array[1] = {handle->last_submitted_writer};
 		struct starpu_task *task_array[1] = {handle->last_submitted_writer};
 		_STARPU_DEP_DEBUG("dep %p -> %p\n", handle->last_submitted_writer, pre_sync_task);
 		_STARPU_DEP_DEBUG("dep %p -> %p\n", handle->last_submitted_writer, pre_sync_task);
-		starpu_task_declare_deps_array(pre_sync_task, 1, task_array);
+		_starpu_task_declare_deps_array(pre_sync_task, 1, task_array, 0);
 	}
 	}
         else
         else
         {
         {
@@ -58,57 +58,67 @@ static void _starpu_add_reader_after_writer(starpu_data_handle handle, struct st
 #endif
 #endif
 		handle->last_submitted_ghost_writer_id_is_valid)
 		handle->last_submitted_ghost_writer_id_is_valid)
 	{
 	{
-		starpu_job_t pre_sync_job = _starpu_get_job_associated_to_task(pre_sync_task);
-		STARPU_TRACE_GHOST_TASK_DEPS(handle->last_submitted_ghost_writer_id, pre_sync_job->job_id);
+		struct _starpu_job *pre_sync_job = _starpu_get_job_associated_to_task(pre_sync_task);
+		_STARPU_TRACE_GHOST_TASK_DEPS(handle->last_submitted_ghost_writer_id, pre_sync_job->job_id);
 		_starpu_bound_job_id_dep(pre_sync_job, handle->last_submitted_ghost_writer_id);
 		_starpu_bound_job_id_dep(pre_sync_job, handle->last_submitted_ghost_writer_id);
 		_STARPU_DEP_DEBUG("dep ID%lu -> %p\n", handle->last_submitted_ghost_writer_id, pre_sync_task);
 		_STARPU_DEP_DEBUG("dep ID%lu -> %p\n", handle->last_submitted_ghost_writer_id, pre_sync_task);
 	}
 	}
+
+	if (!pre_sync_task->cl)
+		_starpu_get_job_associated_to_task(pre_sync_task)->implicit_dep_handle = handle;
 }
 }
 
 
 /* Write after Read (WAR) */
 /* Write after Read (WAR) */
-static void _starpu_add_writer_after_readers(starpu_data_handle handle, struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task)
+static void _starpu_add_writer_after_readers(starpu_data_handle_t handle, struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task)
 {
 {
 	/* Count the readers */
 	/* Count the readers */
 	unsigned nreaders = 0;
 	unsigned nreaders = 0;
-	struct starpu_task_wrapper_list *l;
+	struct _starpu_task_wrapper_list *l;
 	l = handle->last_submitted_readers;
 	l = handle->last_submitted_readers;
 	while (l)
 	while (l)
 	{
 	{
-		nreaders++;
+		if (l->task != post_sync_task)
+			nreaders++;
 		l = l->next;
 		l = l->next;
 	}
 	}
 	_STARPU_DEP_DEBUG("%d readers\n", nreaders);
 	_STARPU_DEP_DEBUG("%d readers\n", nreaders);
 
 
-	/* Put all tasks in the list into task_array */
-	struct starpu_task *task_array[nreaders];
-	unsigned i = 0;
-	l = handle->last_submitted_readers;
-	while (l)
+	if (nreaders > 0)
 	{
 	{
-		STARPU_ASSERT(l->task);
-		task_array[i++] = l->task;
-		_STARPU_DEP_DEBUG("dep %p -> %p\n", l->task, pre_sync_task);
+		/* Put all tasks in the list into task_array */
+		struct starpu_task *task_array[nreaders];
+		unsigned i = 0;
+		l = handle->last_submitted_readers;
+		while (l)
+		{
+			STARPU_ASSERT(l->task);
+			if (l->task != post_sync_task) {
+				task_array[i++] = l->task;
+				_STARPU_DEP_DEBUG("dep %p -> %p\n", l->task, pre_sync_task);
+			}
 
 
-		struct starpu_task_wrapper_list *prev = l;
-		l = l->next;
-		free(prev);
+			struct _starpu_task_wrapper_list *prev = l;
+			l = l->next;
+			free(prev);
+		}
+		_starpu_task_declare_deps_array(pre_sync_task, nreaders, task_array, 0);
 	}
 	}
 #ifndef STARPU_USE_FXT
 #ifndef STARPU_USE_FXT
 	if (_starpu_bound_recording)
 	if (_starpu_bound_recording)
 #endif
 #endif
 	{
 	{
 		/* Declare all dependencies with ghost readers */
 		/* Declare all dependencies with ghost readers */
-		starpu_job_t pre_sync_job = _starpu_get_job_associated_to_task(pre_sync_task);
+		struct _starpu_job *pre_sync_job = _starpu_get_job_associated_to_task(pre_sync_task);
 
 
-		struct starpu_jobid_list *ghost_readers_id = handle->last_submitted_ghost_readers_id;
+		struct _starpu_jobid_list *ghost_readers_id = handle->last_submitted_ghost_readers_id;
 		while (ghost_readers_id)
 		while (ghost_readers_id)
 		{
 		{
 			unsigned long id = ghost_readers_id->id;
 			unsigned long id = ghost_readers_id->id;
-			STARPU_TRACE_GHOST_TASK_DEPS(id, pre_sync_job->job_id);
+			_STARPU_TRACE_GHOST_TASK_DEPS(id, pre_sync_job->job_id);
 			_starpu_bound_job_id_dep(pre_sync_job, id);
 			_starpu_bound_job_id_dep(pre_sync_job, id);
 			_STARPU_DEP_DEBUG("dep ID%lu -> %p\n", id, pre_sync_task);
 			_STARPU_DEP_DEBUG("dep ID%lu -> %p\n", id, pre_sync_task);
 
 
-			struct starpu_jobid_list *prev = ghost_readers_id;
+			struct _starpu_jobid_list *prev = ghost_readers_id;
 			ghost_readers_id = ghost_readers_id->next;
 			ghost_readers_id = ghost_readers_id->next;
 			free(prev);
 			free(prev);
 		}
 		}
@@ -118,17 +128,19 @@ static void _starpu_add_writer_after_readers(starpu_data_handle handle, struct s
 	handle->last_submitted_readers = NULL;
 	handle->last_submitted_readers = NULL;
 	handle->last_submitted_writer = post_sync_task;
 	handle->last_submitted_writer = post_sync_task;
 
 
-	starpu_task_declare_deps_array(pre_sync_task, nreaders, task_array);
+	if (!post_sync_task->cl)
+		_starpu_get_job_associated_to_task(post_sync_task)->implicit_dep_handle = handle;
 }
 }
+
 /* Write after Write (WAW) */
 /* Write after Write (WAW) */
-static void _starpu_add_writer_after_writer(starpu_data_handle handle, struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task)
+static void _starpu_add_writer_after_writer(starpu_data_handle_t handle, struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task)
 {
 {
 	/* (Read) Write */
 	/* (Read) Write */
 	/* This task depends on the previous writer */
 	/* This task depends on the previous writer */
-	if (handle->last_submitted_writer)
+	if (handle->last_submitted_writer && handle->last_submitted_writer != post_sync_task)
 	{
 	{
 		struct starpu_task *task_array[1] = {handle->last_submitted_writer};
 		struct starpu_task *task_array[1] = {handle->last_submitted_writer};
-		starpu_task_declare_deps_array(pre_sync_task, 1, task_array);
+		_starpu_task_declare_deps_array(pre_sync_task, 1, task_array, 0);
 		_STARPU_DEP_DEBUG("dep %p -> %p\n", handle->last_submitted_writer, pre_sync_task);
 		_STARPU_DEP_DEBUG("dep %p -> %p\n", handle->last_submitted_writer, pre_sync_task);
 	}
 	}
         else
         else
@@ -145,8 +157,8 @@ static void _starpu_add_writer_after_writer(starpu_data_handle handle, struct st
 	{
 	{
 		if (handle->last_submitted_ghost_writer_id_is_valid)
 		if (handle->last_submitted_ghost_writer_id_is_valid)
 		{
 		{
-			starpu_job_t pre_sync_job = _starpu_get_job_associated_to_task(pre_sync_task);
-			STARPU_TRACE_GHOST_TASK_DEPS(handle->last_submitted_ghost_writer_id, pre_sync_job->job_id);
+			struct _starpu_job *pre_sync_job = _starpu_get_job_associated_to_task(pre_sync_task);
+			_STARPU_TRACE_GHOST_TASK_DEPS(handle->last_submitted_ghost_writer_id, pre_sync_job->job_id);
 			_starpu_bound_job_id_dep(pre_sync_job, handle->last_submitted_ghost_writer_id);
 			_starpu_bound_job_id_dep(pre_sync_job, handle->last_submitted_ghost_writer_id);
 			_STARPU_DEP_DEBUG("dep ID%lu -> %p\n", handle->last_submitted_ghost_writer_id, pre_sync_task);
 			_STARPU_DEP_DEBUG("dep ID%lu -> %p\n", handle->last_submitted_ghost_writer_id, pre_sync_task);
 			handle->last_submitted_ghost_writer_id_is_valid = 0;
 			handle->last_submitted_ghost_writer_id_is_valid = 0;
@@ -158,21 +170,11 @@ static void _starpu_add_writer_after_writer(starpu_data_handle handle, struct st
 	}
 	}
 
 
 	handle->last_submitted_writer = post_sync_task;
 	handle->last_submitted_writer = post_sync_task;
-}
 
 
-static void disable_last_writer_callback(void *cl_arg)
-{
-	starpu_data_handle handle = (starpu_data_handle) cl_arg;
-	
-	/* NB: we don't take the handle->sequential_consistency_mutex mutex
-	 * because the empty task that is used for synchronization is going to
-	 * be unlock in the context of a call to
-	 * _starpu_detect_implicit_data_deps_with_handle. It will therefore
-	 * already have been locked. */
-	handle->last_submitted_writer = NULL;
+	if (!post_sync_task->cl)
+		_starpu_get_job_associated_to_task(post_sync_task)->implicit_dep_handle = handle;
 }
 }
 
 
-
 /* This function adds the implicit task dependencies introduced by data
 /* This function adds the implicit task dependencies introduced by data
  * sequential consistency. Two tasks are provided: pre_sync and post_sync which
  * sequential consistency. Two tasks are provided: pre_sync and post_sync which
  * respectively indicates which task is going to depend on the previous deps
  * respectively indicates which task is going to depend on the previous deps
@@ -180,24 +182,26 @@ static void disable_last_writer_callback(void *cl_arg)
  * introduced by a task submission, both tasks are just the submitted task, but
  * introduced by a task submission, both tasks are just the submitted task, but
  * in the case of user interactions with the DSM, these may be different tasks.
  * in the case of user interactions with the DSM, these may be different tasks.
  * */
  * */
-/* NB : handle->sequential_consistency_mutex must be hold by the caller */
-void _starpu_detect_implicit_data_deps_with_handle(struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task,
-						starpu_data_handle handle, starpu_access_mode mode)
+/* NB : handle->sequential_consistency_mutex must be hold by the caller;
+ * returns a task, to be submitted after releasing that mutex. */
+struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task,
+						   starpu_data_handle_t handle, enum starpu_access_mode mode)
 {
 {
+	struct starpu_task *task = NULL;
+
 	STARPU_ASSERT(!(mode & STARPU_SCRATCH));
 	STARPU_ASSERT(!(mode & STARPU_SCRATCH));
         _STARPU_LOG_IN();
         _STARPU_LOG_IN();
 
 
 	if (handle->sequential_consistency)
 	if (handle->sequential_consistency)
 	{
 	{
-		starpu_job_t pre_sync_job = _starpu_get_job_associated_to_task(pre_sync_task);
-		starpu_job_t post_sync_job = _starpu_get_job_associated_to_task(post_sync_task);
+		struct _starpu_job *pre_sync_job = _starpu_get_job_associated_to_task(pre_sync_task);
+		struct _starpu_job *post_sync_job = _starpu_get_job_associated_to_task(post_sync_task);
 
 
 		/* Skip tasks that are associated to a reduction phase so that
 		/* Skip tasks that are associated to a reduction phase so that
 		 * they do not interfere with the application. */
 		 * they do not interfere with the application. */
 		if (pre_sync_job->reduction_task || post_sync_job->reduction_task)
 		if (pre_sync_job->reduction_task || post_sync_job->reduction_task)
-			return;
-	
-	
+			return NULL;
+
 		_STARPU_DEP_DEBUG("Tasks %p %p\n", pre_sync_task, post_sync_task);
 		_STARPU_DEP_DEBUG("Tasks %p %p\n", pre_sync_task, post_sync_task);
 		/* In case we are generating the DAG, we add an implicit
 		/* In case we are generating the DAG, we add an implicit
 		 * dependency between the pre and the post sync tasks in case
 		 * dependency between the pre and the post sync tasks in case
@@ -208,12 +212,12 @@ void _starpu_detect_implicit_data_deps_with_handle(struct starpu_task *pre_sync_
 #endif
 #endif
 		)
 		)
 		{
 		{
-			STARPU_TRACE_GHOST_TASK_DEPS(pre_sync_job->job_id, post_sync_job->job_id);
+			_STARPU_TRACE_GHOST_TASK_DEPS(pre_sync_job->job_id, post_sync_job->job_id);
 			_starpu_bound_task_dep(post_sync_job, pre_sync_job);
 			_starpu_bound_task_dep(post_sync_job, pre_sync_job);
 		}
 		}
 
 
-		starpu_access_mode previous_mode = handle->last_submitted_mode;
-	
+		enum starpu_access_mode previous_mode = handle->last_submitted_mode;
+
 		if (mode & STARPU_W)
 		if (mode & STARPU_W)
 		{
 		{
 			_STARPU_DEP_DEBUG("W %p\n", handle);
 			_STARPU_DEP_DEBUG("W %p\n", handle);
@@ -222,17 +226,17 @@ void _starpu_detect_implicit_data_deps_with_handle(struct starpu_task *pre_sync_
 				_STARPU_DEP_DEBUG("WAW %p\n", handle);
 				_STARPU_DEP_DEBUG("WAW %p\n", handle);
 				_starpu_add_writer_after_writer(handle, pre_sync_task, post_sync_task);
 				_starpu_add_writer_after_writer(handle, pre_sync_task, post_sync_task);
 			}
 			}
-			else {
+			else
+			{
 				/* The task submitted previously were in read-only
 				/* The task submitted previously were in read-only
 				 * mode: this task must depend on all those read-only
 				 * mode: this task must depend on all those read-only
 				 * tasks and we get rid of the list of readers */
 				 * tasks and we get rid of the list of readers */
-			
 				_STARPU_DEP_DEBUG("WAR %p\n", handle);
 				_STARPU_DEP_DEBUG("WAR %p\n", handle);
 				_starpu_add_writer_after_readers(handle, pre_sync_task, post_sync_task);
 				_starpu_add_writer_after_readers(handle, pre_sync_task, post_sync_task);
 			}
 			}
-	
 		}
 		}
-		else {
+		else
+		{
 			_STARPU_DEP_DEBUG("R %p %d -> %d\n", handle, previous_mode, mode);
 			_STARPU_DEP_DEBUG("R %p %d -> %d\n", handle, previous_mode, mode);
 			/* Add a reader, after a writer or a reader. */
 			/* Add a reader, after a writer or a reader. */
 			STARPU_ASSERT(pre_sync_task);
 			STARPU_ASSERT(pre_sync_task);
@@ -253,23 +257,20 @@ void _starpu_detect_implicit_data_deps_with_handle(struct starpu_task *pre_sync_
 				new_sync_task = starpu_task_create();
 				new_sync_task = starpu_task_create();
 				STARPU_ASSERT(new_sync_task);
 				STARPU_ASSERT(new_sync_task);
 				new_sync_task->cl = NULL;
 				new_sync_task->cl = NULL;
-				new_sync_task->callback_func = disable_last_writer_callback;
-				new_sync_task->callback_arg = handle;
 #ifdef STARPU_USE_FXT
 #ifdef STARPU_USE_FXT
 				_starpu_get_job_associated_to_task(new_sync_task)->model_name = "sync_task_redux";
 				_starpu_get_job_associated_to_task(new_sync_task)->model_name = "sync_task_redux";
 #endif
 #endif
 
 
 				_starpu_add_writer_after_readers(handle, new_sync_task, new_sync_task);
 				_starpu_add_writer_after_readers(handle, new_sync_task, new_sync_task);
 
 
-				_starpu_task_submit_internal(new_sync_task);
+				task = new_sync_task;
 			}
 			}
-	
 			_starpu_add_reader_after_writer(handle, pre_sync_task, post_sync_task);
 			_starpu_add_reader_after_writer(handle, pre_sync_task, post_sync_task);
 		}
 		}
-	
 		handle->last_submitted_mode = mode;
 		handle->last_submitted_mode = mode;
 	}
 	}
         _STARPU_LOG_OUT();
         _STARPU_LOG_OUT();
+	return task;
 }
 }
 
 
 /* Create the implicit dependencies for a newly submitted task */
 /* Create the implicit dependencies for a newly submitted task */
@@ -280,7 +281,7 @@ void _starpu_detect_implicit_data_deps(struct starpu_task *task)
 
 
 	/* We don't want to enforce a sequential consistency for tasks that are
 	/* We don't want to enforce a sequential consistency for tasks that are
 	 * not visible to the application. */
 	 * not visible to the application. */
-	starpu_job_t j = _starpu_get_job_associated_to_task(task);
+	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
 	if (j->reduction_task)
 	if (j->reduction_task)
 		return;
 		return;
 
 
@@ -289,16 +290,21 @@ void _starpu_detect_implicit_data_deps(struct starpu_task *task)
 	unsigned buffer;
 	unsigned buffer;
 	for (buffer = 0; buffer < nbuffers; buffer++)
 	for (buffer = 0; buffer < nbuffers; buffer++)
 	{
 	{
-		starpu_data_handle handle = task->buffers[buffer].handle;
-		starpu_access_mode mode = task->buffers[buffer].mode;
+		starpu_data_handle_t handle = task->handles[buffer];
+		enum starpu_access_mode mode = task->cl->modes[buffer];
+		struct starpu_task *new_task;
 
 
 		/* Scratch memory does not introduce any deps */
 		/* Scratch memory does not introduce any deps */
 		if (mode & STARPU_SCRATCH)
 		if (mode & STARPU_SCRATCH)
 			continue;
 			continue;
 
 
-		PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
-		_starpu_detect_implicit_data_deps_with_handle(task, task, handle, mode);
-		PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
+		_STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
+		new_task = _starpu_detect_implicit_data_deps_with_handle(task, task, handle, mode);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
+		if (new_task) {
+			int ret = starpu_task_submit_internal(new_task);
+			STARPU_ASSERT(!ret);
+		}
 	}
 	}
         _STARPU_LOG_OUT();
         _STARPU_LOG_OUT();
 }
 }
@@ -311,9 +317,10 @@ void _starpu_detect_implicit_data_deps(struct starpu_task *task)
  * sequence, f(Ar) g(Ar) h(Aw), we expect to have h depend on both f and g, but
  * sequence, f(Ar) g(Ar) h(Aw), we expect to have h depend on both f and g, but
  * if h is submitted after the termination of f or g, StarPU will not create a
  * if h is submitted after the termination of f or g, StarPU will not create a
  * dependency as this is not needed anymore. */
  * dependency as this is not needed anymore. */
-void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *task, starpu_data_handle handle)
+/* the sequential_consistency_mutex of the handle has to be already held */
+void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *task, starpu_data_handle_t handle)
 {
 {
-	PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
 
 
 	if (handle->sequential_consistency)
 	if (handle->sequential_consistency)
 	{
 	{
@@ -323,19 +330,18 @@ void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *tas
 		if (task == handle->last_submitted_writer)
 		if (task == handle->last_submitted_writer)
 		{
 		{
 			handle->last_submitted_writer = NULL;
 			handle->last_submitted_writer = NULL;
-			
+
 #ifndef STARPU_USE_FXT
 #ifndef STARPU_USE_FXT
 			if (_starpu_bound_recording)
 			if (_starpu_bound_recording)
 #endif
 #endif
 			{
 			{
 				/* Save the previous writer as the ghost last writer */
 				/* Save the previous writer as the ghost last writer */
 				handle->last_submitted_ghost_writer_id_is_valid = 1;
 				handle->last_submitted_ghost_writer_id_is_valid = 1;
-				starpu_job_t ghost_job = _starpu_get_job_associated_to_task(task);
+				struct _starpu_job *ghost_job = _starpu_get_job_associated_to_task(task);
 				handle->last_submitted_ghost_writer_id = ghost_job->job_id;
 				handle->last_submitted_ghost_writer_id = ghost_job->job_id;
 			}
 			}
-			
 		}
 		}
-		
+
 		/* XXX can a task be both the last writer associated to a data
 		/* XXX can a task be both the last writer associated to a data
 		 * and be in its list of readers ? If not, we should not go
 		 * and be in its list of readers ? If not, we should not go
 		 * through the entire list once we have detected it was the
 		 * through the entire list once we have detected it was the
@@ -343,12 +349,15 @@ void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *tas
 
 
 		/* Same if this is one of the readers: we go through the list
 		/* Same if this is one of the readers: we go through the list
 		 * of readers and remove the task if it is found. */
 		 * of readers and remove the task if it is found. */
-		struct starpu_task_wrapper_list *l;
+		struct _starpu_task_wrapper_list *l;
 		l = handle->last_submitted_readers;
 		l = handle->last_submitted_readers;
-		struct starpu_task_wrapper_list *prev = NULL;
+		struct _starpu_task_wrapper_list *prev = NULL;
+#ifdef STARPU_DEVEL
+#warning TODO: use double-linked list to make finding ourself fast
+#endif
 		while (l)
 		while (l)
 		{
 		{
-			struct starpu_task_wrapper_list *next = l->next;
+			struct _starpu_task_wrapper_list *next = l->next;
 
 
 			if (l->task == task)
 			if (l->task == task)
 			{
 			{
@@ -360,11 +369,11 @@ void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *tas
 #endif
 #endif
 				{
 				{
 					/* Save the job id of the reader task in the ghost reader linked list list */
 					/* Save the job id of the reader task in the ghost reader linked list list */
-					starpu_job_t ghost_reader_job = _starpu_get_job_associated_to_task(task);
-					struct starpu_jobid_list *link = (struct starpu_jobid_list *) malloc(sizeof(struct starpu_jobid_list));
+					struct _starpu_job *ghost_reader_job = _starpu_get_job_associated_to_task(task);
+					struct _starpu_jobid_list *link = (struct _starpu_jobid_list *) malloc(sizeof(struct _starpu_jobid_list));
 					STARPU_ASSERT(link);
 					STARPU_ASSERT(link);
 					link->next = handle->last_submitted_ghost_readers_id;
 					link->next = handle->last_submitted_ghost_readers_id;
-					link->id = ghost_reader_job->job_id; 
+					link->id = ghost_reader_job->job_id;
 					handle->last_submitted_ghost_readers_id = link;
 					handle->last_submitted_ghost_readers_id = link;
 				}
 				}
 
 
@@ -372,7 +381,8 @@ void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *tas
 				{
 				{
 					prev->next = next;
 					prev->next = next;
 				}
 				}
-				else {
+				else
+				{
 					/* This is the first element of the list */
 					/* This is the first element of the list */
 					handle->last_submitted_readers = next;
 					handle->last_submitted_readers = next;
 				}
 				}
@@ -383,7 +393,8 @@ void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *tas
 				 * as soon as we find the task. TODO: check how
 				 * as soon as we find the task. TODO: check how
 				 * duplicate dependencies are treated. */
 				 * duplicate dependencies are treated. */
 			}
 			}
-			else {
+			else
+			{
 				prev = l;
 				prev = l;
 			}
 			}
 
 
@@ -391,34 +402,34 @@ void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *tas
 		}
 		}
 	}
 	}
 
 
-	PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
 }
 }
 
 
-void _starpu_add_post_sync_tasks(struct starpu_task *post_sync_task, starpu_data_handle handle)
+void _starpu_add_post_sync_tasks(struct starpu_task *post_sync_task, starpu_data_handle_t handle)
 {
 {
         _STARPU_LOG_IN();
         _STARPU_LOG_IN();
-	PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
 
 
 	if (handle->sequential_consistency)
 	if (handle->sequential_consistency)
 	{
 	{
 		handle->post_sync_tasks_cnt++;
 		handle->post_sync_tasks_cnt++;
 
 
-		struct starpu_task_wrapper_list *link = (struct starpu_task_wrapper_list *) malloc(sizeof(struct starpu_task_wrapper_list));
+		struct _starpu_task_wrapper_list *link = (struct _starpu_task_wrapper_list *) malloc(sizeof(struct _starpu_task_wrapper_list));
 		link->task = post_sync_task;
 		link->task = post_sync_task;
 		link->next = handle->post_sync_tasks;
 		link->next = handle->post_sync_tasks;
-		handle->post_sync_tasks = link;		
+		handle->post_sync_tasks = link;
 	}
 	}
 
 
-	PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
         _STARPU_LOG_OUT();
         _STARPU_LOG_OUT();
 }
 }
 
 
-void _starpu_unlock_post_sync_tasks(starpu_data_handle handle)
+void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle)
 {
 {
-	struct starpu_task_wrapper_list *post_sync_tasks = NULL;
+	struct _starpu_task_wrapper_list *post_sync_tasks = NULL;
 	unsigned do_submit_tasks = 0;
 	unsigned do_submit_tasks = 0;
 
 
-	PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
 
 
 	if (handle->sequential_consistency)
 	if (handle->sequential_consistency)
 	{
 	{
@@ -431,36 +442,38 @@ void _starpu_unlock_post_sync_tasks(starpu_data_handle handle)
 			post_sync_tasks = handle->post_sync_tasks;
 			post_sync_tasks = handle->post_sync_tasks;
 			handle->post_sync_tasks = NULL;
 			handle->post_sync_tasks = NULL;
 		}
 		}
-
 	}
 	}
 
 
-	PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
 
 
 	if (do_submit_tasks)
 	if (do_submit_tasks)
 	{
 	{
-		struct starpu_task_wrapper_list *link = post_sync_tasks;
+		struct _starpu_task_wrapper_list *link = post_sync_tasks;
 
 
-		while (link) {
+		while (link)
+		{
 			/* There is no need to depend on that task now, since it was already unlocked */
 			/* There is no need to depend on that task now, since it was already unlocked */
 			_starpu_release_data_enforce_sequential_consistency(link->task, handle);
 			_starpu_release_data_enforce_sequential_consistency(link->task, handle);
 
 
-			int ret = _starpu_task_submit_internal(link->task);
+			int ret = starpu_task_submit_internal(link->task);
 			STARPU_ASSERT(!ret);
 			STARPU_ASSERT(!ret);
+			struct _starpu_task_wrapper_list *tmp = link;
 			link = link->next;
 			link = link->next;
+			free(tmp);
 		}
 		}
 	}
 	}
 }
 }
 
 
 /* If sequential consistency mode is enabled, this function blocks until the
 /* If sequential consistency mode is enabled, this function blocks until the
  * handle is available in the requested access mode. */
  * handle is available in the requested access mode. */
-int _starpu_data_wait_until_available(starpu_data_handle handle, starpu_access_mode mode)
+int _starpu_data_wait_until_available(starpu_data_handle_t handle, enum starpu_access_mode mode)
 {
 {
 	/* If sequential consistency is enabled, wait until data is available */
 	/* If sequential consistency is enabled, wait until data is available */
-	PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&handle->sequential_consistency_mutex);
 	int sequential_consistency = handle->sequential_consistency;
 	int sequential_consistency = handle->sequential_consistency;
 	if (sequential_consistency)
 	if (sequential_consistency)
 	{
 	{
-		struct starpu_task *sync_task;
+		struct starpu_task *sync_task, *new_task;
 		sync_task = starpu_task_create();
 		sync_task = starpu_task_create();
 		sync_task->detach = 0;
 		sync_task->detach = 0;
 		sync_task->destroy = 1;
 		sync_task->destroy = 1;
@@ -470,16 +483,22 @@ int _starpu_data_wait_until_available(starpu_data_handle handle, starpu_access_m
 
 
 		/* It is not really a RW access, but we want to make sure that
 		/* It is not really a RW access, but we want to make sure that
 		 * all previous accesses are done */
 		 * all previous accesses are done */
-		_starpu_detect_implicit_data_deps_with_handle(sync_task, sync_task, handle, mode);
-		PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
+		new_task = _starpu_detect_implicit_data_deps_with_handle(sync_task, sync_task, handle, mode);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
+
+		if (new_task) {
+			int ret = starpu_task_submit_internal(new_task);
+			STARPU_ASSERT(!ret);
+		}
 
 
 		/* TODO detect if this is superflous */
 		/* TODO detect if this is superflous */
-		int ret = _starpu_task_submit_internal(sync_task);
+		int ret = starpu_task_submit_internal(sync_task);
 		STARPU_ASSERT(!ret);
 		STARPU_ASSERT(!ret);
 		starpu_task_wait(sync_task);
 		starpu_task_wait(sync_task);
 	}
 	}
-	else {
-		PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
+	else
+	{
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
 	}
 	}
 
 
 	return 0;
 	return 0;

+ 8 - 8
src/core/dependencies/implicit_data_deps.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -21,16 +21,16 @@
 #include <starpu.h>
 #include <starpu.h>
 #include <common/config.h>
 #include <common/config.h>
 
 
-void _starpu_detect_implicit_data_deps_with_handle(struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task,
-						starpu_data_handle handle, starpu_access_mode mode);
+struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task,
+						   starpu_data_handle_t handle, enum starpu_access_mode mode);
 void _starpu_detect_implicit_data_deps(struct starpu_task *task);
 void _starpu_detect_implicit_data_deps(struct starpu_task *task);
-void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *task, starpu_data_handle handle);
+void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *task, starpu_data_handle_t handle);
 
 
-void _starpu_add_post_sync_tasks(struct starpu_task *post_sync_task, starpu_data_handle handle);
-void _starpu_unlock_post_sync_tasks(starpu_data_handle handle);
+void _starpu_add_post_sync_tasks(struct starpu_task *post_sync_task, starpu_data_handle_t handle);
+void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle);
 
 
 /* This function blocks until the handle is available in the requested mode */
 /* This function blocks until the handle is available in the requested mode */
-int _starpu_data_wait_until_available(starpu_data_handle handle, starpu_access_mode mode);
+int _starpu_data_wait_until_available(starpu_data_handle_t handle, enum starpu_access_mode mode);
 
 
 #endif // __IMPLICIT_DATA_DEPS_H__
 #endif // __IMPLICIT_DATA_DEPS_H__
 
 

+ 95 - 76
src/core/dependencies/tags.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -25,12 +25,12 @@
 #include <core/dependencies/data_concurrency.h>
 #include <core/dependencies/data_concurrency.h>
 #include <profiling/bound.h>
 #include <profiling/bound.h>
 
 
-static starpu_htbl_node_t *tag_htbl = NULL;
+static struct _starpu_htbl_node *tag_htbl = NULL;
 static pthread_rwlock_t tag_global_rwlock = PTHREAD_RWLOCK_INITIALIZER;
 static pthread_rwlock_t tag_global_rwlock = PTHREAD_RWLOCK_INITIALIZER;
 
 
-static starpu_cg_t *create_cg_apps(unsigned ntags)
+static struct _starpu_cg *create_cg_apps(unsigned ntags)
 {
 {
-	starpu_cg_t *cg = (starpu_cg_t *) malloc(sizeof(starpu_cg_t));
+	struct _starpu_cg *cg = (struct _starpu_cg *) malloc(sizeof(struct _starpu_cg));
 	STARPU_ASSERT(cg);
 	STARPU_ASSERT(cg);
 
 
 	cg->ntags = ntags;
 	cg->ntags = ntags;
@@ -38,16 +38,15 @@ static starpu_cg_t *create_cg_apps(unsigned ntags)
 	cg->cg_type = STARPU_CG_APPS;
 	cg->cg_type = STARPU_CG_APPS;
 
 
 	cg->succ.succ_apps.completed = 0;
 	cg->succ.succ_apps.completed = 0;
-	PTHREAD_MUTEX_INIT(&cg->succ.succ_apps.cg_mutex, NULL);
-	PTHREAD_COND_INIT(&cg->succ.succ_apps.cg_cond, NULL);
+	_STARPU_PTHREAD_MUTEX_INIT(&cg->succ.succ_apps.cg_mutex, NULL);
+	_STARPU_PTHREAD_COND_INIT(&cg->succ.succ_apps.cg_cond, NULL);
 
 
 	return cg;
 	return cg;
 }
 }
 
 
-
-static starpu_cg_t *create_cg_tag(unsigned ntags, struct starpu_tag_s *tag)
+static struct _starpu_cg *create_cg_tag(unsigned ntags, struct _starpu_tag *tag)
 {
 {
-	starpu_cg_t *cg = (starpu_cg_t *) malloc(sizeof(starpu_cg_t));
+	struct _starpu_cg *cg = (struct _starpu_cg *) malloc(sizeof(struct _starpu_cg));
 	STARPU_ASSERT(cg);
 	STARPU_ASSERT(cg);
 
 
 	cg->ntags = ntags;
 	cg->ntags = ntags;
@@ -60,10 +59,10 @@ static starpu_cg_t *create_cg_tag(unsigned ntags, struct starpu_tag_s *tag)
 	return cg;
 	return cg;
 }
 }
 
 
-static struct starpu_tag_s *_starpu_tag_init(starpu_tag_t id)
+static struct _starpu_tag *_starpu_tag_init(starpu_tag_t id)
 {
 {
-	struct starpu_tag_s *tag;
-	tag = (struct starpu_tag_s *) malloc(sizeof(struct starpu_tag_s));
+	struct _starpu_tag *tag;
+	tag = (struct _starpu_tag *) malloc(sizeof(struct _starpu_tag));
 	STARPU_ASSERT(tag);
 	STARPU_ASSERT(tag);
 
 
 	tag->job = NULL;
 	tag->job = NULL;
@@ -80,15 +79,9 @@ static struct starpu_tag_s *_starpu_tag_init(starpu_tag_t id)
 	return tag;
 	return tag;
 }
 }
 
 
-void starpu_tag_remove(starpu_tag_t id)
+static void _starpu_tag_free(void *_tag)
 {
 {
-	struct starpu_tag_s *tag;
-
-	pthread_rwlock_wrlock(&tag_global_rwlock);
-
-	tag = (struct starpu_tag_s *) _starpu_htbl_remove_tag(tag_htbl, id);
-
-	pthread_rwlock_unlock(&tag_global_rwlock);
+	struct _starpu_tag *tag = (struct _starpu_tag *) _tag;
 
 
 	if (tag) {
 	if (tag) {
 		_starpu_spin_lock(&tag->lock);
 		_starpu_spin_lock(&tag->lock);
@@ -98,7 +91,7 @@ void starpu_tag_remove(starpu_tag_t id)
 
 
 		for (succ = 0; succ < nsuccs; succ++)
 		for (succ = 0; succ < nsuccs; succ++)
 		{
 		{
-			struct starpu_cg_s *cg = tag->tag_successors.succ[succ];
+			struct _starpu_cg *cg = tag->tag_successors.succ[succ];
 
 
 			unsigned ntags = STARPU_ATOMIC_ADD(&cg->ntags, -1);
 			unsigned ntags = STARPU_ATOMIC_ADD(&cg->ntags, -1);
 			unsigned remaining __attribute__ ((unused)) = STARPU_ATOMIC_ADD(&cg->remaining, -1);
 			unsigned remaining __attribute__ ((unused)) = STARPU_ATOMIC_ADD(&cg->remaining, -1);
@@ -113,20 +106,43 @@ void starpu_tag_remove(starpu_tag_t id)
 #endif
 #endif
 
 
 		_starpu_spin_unlock(&tag->lock);
 		_starpu_spin_unlock(&tag->lock);
+
+		free(tag);
 	}
 	}
+}
+
+void starpu_tag_remove(starpu_tag_t id)
+{
+	struct _starpu_tag *tag;
+
+	_STARPU_PTHREAD_RWLOCK_WRLOCK(&tag_global_rwlock);
+
+	tag = (struct _starpu_tag *) _starpu_htbl_remove_tag(&tag_htbl, id);
 
 
-	free(tag);
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&tag_global_rwlock);
+
+	_starpu_tag_free(tag);
+}
+
+void _starpu_tag_clear(void)
+{
+	_STARPU_PTHREAD_RWLOCK_WRLOCK(&tag_global_rwlock);
+
+	_starpu_htbl_clear_tags(&tag_htbl, 0, _starpu_tag_free);
+
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&tag_global_rwlock);
 }
 }
 
 
-static struct starpu_tag_s *gettag_struct(starpu_tag_t id)
+static struct _starpu_tag *gettag_struct(starpu_tag_t id)
 {
 {
-	pthread_rwlock_wrlock(&tag_global_rwlock);
+	_STARPU_PTHREAD_RWLOCK_WRLOCK(&tag_global_rwlock);
 
 
 	/* search if the tag is already declared or not */
 	/* search if the tag is already declared or not */
-	struct starpu_tag_s *tag;
-	tag = (struct starpu_tag_s *) _starpu_htbl_search_tag(tag_htbl, id);
+	struct _starpu_tag *tag;
+	tag = (struct _starpu_tag *) _starpu_htbl_search_tag(tag_htbl, id);
 
 
-	if (tag == NULL) {
+	if (tag == NULL)
+	{
 		/* the tag does not exist yet : create an entry */
 		/* the tag does not exist yet : create an entry */
 		tag = _starpu_tag_init(id);
 		tag = _starpu_tag_init(id);
 
 
@@ -136,18 +152,18 @@ static struct starpu_tag_s *gettag_struct(starpu_tag_t id)
 		STARPU_ASSERT(old == NULL);
 		STARPU_ASSERT(old == NULL);
 	}
 	}
 
 
-	pthread_rwlock_unlock(&tag_global_rwlock);
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&tag_global_rwlock);
 
 
 	return tag;
 	return tag;
 }
 }
 
 
 /* lock should be taken */
 /* lock should be taken */
-void _starpu_tag_set_ready(struct starpu_tag_s *tag)
+void _starpu_tag_set_ready(struct _starpu_tag *tag)
 {
 {
 	/* mark this tag as ready to run */
 	/* mark this tag as ready to run */
 	tag->state = STARPU_READY;
 	tag->state = STARPU_READY;
 	/* declare it to the scheduler ! */
 	/* declare it to the scheduler ! */
-	struct starpu_job_s *j = tag->job;
+	struct _starpu_job *j = tag->job;
 
 
 	/* In case the task job is going to be scheduled immediately, and if
 	/* In case the task job is going to be scheduled immediately, and if
 	 * the task is "empty", calling _starpu_push_task would directly try to enforce
 	 * the task is "empty", calling _starpu_push_task would directly try to enforce
@@ -155,35 +171,37 @@ void _starpu_tag_set_ready(struct starpu_tag_s *tag)
 	 * lock again, resulting in a deadlock. */
 	 * lock again, resulting in a deadlock. */
 	_starpu_spin_unlock(&tag->lock);
 	_starpu_spin_unlock(&tag->lock);
 
 
-	PTHREAD_MUTEX_LOCK(&j->sync_mutex);
-
 	/* enforce data dependencies */
 	/* enforce data dependencies */
-	_starpu_enforce_deps_starting_from_task(j, 1);
-
-	PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
+	_starpu_enforce_deps_starting_from_task(j);
 
 
 	_starpu_spin_lock(&tag->lock);
 	_starpu_spin_lock(&tag->lock);
 }
 }
 
 
 /* the lock must be taken ! */
 /* the lock must be taken ! */
-static void _starpu_tag_add_succ(struct starpu_tag_s *tag, starpu_cg_t *cg)
+static void _starpu_tag_add_succ(struct _starpu_tag *tag, struct _starpu_cg *cg)
 {
 {
 	STARPU_ASSERT(tag);
 	STARPU_ASSERT(tag);
 
 
 	_starpu_add_successor_to_cg_list(&tag->tag_successors, cg);
 	_starpu_add_successor_to_cg_list(&tag->tag_successors, cg);
 
 
-	if (tag->state == STARPU_DONE) {
+	if (tag->state == STARPU_DONE)
+	{
 		/* the tag was already completed sooner */
 		/* the tag was already completed sooner */
 		_starpu_notify_cg(cg);
 		_starpu_notify_cg(cg);
 	}
 	}
 }
 }
 
 
-void _starpu_notify_tag_dependencies(struct starpu_tag_s *tag)
+void _starpu_notify_tag_dependencies(struct _starpu_tag *tag)
 {
 {
 	_starpu_spin_lock(&tag->lock);
 	_starpu_spin_lock(&tag->lock);
 
 
+	if (tag->state == STARPU_DONE) {
+		_starpu_spin_unlock(&tag->lock);
+		return;
+	}
+
 	tag->state = STARPU_DONE;
 	tag->state = STARPU_DONE;
-	STARPU_TRACE_TAG_DONE(tag);
+	_STARPU_TRACE_TAG_DONE(tag);
 
 
 	_starpu_notify_cg_list(&tag->tag_successors);
 	_starpu_notify_cg_list(&tag->tag_successors);
 
 
@@ -192,20 +210,20 @@ void _starpu_notify_tag_dependencies(struct starpu_tag_s *tag)
 
 
 void starpu_tag_notify_from_apps(starpu_tag_t id)
 void starpu_tag_notify_from_apps(starpu_tag_t id)
 {
 {
-	struct starpu_tag_s *tag = gettag_struct(id);
+	struct _starpu_tag *tag = gettag_struct(id);
 
 
 	_starpu_notify_tag_dependencies(tag);
 	_starpu_notify_tag_dependencies(tag);
 }
 }
 
 
-void _starpu_tag_declare(starpu_tag_t id, struct starpu_job_s *job)
+void _starpu_tag_declare(starpu_tag_t id, struct _starpu_job *job)
 {
 {
-	STARPU_TRACE_TAG(id, job);
+	_STARPU_TRACE_TAG(id, job);
 	job->task->use_tag = 1;
 	job->task->use_tag = 1;
-	
-	struct starpu_tag_s *tag= gettag_struct(id);
+
+	struct _starpu_tag *tag= gettag_struct(id);
 	tag->job = job;
 	tag->job = job;
 	tag->is_assigned = 1;
 	tag->is_assigned = 1;
-	
+
 	job->tag = tag;
 	job->tag = tag;
 
 
 	/* the tag is now associated to a job */
 	/* the tag is now associated to a job */
@@ -219,65 +237,65 @@ void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t
 	unsigned i;
 	unsigned i;
 
 
 	/* create the associated completion group */
 	/* create the associated completion group */
-	struct starpu_tag_s *tag_child = gettag_struct(id);
+	struct _starpu_tag *tag_child = gettag_struct(id);
 
 
 	_starpu_spin_lock(&tag_child->lock);
 	_starpu_spin_lock(&tag_child->lock);
-
-	starpu_cg_t *cg = create_cg_tag(ndeps, tag_child);
+	struct _starpu_cg *cg = create_cg_tag(ndeps, tag_child);
+	_starpu_spin_unlock(&tag_child->lock);
 
 
 	STARPU_ASSERT(ndeps != 0);
 	STARPU_ASSERT(ndeps != 0);
-	
+
 	for (i = 0; i < ndeps; i++)
 	for (i = 0; i < ndeps; i++)
 	{
 	{
 		starpu_tag_t dep_id = array[i];
 		starpu_tag_t dep_id = array[i];
-		
+
 		/* id depends on dep_id
 		/* id depends on dep_id
 		 * so cg should be among dep_id's successors*/
 		 * so cg should be among dep_id's successors*/
-		STARPU_TRACE_TAG_DEPS(id, dep_id);
+		_STARPU_TRACE_TAG_DEPS(id, dep_id);
 		_starpu_bound_tag_dep(id, dep_id);
 		_starpu_bound_tag_dep(id, dep_id);
-		struct starpu_tag_s *tag_dep = gettag_struct(dep_id);
+		struct _starpu_tag *tag_dep = gettag_struct(dep_id);
 		STARPU_ASSERT(tag_dep != tag_child);
 		STARPU_ASSERT(tag_dep != tag_child);
 		_starpu_spin_lock(&tag_dep->lock);
 		_starpu_spin_lock(&tag_dep->lock);
+		_starpu_spin_lock(&tag_child->lock);
 		_starpu_tag_add_succ(tag_dep, cg);
 		_starpu_tag_add_succ(tag_dep, cg);
+		_starpu_spin_unlock(&tag_child->lock);
 		_starpu_spin_unlock(&tag_dep->lock);
 		_starpu_spin_unlock(&tag_dep->lock);
 	}
 	}
-
-	_starpu_spin_unlock(&tag_child->lock);
 }
 }
 
 
 void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...)
 void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...)
 {
 {
 	unsigned i;
 	unsigned i;
-	
+
 	/* create the associated completion group */
 	/* create the associated completion group */
-	struct starpu_tag_s *tag_child = gettag_struct(id);
+	struct _starpu_tag *tag_child = gettag_struct(id);
 
 
 	_starpu_spin_lock(&tag_child->lock);
 	_starpu_spin_lock(&tag_child->lock);
-
-	starpu_cg_t *cg = create_cg_tag(ndeps, tag_child);
+	struct _starpu_cg *cg = create_cg_tag(ndeps, tag_child);
+	_starpu_spin_unlock(&tag_child->lock);
 
 
 	STARPU_ASSERT(ndeps != 0);
 	STARPU_ASSERT(ndeps != 0);
-	
+
 	va_list pa;
 	va_list pa;
 	va_start(pa, ndeps);
 	va_start(pa, ndeps);
 	for (i = 0; i < ndeps; i++)
 	for (i = 0; i < ndeps; i++)
 	{
 	{
 		starpu_tag_t dep_id;
 		starpu_tag_t dep_id;
 		dep_id = va_arg(pa, starpu_tag_t);
 		dep_id = va_arg(pa, starpu_tag_t);
-	
+
 		/* id depends on dep_id
 		/* id depends on dep_id
 		 * so cg should be among dep_id's successors*/
 		 * so cg should be among dep_id's successors*/
-		STARPU_TRACE_TAG_DEPS(id, dep_id);
+		_STARPU_TRACE_TAG_DEPS(id, dep_id);
 		_starpu_bound_tag_dep(id, dep_id);
 		_starpu_bound_tag_dep(id, dep_id);
-		struct starpu_tag_s *tag_dep = gettag_struct(dep_id);
+		struct _starpu_tag *tag_dep = gettag_struct(dep_id);
 		STARPU_ASSERT(tag_dep != tag_child);
 		STARPU_ASSERT(tag_dep != tag_child);
 		_starpu_spin_lock(&tag_dep->lock);
 		_starpu_spin_lock(&tag_dep->lock);
+		_starpu_spin_lock(&tag_child->lock);
 		_starpu_tag_add_succ(tag_dep, cg);
 		_starpu_tag_add_succ(tag_dep, cg);
+		_starpu_spin_unlock(&tag_child->lock);
 		_starpu_spin_unlock(&tag_dep->lock);
 		_starpu_spin_unlock(&tag_dep->lock);
 	}
 	}
 	va_end(pa);
 	va_end(pa);
-
-	_starpu_spin_unlock(&tag_child->lock);
 }
 }
 
 
 /* this function may be called by the application (outside callbacks !) */
 /* this function may be called by the application (outside callbacks !) */
@@ -286,12 +304,13 @@ int starpu_tag_wait_array(unsigned ntags, starpu_tag_t *id)
 	unsigned i;
 	unsigned i;
 	unsigned current;
 	unsigned current;
 
 
-	struct starpu_tag_s *tag_array[ntags];
+	struct _starpu_tag *tag_array[ntags];
 
 
 	_STARPU_LOG_IN();
 	_STARPU_LOG_IN();
 
 
 	/* It is forbidden to block within callbacks or codelets */
 	/* It is forbidden to block within callbacks or codelets */
-	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls())) {
+	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
+	{
 		_STARPU_LOG_OUT_TAG("edeadlk");
 		_STARPU_LOG_OUT_TAG("edeadlk");
 		return -EDEADLK;
 		return -EDEADLK;
 	}
 	}
@@ -299,8 +318,8 @@ int starpu_tag_wait_array(unsigned ntags, starpu_tag_t *id)
 	/* only wait the tags that are not done yet */
 	/* only wait the tags that are not done yet */
 	for (i = 0, current = 0; i < ntags; i++)
 	for (i = 0, current = 0; i < ntags; i++)
 	{
 	{
-		struct starpu_tag_s *tag = gettag_struct(id[i]);
-		
+		struct _starpu_tag *tag = gettag_struct(id[i]);
+
 		_starpu_spin_lock(&tag->lock);
 		_starpu_spin_lock(&tag->lock);
 
 
 		if (tag->state == STARPU_DONE)
 		if (tag->state == STARPU_DONE)
@@ -321,9 +340,9 @@ int starpu_tag_wait_array(unsigned ntags, starpu_tag_t *id)
 		_STARPU_LOG_OUT_TAG("all deps are already fulfilled");
 		_STARPU_LOG_OUT_TAG("all deps are already fulfilled");
 		return 0;
 		return 0;
 	}
 	}
-	
+
 	/* there is at least one task that is not finished */
 	/* there is at least one task that is not finished */
-	starpu_cg_t *cg = create_cg_apps(current);
+	struct _starpu_cg *cg = create_cg_apps(current);
 
 
 	for (i = 0; i < current; i++)
 	for (i = 0; i < current; i++)
 	{
 	{
@@ -331,15 +350,15 @@ int starpu_tag_wait_array(unsigned ntags, starpu_tag_t *id)
 		_starpu_spin_unlock(&tag_array[i]->lock);
 		_starpu_spin_unlock(&tag_array[i]->lock);
 	}
 	}
 
 
-	PTHREAD_MUTEX_LOCK(&cg->succ.succ_apps.cg_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&cg->succ.succ_apps.cg_mutex);
 
 
 	while (!cg->succ.succ_apps.completed)
 	while (!cg->succ.succ_apps.completed)
-		PTHREAD_COND_WAIT(&cg->succ.succ_apps.cg_cond, &cg->succ.succ_apps.cg_mutex);
+		_STARPU_PTHREAD_COND_WAIT(&cg->succ.succ_apps.cg_cond, &cg->succ.succ_apps.cg_mutex);
 
 
-	PTHREAD_MUTEX_UNLOCK(&cg->succ.succ_apps.cg_mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&cg->succ.succ_apps.cg_mutex);
 
 
-	PTHREAD_MUTEX_DESTROY(&cg->succ.succ_apps.cg_mutex);
-	PTHREAD_COND_DESTROY(&cg->succ.succ_apps.cg_cond);
+	_STARPU_PTHREAD_MUTEX_DESTROY(&cg->succ.succ_apps.cg_mutex);
+	_STARPU_PTHREAD_COND_DESTROY(&cg->succ.succ_apps.cg_cond);
 
 
 	free(cg);
 	free(cg);
 
 

+ 21 - 17
src/core/dependencies/tags.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -23,9 +23,10 @@
 #include <common/starpu_spinlock.h>
 #include <common/starpu_spinlock.h>
 #include <core/dependencies/cg.h>
 #include <core/dependencies/cg.h>
 
 
-#define STARPU_TAG_SIZE        (sizeof(starpu_tag_t)*8)
+#define _STARPU_TAG_SIZE        (sizeof(starpu_tag_t)*8)
 
 
-typedef enum {
+enum _starpu_tag_state
+{
 	/* this tag is not declared by any task */
 	/* this tag is not declared by any task */
 	STARPU_INVALID_STATE,
 	STARPU_INVALID_STATE,
 	/* _starpu_tag_declare was called to associate the tag to a task */
 	/* _starpu_tag_declare was called to associate the tag to a task */
@@ -40,31 +41,34 @@ typedef enum {
 //	STARPU_SCHEDULED,
 //	STARPU_SCHEDULED,
 	/* the task has been performed */
 	/* the task has been performed */
 	STARPU_DONE
 	STARPU_DONE
-} starpu_tag_state;
+};
 
 
-struct starpu_job_s;
+struct _starpu_job;
 
 
-struct starpu_tag_s {
-	starpu_spinlock_t lock;
+struct _starpu_tag
+{
+	/* Lock for this structure. Locking order is in dependency order: a tag
+	 * must not be locked before locking a tag it depends on */
+	struct _starpu_spinlock lock;
 	starpu_tag_t id; /* an identifier for the task */
 	starpu_tag_t id; /* an identifier for the task */
-	starpu_tag_state state;
+	enum _starpu_tag_state state;
 
 
-	struct starpu_cg_list_s tag_successors;
+	struct _starpu_cg_list tag_successors;
 
 
-	struct starpu_job_s *job; /* which job is associated to the tag if any ? */
+	struct _starpu_job *job; /* which job is associated to the tag if any ? */
 
 
 	unsigned is_assigned;
 	unsigned is_assigned;
 	unsigned is_submitted;
 	unsigned is_submitted;
 };
 };
 
 
-void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...);
+void _starpu_notify_dependencies(struct _starpu_job *j);
+void _starpu_notify_tag_dependencies(struct _starpu_tag *tag);
 
 
-void _starpu_notify_dependencies(struct starpu_job_s *j);
-void _starpu_notify_tag_dependencies(struct starpu_tag_s *tag);
+void _starpu_tag_declare(starpu_tag_t id, struct _starpu_job *job);
+void _starpu_tag_set_ready(struct _starpu_tag *tag);
 
 
-void _starpu_tag_declare(starpu_tag_t id, struct starpu_job_s *job);
-void _starpu_tag_set_ready(struct starpu_tag_s *tag);
+unsigned _starpu_submit_job_enforce_task_deps(struct _starpu_job *j);
 
 
-unsigned _starpu_submit_job_enforce_task_deps(struct starpu_job_s *j);
+void _starpu_tag_clear(void);
 
 
 #endif // __TAGS_H__
 #endif // __TAGS_H__

+ 26 - 22
src/core/dependencies/task_deps.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -26,9 +26,9 @@
 #include <core/dependencies/data_concurrency.h>
 #include <core/dependencies/data_concurrency.h>
 #include <profiling/bound.h>
 #include <profiling/bound.h>
 
 
-static starpu_cg_t *create_cg_task(unsigned ntags, starpu_job_t j)
+static struct _starpu_cg *create_cg_task(unsigned ntags, struct _starpu_job *j)
 {
 {
-	starpu_cg_t *cg = (starpu_cg_t *) malloc(sizeof(starpu_cg_t));
+	struct _starpu_cg *cg = (struct _starpu_cg *) malloc(sizeof(struct _starpu_cg));
 	STARPU_ASSERT(cg);
 	STARPU_ASSERT(cg);
 
 
 	cg->ntags = ntags;
 	cg->ntags = ntags;
@@ -41,55 +41,59 @@ static starpu_cg_t *create_cg_task(unsigned ntags, starpu_job_t j)
 	return cg;
 	return cg;
 }
 }
 
 
-/* the job lock must be taken */
-static void _starpu_task_add_succ(starpu_job_t j, starpu_cg_t *cg)
+static void _starpu_task_add_succ(struct _starpu_job *j, struct _starpu_cg *cg)
 {
 {
 	STARPU_ASSERT(j);
 	STARPU_ASSERT(j);
 
 
-	_starpu_add_successor_to_cg_list(&j->job_successors, cg);
-
-	if (j->terminated) {
+	if (_starpu_add_successor_to_cg_list(&j->job_successors, cg))
 		/* the task was already completed sooner */
 		/* the task was already completed sooner */
 		_starpu_notify_cg(cg);
 		_starpu_notify_cg(cg);
-	}
 }
 }
 
 
-void _starpu_notify_task_dependencies(starpu_job_t j)
+void _starpu_notify_task_dependencies(struct _starpu_job *j)
 {
 {
 	_starpu_notify_cg_list(&j->job_successors);
 	_starpu_notify_cg_list(&j->job_successors);
 }
 }
 
 
 /* task depends on the tasks in task array */
 /* task depends on the tasks in task array */
-void starpu_task_declare_deps_array(struct starpu_task *task, unsigned ndeps, struct starpu_task *task_array[])
+void _starpu_task_declare_deps_array(struct starpu_task *task, unsigned ndeps, struct starpu_task *task_array[], int check)
 {
 {
 	if (ndeps == 0)
 	if (ndeps == 0)
 		return;
 		return;
 
 
-	starpu_job_t job;
+	struct _starpu_job *job;
 
 
 	job = _starpu_get_job_associated_to_task(task);
 	job = _starpu_get_job_associated_to_task(task);
 
 
-	PTHREAD_MUTEX_LOCK(&job->sync_mutex);
+	if (check)
+		STARPU_ASSERT_MSG(!job->submitted || !task->destroy || task->detach, "Task dependencies have to be set before submission");
+	else
+		STARPU_ASSERT_MSG(job->terminated <= 1, "Task dependencies have to be set before termination");
 
 
-	starpu_cg_t *cg = create_cg_task(ndeps, job);
+	struct _starpu_cg *cg = create_cg_task(ndeps, job);
 
 
 	unsigned i;
 	unsigned i;
 	for (i = 0; i < ndeps; i++)
 	for (i = 0; i < ndeps; i++)
 	{
 	{
 		struct starpu_task *dep_task = task_array[i];
 		struct starpu_task *dep_task = task_array[i];
 
 
-		starpu_job_t dep_job;
+		struct _starpu_job *dep_job;
 		dep_job = _starpu_get_job_associated_to_task(dep_task);
 		dep_job = _starpu_get_job_associated_to_task(dep_task);
-		STARPU_ASSERT(dep_job != job);
 
 
-		STARPU_TRACE_TASK_DEPS(dep_job, job);
+		STARPU_ASSERT_MSG(dep_job != job, "A task must not depend on itself.");
+		if (check)
+			STARPU_ASSERT_MSG(!dep_job->submitted || !dep_job->task->destroy || dep_job->task->detach, "Task dependencies have to be set before submission");
+		else
+			STARPU_ASSERT_MSG(dep_job->terminated <= 1, "Task dependencies have to be set before termination");
+
+		_STARPU_TRACE_TASK_DEPS(dep_job, job);
 		_starpu_bound_task_dep(job, dep_job);
 		_starpu_bound_task_dep(job, dep_job);
 
 
-		PTHREAD_MUTEX_LOCK(&dep_job->sync_mutex);
 		_starpu_task_add_succ(dep_job, cg);
 		_starpu_task_add_succ(dep_job, cg);
-		PTHREAD_MUTEX_UNLOCK(&dep_job->sync_mutex);
 	}
 	}
+}
 
 
-	
-	PTHREAD_MUTEX_UNLOCK(&job->sync_mutex);
+void starpu_task_declare_deps_array(struct starpu_task *task, unsigned ndeps, struct starpu_task *task_array[])
+{
+	_starpu_task_declare_deps_array(task, ndeps, task_array, 1);
 }
 }

+ 6 - 6
src/core/errorcheck.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -18,9 +18,9 @@
 #include <core/errorcheck.h>
 #include <core/errorcheck.h>
 #include <core/workers.h>
 #include <core/workers.h>
 
 
-void _starpu_set_local_worker_status(starpu_worker_status st)
+void _starpu_set_local_worker_status(enum _starpu_worker_status st)
 {
 {
-	struct starpu_worker_s *worker = _starpu_get_local_worker_key();
+	struct _starpu_worker *worker = _starpu_get_local_worker_key();
 
 
 	/* It is possible that we call this function from the application (and
 	/* It is possible that we call this function from the application (and
 	 * thereforce outside a worker), for instance if we are executing the
 	 * thereforce outside a worker), for instance if we are executing the
@@ -29,9 +29,9 @@ void _starpu_set_local_worker_status(starpu_worker_status st)
 		worker->status = st;
 		worker->status = st;
 }
 }
 
 
-starpu_worker_status _starpu_get_local_worker_status(void)
+enum _starpu_worker_status _starpu_get_local_worker_status(void)
 {
 {
-	struct starpu_worker_s *worker = _starpu_get_local_worker_key();
+	struct _starpu_worker *worker = _starpu_get_local_worker_key();
 	if (STARPU_UNLIKELY(!worker))
 	if (STARPU_UNLIKELY(!worker))
 		return STATUS_INVALID;
 		return STATUS_INVALID;
 
 
@@ -42,7 +42,7 @@ starpu_worker_status _starpu_get_local_worker_status(void)
  * execution of a task. */
  * execution of a task. */
 unsigned _starpu_worker_may_perform_blocking_calls(void)
 unsigned _starpu_worker_may_perform_blocking_calls(void)
 {
 {
-	starpu_worker_status st = _starpu_get_local_worker_status();
+	enum _starpu_worker_status st = _starpu_get_local_worker_status();
 
 
 	return ( !(st == STATUS_CALLBACK) && !(st == STATUS_EXECUTING));
 	return ( !(st == STATUS_CALLBACK) && !(st == STATUS_EXECUTING));
 }
 }

+ 7 - 11
src/core/errorcheck.h

@@ -1,8 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
- * Copyright (C) 2011  INRIA
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -22,7 +21,8 @@
 #include <starpu.h>
 #include <starpu.h>
 
 
 /* This type describes in which state a worker may be. */
 /* This type describes in which state a worker may be. */
-typedef enum {
+enum _starpu_worker_status
+{
 	/* invalid status (for instance if we request the status of some thread
 	/* invalid status (for instance if we request the status of some thread
 	 * that is not controlled by StarPU */
 	 * that is not controlled by StarPU */
 	STATUS_INVALID,
 	STATUS_INVALID,
@@ -35,20 +35,16 @@ typedef enum {
 	/* during the execution of the callback */
 	/* during the execution of the callback */
 	STATUS_CALLBACK,
 	STATUS_CALLBACK,
 	/* while sleeping because there is nothing to do */
 	/* while sleeping because there is nothing to do */
-	STATUS_SLEEPING,
-	/* changing ctx because a new one was create */
-	STATUS_CHANGING_CTX,
-	/* after having done join */
-	STATUS_JOINED
-} starpu_worker_status;
+	STATUS_SLEEPING
+};
 
 
 /* Specify what the local worker is currently doing (eg. executing a callback).
 /* Specify what the local worker is currently doing (eg. executing a callback).
  * This permits to detect if this is legal to do a blocking call for instance.
  * This permits to detect if this is legal to do a blocking call for instance.
  * */
  * */
-void _starpu_set_local_worker_status(starpu_worker_status st);
+void _starpu_set_local_worker_status(enum _starpu_worker_status st);
 
 
 /* Indicate what type of operation the worker is currently doing. */
 /* Indicate what type of operation the worker is currently doing. */
-starpu_worker_status _starpu_get_local_worker_status(void);
+enum _starpu_worker_status _starpu_get_local_worker_status(void);
 
 
 /* It is forbidden to do blocking calls during some operations such as callback
 /* It is forbidden to do blocking calls during some operations such as callback
  * or during the execution of a task. This function indicates whether it is
  * or during the execution of a task. This function indicates whether it is

+ 122 - 111
src/core/jobs.c

@@ -1,8 +1,9 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  Télécom-SudParis
+ * Copyright (C) 2011  INRIA
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -26,23 +27,28 @@
 #include <profiling/profiling.h>
 #include <profiling/profiling.h>
 #include <profiling/bound.h>
 #include <profiling/bound.h>
 #include <starpu_top.h>
 #include <starpu_top.h>
+#include <top/starpu_top_core.h>
 
 
-size_t _starpu_job_get_data_size(starpu_job_t j)
+size_t _starpu_job_get_data_size(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, unsigned nimpl, struct _starpu_job *j)
 {
 {
-	size_t size = 0;
-
 	struct starpu_task *task = j->task;
 	struct starpu_task *task = j->task;
 
 
-	unsigned nbuffers = task->cl->nbuffers;
-
-	unsigned buffer;
-	for (buffer = 0; buffer < nbuffers; buffer++)
-	{
-		starpu_data_handle handle = task->buffers[buffer].handle;
-		size += _starpu_data_get_size(handle);
+	if (model && model->per_arch[arch][nimpl].size_base) {
+		return model->per_arch[arch][nimpl].size_base(task, arch, nimpl);
+	} else if (model && model->size_base) {
+		return model->size_base(task, nimpl);
+	} else {
+		unsigned nbuffers = task->cl->nbuffers;
+		size_t size = 0;
+
+		unsigned buffer;
+		for (buffer = 0; buffer < nbuffers; buffer++)
+		{
+			starpu_data_handle_t handle = task->handles[buffer];
+			size += _starpu_data_get_size(handle);
+		}
+		return size;
 	}
 	}
-
-	return size;
 }
 }
 
 
 /* we need to identify each task to generate the DAG. */
 /* we need to identify each task to generate the DAG. */
@@ -50,18 +56,18 @@ static unsigned job_cnt = 0;
 
 
 void _starpu_exclude_task_from_dag(struct starpu_task *task)
 void _starpu_exclude_task_from_dag(struct starpu_task *task)
 {
 {
-	starpu_job_t j = _starpu_get_job_associated_to_task(task);
+	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
 
 
 	j->exclude_from_dag = 1;
 	j->exclude_from_dag = 1;
 }
 }
 
 
-/* create an internal starpu_job_t structure to encapsulate the task */
-starpu_job_t __attribute__((malloc)) _starpu_job_create(struct starpu_task *task)
+/* create an internal struct _starpu_job structure to encapsulate the task */
+struct _starpu_job* __attribute__((malloc)) _starpu_job_create(struct starpu_task *task)
 {
 {
-	starpu_job_t job;
+	struct _starpu_job *job;
         _STARPU_LOG_IN();
         _STARPU_LOG_IN();
 
 
-	job = starpu_job_new();
+	job = _starpu_job_new();
 
 
 	job->nimpl =0; /* best implementation */
 	job->nimpl =0; /* best implementation */
 	job->task = task;
 	job->task = task;
@@ -71,7 +77,7 @@ starpu_job_t __attribute__((malloc)) _starpu_job_create(struct starpu_task *task
 	job->terminated = 0;
 	job->terminated = 0;
 
 
 #ifndef STARPU_USE_FXT
 #ifndef STARPU_USE_FXT
-	if (_starpu_bound_recording || starpu_top_status_get())
+	if (_starpu_bound_recording || _starpu_top_status_get())
 #endif
 #endif
 		job->job_id = STARPU_ATOMIC_ADD(&job_cnt, 1);
 		job->job_id = STARPU_ATOMIC_ADD(&job_cnt, 1);
 #ifdef STARPU_USE_FXT
 #ifdef STARPU_USE_FXT
@@ -84,8 +90,10 @@ starpu_job_t __attribute__((malloc)) _starpu_job_create(struct starpu_task *task
 
 
 	_starpu_cg_list_init(&job->job_successors);
 	_starpu_cg_list_init(&job->job_successors);
 
 
-	PTHREAD_MUTEX_INIT(&job->sync_mutex, NULL);
-	PTHREAD_COND_INIT(&job->sync_cond, NULL);
+	job->implicit_dep_handle = NULL;
+
+	_STARPU_PTHREAD_MUTEX_INIT(&job->sync_mutex, NULL);
+	_STARPU_PTHREAD_COND_INIT(&job->sync_cond, NULL);
 
 
 	job->bound_task = NULL;
 	job->bound_task = NULL;
 
 
@@ -99,29 +107,33 @@ starpu_job_t __attribute__((malloc)) _starpu_job_create(struct starpu_task *task
 	return job;
 	return job;
 }
 }
 
 
-void _starpu_job_destroy(starpu_job_t j)
+void _starpu_job_destroy(struct _starpu_job *j)
 {
 {
-	PTHREAD_COND_DESTROY(&j->sync_cond);
-	PTHREAD_MUTEX_DESTROY(&j->sync_mutex);
+	/* Wait for any code that was still working on the job (and was
+	 * probably our waker) */
+	_STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
+	_STARPU_PTHREAD_COND_DESTROY(&j->sync_cond);
+	_STARPU_PTHREAD_MUTEX_DESTROY(&j->sync_mutex);
 
 
 	if (j->task_size > 1)
 	if (j->task_size > 1)
 	{
 	{
-		PTHREAD_BARRIER_DESTROY(&j->before_work_barrier);
-		PTHREAD_BARRIER_DESTROY(&j->after_work_barrier);
+		_STARPU_PTHREAD_BARRIER_DESTROY(&j->before_work_barrier);
+		_STARPU_PTHREAD_BARRIER_DESTROY(&j->after_work_barrier);
 	}
 	}
 
 
 	_starpu_cg_list_deinit(&j->job_successors);
 	_starpu_cg_list_deinit(&j->job_successors);
 
 
-	starpu_job_delete(j);
+	_starpu_job_delete(j);
 }
 }
 
 
-void _starpu_wait_job(starpu_job_t j)
+void _starpu_wait_job(struct _starpu_job *j)
 {
 {
 	STARPU_ASSERT(j->task);
 	STARPU_ASSERT(j->task);
 	STARPU_ASSERT(!j->task->detach);
 	STARPU_ASSERT(!j->task->detach);
         _STARPU_LOG_IN();
         _STARPU_LOG_IN();
 
 
-	PTHREAD_MUTEX_LOCK(&j->sync_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
 
 
 	/* We wait for the flag to have a value of 2 which means that both the
 	/* We wait for the flag to have a value of 2 which means that both the
 	 * codelet's implementation and its callback have been executed. That
 	 * codelet's implementation and its callback have been executed. That
@@ -129,25 +141,20 @@ void _starpu_wait_job(starpu_job_t j)
 	 * executed (so that we cannot destroy the task while it is still being
 	 * executed (so that we cannot destroy the task while it is still being
 	 * manipulated by the driver). */
 	 * manipulated by the driver). */
 	while (j->terminated != 2)
 	while (j->terminated != 2)
-		PTHREAD_COND_WAIT(&j->sync_cond, &j->sync_mutex);
+		_STARPU_PTHREAD_COND_WAIT(&j->sync_cond, &j->sync_mutex);
 
 
-	PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
         _STARPU_LOG_OUT();
         _STARPU_LOG_OUT();
 }
 }
 
 
-void _starpu_handle_job_termination(starpu_job_t j, unsigned job_is_already_locked, int workerid)
+void _starpu_handle_job_termination(struct _starpu_job *j, int workerid)
 {
 {
 	struct starpu_task *task = j->task;
 	struct starpu_task *task = j->task;
 	unsigned sched_ctx = task->sched_ctx;
 	unsigned sched_ctx = task->sched_ctx;
-	
-	if (!job_is_already_locked)
-		PTHREAD_MUTEX_LOCK(&j->sync_mutex);
 
 
-	task->status = STARPU_TASK_FINISHED;
+	_STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
 
 
-	/* in case there are dependencies, wake up the proper tasks */
-	j->submitted = 0;
-	_starpu_notify_dependencies(j);
+	task->status = STARPU_TASK_FINISHED;
 
 
 	/* We must have set the j->terminated flag early, so that it is
 	/* We must have set the j->terminated flag early, so that it is
 	 * possible to express task dependencies within the callback
 	 * possible to express task dependencies within the callback
@@ -155,46 +162,54 @@ void _starpu_handle_job_termination(starpu_job_t j, unsigned job_is_already_lock
 	 * the callback is not done yet. */
 	 * the callback is not done yet. */
 	j->terminated = 1;
 	j->terminated = 1;
 
 
-	if (!job_is_already_locked)
-		PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
+
+	/* Task does not have a cl, but has explicit data dependencies, we need
+	 * to tell them that we will not exist any more before notifying the
+	 * tasks waiting for us */
+	if (j->implicit_dep_handle)
+		_starpu_release_data_enforce_sequential_consistency(j->task, j->implicit_dep_handle);
 
 
-	/* the callback is executed after the dependencies so that we may remove the tag 
+	/* in case there are dependencies, wake up the proper tasks */
+	_starpu_notify_dependencies(j);
+
+	/* the callback is executed after the dependencies so that we may remove the tag
  	 * of the task itself */
  	 * of the task itself */
 	if (task->callback_func)
 	if (task->callback_func)
 	{
 	{
 		int profiling = starpu_profiling_status_get();
 		int profiling = starpu_profiling_status_get();
 		if (profiling && task->profiling_info)
 		if (profiling && task->profiling_info)
-			starpu_clock_gettime(&task->profiling_info->callback_start_time);
+			_starpu_clock_gettime(&task->profiling_info->callback_start_time);
 
 
 		/* so that we can check whether we are doing blocking calls
 		/* so that we can check whether we are doing blocking calls
 		 * within the callback */
 		 * within the callback */
 		_starpu_set_local_worker_status(STATUS_CALLBACK);
 		_starpu_set_local_worker_status(STATUS_CALLBACK);
-		
-		
+
+
 		/* Perhaps we have nested callbacks (eg. with chains of empty
 		/* Perhaps we have nested callbacks (eg. with chains of empty
 		 * tasks). So we store the current task and we will restore it
 		 * tasks). So we store the current task and we will restore it
 		 * later. */
 		 * later. */
-		struct starpu_task *current_task = starpu_get_current_task();
+		struct starpu_task *current_task = starpu_task_get_current();
 
 
 		_starpu_set_current_task(task);
 		_starpu_set_current_task(task);
 
 
-		STARPU_TRACE_START_CALLBACK(j);
+		_STARPU_TRACE_START_CALLBACK(j);
 		task->callback_func(task->callback_arg);
 		task->callback_func(task->callback_arg);
-		STARPU_TRACE_END_CALLBACK(j);
-		
+		_STARPU_TRACE_END_CALLBACK(j);
+
 		_starpu_set_current_task(current_task);
 		_starpu_set_current_task(current_task);
 
 
 		_starpu_set_local_worker_status(STATUS_UNKNOWN);
 		_starpu_set_local_worker_status(STATUS_UNKNOWN);
 
 
 		if (profiling && task->profiling_info)
 		if (profiling && task->profiling_info)
-			starpu_clock_gettime(&task->profiling_info->callback_end_time);
+			_starpu_clock_gettime(&task->profiling_info->callback_end_time);
 	}
 	}
 
 
 	/* control task should not execute post_exec_hook */
 	/* control task should not execute post_exec_hook */
 	if(task->cl != NULL && !task->control_task)
 	if(task->cl != NULL && !task->control_task)
 	  _starpu_sched_post_exec_hook(task);
 	  _starpu_sched_post_exec_hook(task);
 
 
-	STARPU_TRACE_TASK_DONE(j);
+	_STARPU_TRACE_TASK_DONE(j);
 
 
 	/* NB: we do not save those values before the callback, in case the
 	/* NB: we do not save those values before the callback, in case the
 	 * application changes some parameters eventually (eg. a task may not
 	 * application changes some parameters eventually (eg. a task may not
@@ -203,27 +218,24 @@ void _starpu_handle_job_termination(starpu_job_t j, unsigned job_is_already_lock
 	int detach = task->detach;
 	int detach = task->detach;
 	int regenerate = task->regenerate;
 	int regenerate = task->regenerate;
 
 
-	if (!detach)
+	/* we do not desallocate the job structure if some is going to
+	 * wait after the task */
+	_STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
+	/* A value of 2 is put to specify that not only the codelet but
+	 * also the callback were executed. */
+	j->terminated = 2;
+	_STARPU_PTHREAD_COND_BROADCAST(&j->sync_cond);
+
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
+
+	if (detach)
 	{
 	{
-		/* we do not desallocate the job structure if some is going to
-		 * wait after the task */
-		if (!job_is_already_locked)
-			PTHREAD_MUTEX_LOCK(&j->sync_mutex);
-		/* A value of 2 is put to specify that not only the codelet but
-		 * also the callback were executed. */
-		j->terminated = 2;
-		PTHREAD_COND_BROADCAST(&j->sync_cond);
-
-		if (!job_is_already_locked)
-			PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
-	}
-	else {
 		/* no one is going to synchronize with that task so we release
 		/* no one is going to synchronize with that task so we release
 		 * the data structures now. In case the job was already locked
 		 * the data structures now. In case the job was already locked
 		 * by the caller, it is its responsability to destroy the task.
 		 * by the caller, it is its responsability to destroy the task.
 		 * */
 		 * */
-		if (!job_is_already_locked && destroy)
-			starpu_task_destroy(task);
+		if (destroy)
+			_starpu_task_destroy(task);
 	}
 	}
 
 
 	if (regenerate)
 	if (regenerate)
@@ -233,10 +245,9 @@ void _starpu_handle_job_termination(starpu_job_t j, unsigned job_is_already_lock
 		/* We reuse the same job structure */
 		/* We reuse the same job structure */
 		int ret = _starpu_submit_job(j, 1);
 		int ret = _starpu_submit_job(j, 1);
 		STARPU_ASSERT(!ret);
 		STARPU_ASSERT(!ret);
-	}	
-	else {
-		_starpu_decrement_nsubmitted_tasks();
 	}
 	}
+	_starpu_decrement_nsubmitted_tasks();
+	_starpu_decrement_nready_tasks();
 
 
 	_starpu_decrement_nsubmitted_tasks_of_sched_ctx(sched_ctx);
 	_starpu_decrement_nsubmitted_tasks_of_sched_ctx(sched_ctx);
 
 
@@ -244,9 +255,9 @@ void _starpu_handle_job_termination(starpu_job_t j, unsigned job_is_already_lock
 		_starpu_decrement_nsubmitted_tasks_of_worker(workerid);
 		_starpu_decrement_nsubmitted_tasks_of_worker(workerid);
 }
 }
 
 
-/* This function is called when a new task is submitted to StarPU 
+/* This function is called when a new task is submitted to StarPU
  * it returns 1 if the tag deps are not fulfilled, 0 otherwise */
  * it returns 1 if the tag deps are not fulfilled, 0 otherwise */
-static unsigned _starpu_not_all_tag_deps_are_fulfilled(starpu_job_t j)
+static unsigned _starpu_not_all_tag_deps_are_fulfilled(struct _starpu_job *j)
 {
 {
 	unsigned ret;
 	unsigned ret;
 
 
@@ -256,9 +267,9 @@ static unsigned _starpu_not_all_tag_deps_are_fulfilled(starpu_job_t j)
 		return 0;
 		return 0;
 	}
 	}
 
 
-	struct starpu_tag_s *tag = j->tag;
+	struct _starpu_tag *tag = j->tag;
 
 
-	struct starpu_cg_list_s *tag_successors = &tag->tag_successors;
+	struct _starpu_cg_list *tag_successors = &tag->tag_successors;
 
 
 	_starpu_spin_lock(&tag->lock);
 	_starpu_spin_lock(&tag->lock);
 
 
@@ -268,7 +279,8 @@ static unsigned _starpu_not_all_tag_deps_are_fulfilled(starpu_job_t j)
                 j->task->status = STARPU_TASK_BLOCKED_ON_TAG;
                 j->task->status = STARPU_TASK_BLOCKED_ON_TAG;
 		ret = 1;
 		ret = 1;
 	}
 	}
-	else {
+	else
+	{
 		/* existing deps (if any) are fulfilled */
 		/* existing deps (if any) are fulfilled */
 		tag->state = STARPU_READY;
 		tag->state = STARPU_READY;
 		/* already prepare for next run */
 		/* already prepare for next run */
@@ -280,97 +292,95 @@ static unsigned _starpu_not_all_tag_deps_are_fulfilled(starpu_job_t j)
 	return ret;
 	return ret;
 }
 }
 
 
-#ifdef STARPU_DEVEL
-#warning TODO remove the job_is_already_locked parameter
-#endif
-static unsigned _starpu_not_all_task_deps_are_fulfilled(starpu_job_t j, unsigned job_is_already_locked)
+static unsigned _starpu_not_all_task_deps_are_fulfilled(struct _starpu_job *j)
 {
 {
 	unsigned ret;
 	unsigned ret;
 
 
-	struct starpu_cg_list_s *job_successors = &j->job_successors;
-
-	if (!job_is_already_locked)
-		PTHREAD_MUTEX_LOCK(&j->sync_mutex);	
+	struct _starpu_cg_list *job_successors = &j->job_successors;
 
 
 	if (!j->submitted || (job_successors->ndeps != job_successors->ndeps_completed))
 	if (!j->submitted || (job_successors->ndeps != job_successors->ndeps_completed))
 	{
 	{
                 j->task->status = STARPU_TASK_BLOCKED_ON_TASK;
                 j->task->status = STARPU_TASK_BLOCKED_ON_TASK;
 		ret = 1;
 		ret = 1;
 	}
 	}
-	else {
+	else
+	{
 		/* existing deps (if any) are fulfilled */
 		/* existing deps (if any) are fulfilled */
 		/* already prepare for next run */
 		/* already prepare for next run */
 		job_successors->ndeps_completed = 0;
 		job_successors->ndeps_completed = 0;
 		ret = 0;
 		ret = 0;
 	}
 	}
 
 
-	if (!job_is_already_locked)
-		PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
-
 	return ret;
 	return ret;
 }
 }
 
 
-
-
 /*
 /*
  *	In order, we enforce tag, task and data dependencies. The task is
  *	In order, we enforce tag, task and data dependencies. The task is
  *	passed to the scheduler only once all these constraints are fulfilled.
  *	passed to the scheduler only once all these constraints are fulfilled.
+ *
+ *	The job mutex has to be taken for atomicity with task submission, and
+ *	is released here.
  */
  */
-#ifdef STARPU_DEVEL
-#warning TODO remove the job_is_already_locked parameter
-#endif
-unsigned _starpu_enforce_deps_and_schedule(starpu_job_t j, unsigned job_is_already_locked)
+unsigned _starpu_enforce_deps_and_schedule(struct _starpu_job *j)
 {
 {
 	unsigned ret;
 	unsigned ret;
         _STARPU_LOG_IN();
         _STARPU_LOG_IN();
 
 
 	/* enfore tag dependencies */
 	/* enfore tag dependencies */
-	if (_starpu_not_all_tag_deps_are_fulfilled(j)) {
+	if (_starpu_not_all_tag_deps_are_fulfilled(j))
+	{
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
                 _STARPU_LOG_OUT_TAG("not_all_tag_deps_are_fulfilled");
                 _STARPU_LOG_OUT_TAG("not_all_tag_deps_are_fulfilled");
 		return 0;
 		return 0;
         }
         }
 
 
 	/* enfore task dependencies */
 	/* enfore task dependencies */
-	if (_starpu_not_all_task_deps_are_fulfilled(j, job_is_already_locked)) {
+	if (_starpu_not_all_task_deps_are_fulfilled(j))
+	{
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
                 _STARPU_LOG_OUT_TAG("not_all_task_deps_are_fulfilled");
                 _STARPU_LOG_OUT_TAG("not_all_task_deps_are_fulfilled");
 		return 0;
 		return 0;
         }
         }
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
 
 
 	/* enforce data dependencies */
 	/* enforce data dependencies */
-	if (_starpu_submit_job_enforce_data_deps(j)) {
+	if (_starpu_submit_job_enforce_data_deps(j))
+	{
                 _STARPU_LOG_OUT_TAG("enforce_data_deps");
                 _STARPU_LOG_OUT_TAG("enforce_data_deps");
 		return 0;
 		return 0;
         }
         }
 
 
-	ret = _starpu_push_task(j, job_is_already_locked);
+	ret = _starpu_push_task(j);
 
 
         _STARPU_LOG_OUT();
         _STARPU_LOG_OUT();
 	return ret;
 	return ret;
 }
 }
 
 
 /* Tag deps are already fulfilled */
 /* Tag deps are already fulfilled */
-#ifdef STARPU_DEVEL
-#warning TODO remove the job_is_already_locked parameter
-#endif
-unsigned _starpu_enforce_deps_starting_from_task(starpu_job_t j, unsigned job_is_already_locked)
+unsigned _starpu_enforce_deps_starting_from_task(struct _starpu_job *j)
 {
 {
 	unsigned ret;
 	unsigned ret;
 
 
+	_STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
 	/* enfore task dependencies */
 	/* enfore task dependencies */
-	if (_starpu_not_all_task_deps_are_fulfilled(j, job_is_already_locked))
+	if (_starpu_not_all_task_deps_are_fulfilled(j))
+	{
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
 		return 0;
 		return 0;
+	}
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
 
 
 	/* enforce data dependencies */
 	/* enforce data dependencies */
 	if (_starpu_submit_job_enforce_data_deps(j))
 	if (_starpu_submit_job_enforce_data_deps(j))
 		return 0;
 		return 0;
 
 
-	ret = _starpu_push_task(j, job_is_already_locked);
+	ret = _starpu_push_task(j);
 
 
 	return ret;
 	return ret;
 }
 }
 
 
 /* This function must be called with worker->sched_mutex taken */
 /* This function must be called with worker->sched_mutex taken */
-struct starpu_task *_starpu_pop_local_task(struct starpu_worker_s *worker)
+struct starpu_task *_starpu_pop_local_task(struct _starpu_worker *worker)
 {
 {
 	struct starpu_task *task = NULL;
 	struct starpu_task *task = NULL;
 
 
@@ -380,27 +390,27 @@ struct starpu_task *_starpu_pop_local_task(struct starpu_worker_s *worker)
 	return task;
 	return task;
 }
 }
 
 
-int _starpu_push_local_task(struct starpu_worker_s *worker, struct starpu_task *task, int back)
+int _starpu_push_local_task(struct _starpu_worker *worker, struct starpu_task *task, int back)
 {
 {
 	/* Check that the worker is able to execute the task ! */
 	/* Check that the worker is able to execute the task ! */
 	STARPU_ASSERT(task && task->cl);
 	STARPU_ASSERT(task && task->cl);
 	if (STARPU_UNLIKELY(!(worker->worker_mask & task->cl->where)))
 	if (STARPU_UNLIKELY(!(worker->worker_mask & task->cl->where)))
 		return -ENODEV;
 		return -ENODEV;
 
 
-	PTHREAD_MUTEX_LOCK(&worker->sched_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(worker->sched_mutex);
 
 
 	if (back)
 	if (back)
 		starpu_task_list_push_back(&worker->local_tasks, task);
 		starpu_task_list_push_back(&worker->local_tasks, task);
 	else
 	else
 		starpu_task_list_push_front(&worker->local_tasks, task);
 		starpu_task_list_push_front(&worker->local_tasks, task);
 
 
-	PTHREAD_COND_BROADCAST(&worker->sched_cond);
-	PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex);
+	_STARPU_PTHREAD_COND_BROADCAST(worker->sched_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(worker->sched_mutex);
 
 
 	return 0;
 	return 0;
 }
 }
 
 
-const char *_starpu_get_model_name(starpu_job_t j)
+const char *_starpu_get_model_name(struct _starpu_job *j)
 {
 {
 	if (!j)
 	if (!j)
 		return NULL;
 		return NULL;
@@ -410,7 +420,8 @@ const char *_starpu_get_model_name(starpu_job_t j)
             && task->cl->model
             && task->cl->model
             && task->cl->model->symbol)
             && task->cl->model->symbol)
                 return task->cl->model->symbol;
                 return task->cl->model->symbol;
-        else {
+        else
+	{
 #ifdef STARPU_USE_FXT
 #ifdef STARPU_USE_FXT
                 return j->model_name;
                 return j->model_name;
 #else
 #else

+ 31 - 26
src/core/jobs.h

@@ -1,8 +1,9 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  Télécom-SudParis
+ * Copyright (C) 2011  INRIA
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -42,20 +43,19 @@
 #include <cuda.h>
 #include <cuda.h>
 #endif
 #endif
 
 
-struct starpu_worker_s;
+struct _starpu_worker;
 
 
 /* codelet function */
 /* codelet function */
-typedef void (*cl_func)(void **, void *);
-typedef void (*callback)(void *);
+typedef void (*_starpu_cl_func_t)(void **, void *);
 
 
-#define STARPU_CPU_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_CPU)
-#define STARPU_CUDA_MAY_PERFORM(j)      ((j)->task->cl->where & STARPU_CUDA)
-#define STARPU_SPU_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_SPU)
-#define STARPU_GORDON_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_GORDON)
-#define STARPU_OPENCL_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_OPENCL)
+#define _STARPU_CPU_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_CPU)
+#define _STARPU_CUDA_MAY_PERFORM(j)      ((j)->task->cl->where & STARPU_CUDA)
+#define _STARPU_SPU_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_SPU)
+#define _STARPU_GORDON_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_GORDON)
+#define _STARPU_OPENCL_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_OPENCL)
 
 
 /* A job is the internal representation of a task. */
 /* A job is the internal representation of a task. */
-LIST_TYPE(starpu_job,
+LIST_TYPE(_starpu_job,
 
 
 	/* The implementation associated to the job */
 	/* The implementation associated to the job */
 	unsigned nimpl;
 	unsigned nimpl;
@@ -71,15 +71,20 @@ LIST_TYPE(starpu_job,
 	/* To avoid deadlocks, we reorder the different buffers accessed to by
 	/* To avoid deadlocks, we reorder the different buffers accessed to by
 	 * the task so that we always grab the rw-lock associated to the
 	 * the task so that we always grab the rw-lock associated to the
 	 * handles in the same order. */
 	 * handles in the same order. */
-	struct starpu_buffer_descr_t ordered_buffers[STARPU_NMAXBUFS];
-	
+	struct starpu_buffer_descr ordered_buffers[STARPU_NMAXBUFS];
+
 	/* If a tag is associated to the job, this points to the internal data
 	/* If a tag is associated to the job, this points to the internal data
 	 * structure that describes the tag status. */
 	 * structure that describes the tag status. */
-	struct starpu_tag_s *tag;
+	struct _starpu_tag *tag;
 
 
 	/* Maintain a list of all the completion groups that depend on the job.
 	/* Maintain a list of all the completion groups that depend on the job.
 	 * */
 	 * */
-	struct starpu_cg_list_s job_successors;
+	struct _starpu_cg_list job_successors;
+
+	/* For tasks with cl==NULL but submitted with explicit data dependency,
+	 * the handle for this dependency, so as to remove the task from the
+	 * last_writer/readers */
+	starpu_data_handle_t implicit_dep_handle;
 
 
 	/* The value of the footprint that identifies the job may be stored in
 	/* The value of the footprint that identifies the job may be stored in
 	 * this structure. */
 	 * this structure. */
@@ -128,43 +133,43 @@ LIST_TYPE(starpu_job,
 	/* Parallel workers may have to synchronize before/after the execution of a parallel task. */
 	/* Parallel workers may have to synchronize before/after the execution of a parallel task. */
 	pthread_barrier_t before_work_barrier;
 	pthread_barrier_t before_work_barrier;
 	pthread_barrier_t after_work_barrier;
 	pthread_barrier_t after_work_barrier;
-);
+)
 
 
-/* Create an internal starpu_job_t structure to encapsulate the task. */
-starpu_job_t __attribute__((malloc)) _starpu_job_create(struct starpu_task *task);
+/* Create an internal struct _starpu_job *structure to encapsulate the task. */
+struct _starpu_job* __attribute__((malloc)) _starpu_job_create(struct starpu_task *task);
 
 
 /* Destroy the data structure associated to the job structure */
 /* Destroy the data structure associated to the job structure */
-void _starpu_job_destroy(starpu_job_t j);
+void _starpu_job_destroy(struct _starpu_job *j);
 
 
 /* Wait for the termination of the job */
 /* Wait for the termination of the job */
-void _starpu_wait_job(starpu_job_t j);
+void _starpu_wait_job(struct _starpu_job *j);
 
 
 /* Specify that the task should not appear in the DAG generated by debug tools. */
 /* Specify that the task should not appear in the DAG generated by debug tools. */
 void _starpu_exclude_task_from_dag(struct starpu_task *task);
 void _starpu_exclude_task_from_dag(struct starpu_task *task);
 
 
 /* try to submit job j, enqueue it if it's not schedulable yet */
 /* try to submit job j, enqueue it if it's not schedulable yet */
-unsigned _starpu_enforce_deps_and_schedule(starpu_job_t j, unsigned job_is_already_locked);
-unsigned _starpu_enforce_deps_starting_from_task(starpu_job_t j, unsigned job_is_already_locked);
+unsigned _starpu_enforce_deps_and_schedule(struct _starpu_job *j);
+unsigned _starpu_enforce_deps_starting_from_task(struct _starpu_job *j);
 
 
 
 
 /* This function must be called after the execution of a job, this triggers all
 /* This function must be called after the execution of a job, this triggers all
  * job's dependencies and perform the callback function if any. */
  * job's dependencies and perform the callback function if any. */
-void _starpu_handle_job_termination(starpu_job_t j, unsigned job_is_already_locked, int workerid);
+void _starpu_handle_job_termination(struct _starpu_job *j, int workerid);
 
 
 /* Get the sum of the size of the data accessed by the job. */
 /* Get the sum of the size of the data accessed by the job. */
-size_t _starpu_job_get_data_size(starpu_job_t j);
+size_t _starpu_job_get_data_size(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, unsigned nimpl, struct _starpu_job *j);
 
 
 /* Get a task from the local pool of tasks that were explicitly attributed to
 /* Get a task from the local pool of tasks that were explicitly attributed to
  * that worker. */
  * that worker. */
-struct starpu_task *_starpu_pop_local_task(struct starpu_worker_s *worker);
+struct starpu_task *_starpu_pop_local_task(struct _starpu_worker *worker);
 
 
 /* Put a task into the pool of tasks that are explicitly attributed to the
 /* Put a task into the pool of tasks that are explicitly attributed to the
  * specified worker. If "back" is set, the task is put at the back of the list.
  * specified worker. If "back" is set, the task is put at the back of the list.
  * Considering the tasks are popped from the back, this value should be 0 to
  * Considering the tasks are popped from the back, this value should be 0 to
  * enforce a FIFO ordering. */
  * enforce a FIFO ordering. */
-int _starpu_push_local_task(struct starpu_worker_s *worker, struct starpu_task *task, int back);
+int _starpu_push_local_task(struct _starpu_worker *worker, struct starpu_task *task, int back);
 
 
 /* Returns the symbol associated to that job if any. */
 /* Returns the symbol associated to that job if any. */
-const char *_starpu_get_model_name(starpu_job_t j);
+const char *_starpu_get_model_name(struct _starpu_job *j);
 
 
 #endif // __JOBS_H__
 #endif // __JOBS_H__

+ 298 - 55
src/core/perfmodel/perfmodel.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  Télécom-SudParis
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -30,7 +30,7 @@
 #ifdef STARPU_HAVE_WINDOWS
 #ifdef STARPU_HAVE_WINDOWS
 #include <windows.h>
 #include <windows.h>
 #endif
 #endif
-		
+
 /* This flag indicates whether performance models should be calibrated or not.
 /* This flag indicates whether performance models should be calibrated or not.
  *	0: models need not be calibrated
  *	0: models need not be calibrated
  *	1: models must be calibrated
  *	1: models must be calibrated
@@ -50,7 +50,7 @@ unsigned _starpu_get_calibrate_flag(void)
 
 
 enum starpu_perf_archtype starpu_worker_get_perf_archtype(int workerid)
 enum starpu_perf_archtype starpu_worker_get_perf_archtype(int workerid)
 {
 {
-	struct starpu_machine_config_s *config = _starpu_get_machine_config();
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
 
 
 	/* This workerid may either be a basic worker or a combined worker */
 	/* This workerid may either be a basic worker or a combined worker */
 	unsigned nworkers = config->topology.nworkers;
 	unsigned nworkers = config->topology.nworkers;
@@ -68,14 +68,18 @@ enum starpu_perf_archtype starpu_worker_get_perf_archtype(int workerid)
  * PER ARCH model
  * PER ARCH model
  */
  */
 
 
-static double per_arch_task_expected_perf(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct starpu_task *task, unsigned nimpl)
+static double per_arch_task_expected_perf(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, struct starpu_task *task, unsigned nimpl)
 {
 {
-	double exp = -1.0;
-	double (*per_arch_cost_model)(struct starpu_buffer_descr_t *);
-	
+	double exp = NAN;
+	double (*per_arch_cost_function)(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
+	double (*per_arch_cost_model)(struct starpu_buffer_descr *);
+
+	per_arch_cost_function = model->per_arch[arch][nimpl].cost_function;
 	per_arch_cost_model = model->per_arch[arch][nimpl].cost_model;
 	per_arch_cost_model = model->per_arch[arch][nimpl].cost_model;
 
 
-	if (per_arch_cost_model)
+	if (per_arch_cost_function)
+		exp = per_arch_cost_function(task, arch, nimpl);
+	else if (per_arch_cost_model)
 		exp = per_arch_cost_model(task->buffers);
 		exp = per_arch_cost_model(task->buffers);
 
 
 	return exp;
 	return exp;
@@ -89,50 +93,67 @@ double starpu_worker_get_relative_speedup(enum starpu_perf_archtype perf_archtyp
 {
 {
 	if (perf_archtype < STARPU_CUDA_DEFAULT)
 	if (perf_archtype < STARPU_CUDA_DEFAULT)
 	{
 	{
-		return STARPU_CPU_ALPHA * (perf_archtype + 1);
+		return _STARPU_CPU_ALPHA * (perf_archtype + 1);
 	}
 	}
 	else if (perf_archtype < STARPU_OPENCL_DEFAULT)
 	else if (perf_archtype < STARPU_OPENCL_DEFAULT)
 	{
 	{
-		return STARPU_CUDA_ALPHA;
+		return _STARPU_CUDA_ALPHA;
 	}
 	}
 	else if (perf_archtype < STARPU_GORDON_DEFAULT)
 	else if (perf_archtype < STARPU_GORDON_DEFAULT)
 	{
 	{
-		return STARPU_OPENCL_ALPHA;
+		return _STARPU_OPENCL_ALPHA;
 	}
 	}
-	else if (perf_archtype < STARPU_NARCH_VARIATIONS) {
+	else if (perf_archtype < STARPU_NARCH_VARIATIONS)
+	{
 		/* Gordon value */
 		/* Gordon value */
-		return STARPU_GORDON_ALPHA;
+		return _STARPU_GORDON_ALPHA;
 	}
 	}
 
 
 	STARPU_ABORT();
 	STARPU_ABORT();
 
 
 	/* Never reached ! */
 	/* Never reached ! */
-	return -1.0;
+	return NAN;
 }
 }
 
 
-static double common_task_expected_perf(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct starpu_task *task)
+static double common_task_expected_perf(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, struct starpu_task *task, unsigned nimpl)
 {
 {
 	double exp;
 	double exp;
 	double alpha;
 	double alpha;
 
 
-	if (model->cost_model) {
+	if (model->cost_function)
+	{
+		exp = model->cost_function(task, nimpl);
+		alpha = starpu_worker_get_relative_speedup(arch);
+
+		STARPU_ASSERT(!_STARPU_IS_ZERO(alpha));
+
+		return (exp/alpha);
+	}
+	else if (model->cost_model)
+	{
 		exp = model->cost_model(task->buffers);
 		exp = model->cost_model(task->buffers);
 		alpha = starpu_worker_get_relative_speedup(arch);
 		alpha = starpu_worker_get_relative_speedup(arch);
 
 
-		STARPU_ASSERT(alpha != 0.0f);
+		STARPU_ASSERT(!_STARPU_IS_ZERO(alpha));
 
 
 		return (exp/alpha);
 		return (exp/alpha);
 	}
 	}
 
 
-	return -1.0;
+	return NAN;
 }
 }
 
 
-void _starpu_load_perfmodel(struct starpu_perfmodel_t *model)
+void _starpu_load_perfmodel(struct starpu_perfmodel *model)
 {
 {
 	if (!model || model->is_loaded)
 	if (!model || model->is_loaded)
 		return;
 		return;
 
 
-	switch (model->type) {
+	int load_model = _starpu_register_model(model);
+
+	if (!load_model)
+		return;
+
+	switch (model->type)
+	{
 		case STARPU_PER_ARCH:
 		case STARPU_PER_ARCH:
 		case STARPU_COMMON:
 		case STARPU_COMMON:
 			break;
 			break;
@@ -150,20 +171,21 @@ void _starpu_load_perfmodel(struct starpu_perfmodel_t *model)
 			STARPU_ABORT();
 			STARPU_ABORT();
 	}
 	}
 
 
-	_starpu_register_model(model);
 	model->is_loaded = 1;
 	model->is_loaded = 1;
 }
 }
 
 
-static double starpu_model_expected_perf(struct starpu_task *task, struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch,  unsigned nimpl)
+static double starpu_model_expected_perf(struct starpu_task *task, struct starpu_perfmodel *model, enum starpu_perf_archtype arch,  unsigned nimpl)
 {
 {
-	if (model) {
-		starpu_job_t j = _starpu_get_job_associated_to_task(task);
-		switch (model->type) {
+	if (model)
+	{
+		struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
+		switch (model->type)
+		{
 			case STARPU_PER_ARCH:
 			case STARPU_PER_ARCH:
 
 
 				return per_arch_task_expected_perf(model, arch, task, nimpl);
 				return per_arch_task_expected_perf(model, arch, task, nimpl);
 			case STARPU_COMMON:
 			case STARPU_COMMON:
-				return common_task_expected_perf(model, arch, task);
+				return common_task_expected_perf(model, arch, task, nimpl);
 
 
 			case STARPU_HISTORY_BASED:
 			case STARPU_HISTORY_BASED:
 
 
@@ -178,7 +200,7 @@ static double starpu_model_expected_perf(struct starpu_task *task, struct starpu
 
 
 			default:
 			default:
 				STARPU_ABORT();
 				STARPU_ABORT();
-		};
+		}
 	}
 	}
 
 
 	/* no model was found */
 	/* no model was found */
@@ -196,13 +218,89 @@ double starpu_task_expected_power(struct starpu_task *task, enum starpu_perf_arc
 	return starpu_model_expected_perf(task, task->cl->power_model, arch, nimpl);
 	return starpu_model_expected_perf(task, task->cl->power_model, arch, nimpl);
 }
 }
 
 
+double starpu_task_expected_conversion_time(struct starpu_task *task,
+					    enum starpu_perf_archtype arch,
+					    unsigned nimpl)
+{
+	unsigned i;
+	int err;
+	double sum = 0.0;
+	unsigned int node, cpu_node;
+
+	/* We need to get one node per archtype. This is kinda ugly,
+	 * but it does the job.
+	 * XXX : Should we return 0 if there are no devices ?
+	 * (err != 1 && err != -ERANGE)
+	 */
+#ifdef STARPU_USE_CPU
+	int cpu_worker;
+	err = starpu_worker_get_ids_by_type(STARPU_CPU_WORKER,
+					    &cpu_worker, 1);
+	if (err != 1 && err != -ERANGE)
+		return 0.0;
+	cpu_node = starpu_worker_get_memory_node(cpu_worker);
+#endif
+#ifdef STARPU_USE_CUDA
+	int cuda_worker, cuda_node;
+	err = starpu_worker_get_ids_by_type(STARPU_CUDA_WORKER,
+					    &cuda_worker, 1);
+	if (err != 1 && err != -ERANGE)
+		return 0.0;
+	cuda_node = starpu_worker_get_memory_node(cuda_worker);
+#endif
+#ifdef STARPU_USE_OPENCL
+	int opencl_worker, opencl_node;
+	err = starpu_worker_get_ids_by_type(STARPU_OPENCL_WORKER,
+					    &opencl_worker, 1);
+	if (err != 1 && err != -ERANGE)
+		return 0.0;
+
+	opencl_node = starpu_worker_get_memory_node(opencl_worker);
+#endif
+
+	for (i = 0; i < task->cl->nbuffers; i++)
+	{
+		starpu_data_handle_t handle;
+		struct starpu_task *conversion_task;
+
+		handle = task->handles[i];
+		if (!_starpu_data_is_multiformat_handle(handle))
+			continue;
+
+		if (arch < STARPU_CUDA_DEFAULT)
+			node = cpu_node;
+#ifdef STARPU_USE_CUDA
+		else if (arch >= STARPU_CUDA_DEFAULT && arch < STARPU_OPENCL_DEFAULT)
+			node = cuda_node;
+#endif
+#ifdef STARPU_USE_OPENCL
+		else if (arch >= STARPU_OPENCL_DEFAULT && arch < STARPU_GORDON_DEFAULT)
+			node = opencl_node;
+#endif
+		else
+			STARPU_ASSERT(0);
+
+		if (!_starpu_handle_needs_conversion_task(handle, node))
+			continue;
+
+		conversion_task = _starpu_create_conversion_task(handle, node);
+		sum += starpu_task_expected_length(conversion_task, arch, nimpl);
+		handle->refcnt--;
+		handle->busy_count--;
+		starpu_task_deinit(conversion_task);
+		free(conversion_task);
+	}
+
+	return sum;
+}
+
 /* Predict the transfer time (in µs) to move a handle to a memory node */
 /* Predict the transfer time (in µs) to move a handle to a memory node */
-double starpu_data_expected_transfer_time(starpu_data_handle handle, unsigned memory_node, starpu_access_mode mode)
+double starpu_data_expected_transfer_time(starpu_data_handle_t handle, unsigned memory_node, enum starpu_access_mode mode)
 {
 {
 	/* If we don't need to read the content of the handle */
 	/* If we don't need to read the content of the handle */
 	if (!(mode & STARPU_R))
 	if (!(mode & STARPU_R))
 		return 0.0;
 		return 0.0;
-	
+
 	if (_starpu_is_data_present_or_requested(handle, memory_node))
 	if (_starpu_is_data_present_or_requested(handle, memory_node))
 		return 0.0;
 		return 0.0;
 
 
@@ -215,7 +313,7 @@ double starpu_data_expected_transfer_time(starpu_data_handle handle, unsigned me
 	if (size == 0)
 	if (size == 0)
 		return 0.0;
 		return 0.0;
 
 
-	uint32_t src_node = _starpu_select_src_node(handle);
+	uint32_t src_node = _starpu_select_src_node(handle, memory_node);
 	return _starpu_predict_transfer_time(src_node, memory_node, size);
 	return _starpu_predict_transfer_time(src_node, memory_node, size);
 }
 }
 
 
@@ -229,8 +327,8 @@ double starpu_task_expected_data_transfer_time(uint32_t memory_node, struct star
 
 
 	for (buffer = 0; buffer < nbuffers; buffer++)
 	for (buffer = 0; buffer < nbuffers; buffer++)
 	{
 	{
-		starpu_data_handle handle = task->buffers[buffer].handle;
-		starpu_access_mode mode = task->buffers[buffer].mode;
+		starpu_data_handle_t handle = task->handles[buffer];
+		enum starpu_access_mode mode = task->cl->modes[buffer];
 
 
 		penalty += starpu_data_expected_transfer_time(handle, memory_node, mode);
 		penalty += starpu_data_expected_transfer_time(handle, memory_node, mode);
 	}
 	}
@@ -238,6 +336,119 @@ double starpu_task_expected_data_transfer_time(uint32_t memory_node, struct star
 	return penalty;
 	return penalty;
 }
 }
 
 
+/* Return the expected duration of the entire task bundle in µs */
+double _starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, enum starpu_perf_archtype arch, unsigned nimpl)
+{
+	double expected_length = 0.0;
+
+	/* We expect the length of the bundle the be the sum of the different tasks length. */
+	_STARPU_PTHREAD_MUTEX_LOCK(&bundle->mutex);
+
+	struct _starpu_task_bundle_entry *entry;
+	entry = bundle->list;
+
+	while (entry)
+	{
+		double task_length = starpu_task_expected_length(entry->task, arch, nimpl);
+
+		/* In case the task is not calibrated, we consider the task
+		 * ends immediately. */
+		if (task_length > 0.0)
+			expected_length += task_length;
+
+		entry = entry->next;
+	}
+
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&bundle->mutex);
+
+	return expected_length;
+}
+
+/* Return the expected power consumption of the entire task bundle in J */
+double _starpu_task_bundle_expected_power(starpu_task_bundle_t bundle, enum starpu_perf_archtype arch, unsigned nimpl)
+{
+	double expected_power = 0.0;
+
+	/* We expect total consumption of the bundle the be the sum of the different tasks consumption. */
+	_STARPU_PTHREAD_MUTEX_LOCK(&bundle->mutex);
+
+	struct _starpu_task_bundle_entry *entry;
+	entry = bundle->list;
+
+	while (entry)
+	{
+		double task_power = starpu_task_expected_power(entry->task, arch, nimpl);
+
+		/* In case the task is not calibrated, we consider the task
+		 * ends immediately. */
+		if (task_power > 0.0)
+			expected_power += task_power;
+
+		entry = entry->next;
+	}
+
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&bundle->mutex);
+
+	return expected_power;
+}
+
+/* Return the time (in µs) expected to transfer all data used within the bundle */
+double _starpu_task_bundle_expected_data_transfer_time(starpu_task_bundle_t bundle, unsigned memory_node)
+{
+	_STARPU_PTHREAD_MUTEX_LOCK(&bundle->mutex);
+
+	struct _starpu_handle_list *handles = NULL;
+
+	/* We list all the handle that are accessed within the bundle. */
+
+	/* For each task in the bundle */
+	struct _starpu_task_bundle_entry *entry = bundle->list;
+	while (entry)
+	{
+		struct starpu_task *task = entry->task;
+
+		if (task->cl)
+		{
+			unsigned b;
+			for (b = 0; b < task->cl->nbuffers; b++)
+			{
+				starpu_data_handle_t handle = task->handles[b];
+				enum starpu_access_mode mode = task->cl->modes[b];
+
+				if (!(mode & STARPU_R))
+					continue;
+
+				/* Insert the handle in the sorted list in case
+				 * it's not already in that list. */
+				_insertion_handle_sorted(&handles, handle, mode);
+			}
+		}
+
+		entry = entry->next;
+	}
+
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&bundle->mutex);
+
+	/* Compute the sum of data transfer time, and destroy the list */
+
+	double total_exp = 0.0;
+
+	while (handles)
+	{
+		struct _starpu_handle_list *current = handles;
+		handles = handles->next;
+
+		double exp;
+		exp = starpu_data_expected_transfer_time(current->handle, memory_node, current->mode);
+
+		total_exp += exp;
+
+		free(current);
+	}
+
+	return total_exp;
+}
+
 static int directory_existence_was_tested = 0;
 static int directory_existence_was_tested = 0;
 
 
 void _starpu_get_perf_model_dir(char *path, size_t maxlen)
 void _starpu_get_perf_model_dir(char *path, size_t maxlen)
@@ -246,13 +457,15 @@ void _starpu_get_perf_model_dir(char *path, size_t maxlen)
 	/* use the directory specified at configure time */
 	/* use the directory specified at configure time */
 	snprintf(path, maxlen, "%s", STARPU_PERF_MODEL_DIR);
 	snprintf(path, maxlen, "%s", STARPU_PERF_MODEL_DIR);
 #else
 #else
-	/* by default, we use $HOME/.starpu/sampling */
-	const char *home_path = getenv("HOME");
+	const char *home_path = getenv("XDG_CACHE_HOME");
+	if (!home_path)
+		home_path = getenv("STARPU_HOME");
+	if (!home_path)
+		home_path = getenv("HOME");
 	if (!home_path)
 	if (!home_path)
 		home_path = getenv("USERPROFILE");
 		home_path = getenv("USERPROFILE");
-	if (!home_path) {
+	if (!home_path)
 		_STARPU_ERROR("couldn't find a home place to put starpu data\n");
 		_STARPU_ERROR("couldn't find a home place to put starpu data\n");
-	}
 	snprintf(path, maxlen, "%s/.starpu/sampling/", home_path);
 	snprintf(path, maxlen, "%s/.starpu/sampling/", home_path);
 #endif
 #endif
 }
 }
@@ -285,8 +498,8 @@ void _starpu_create_sampling_directory_if_needed(void)
 		/* The performance of the codelets are stored in
 		/* The performance of the codelets are stored in
 		 * $STARPU_PERF_MODEL_DIR/codelets/ while those of the bus are stored in
 		 * $STARPU_PERF_MODEL_DIR/codelets/ while those of the bus are stored in
 		 * $STARPU_PERF_MODEL_DIR/bus/ so that we don't have name collisions */
 		 * $STARPU_PERF_MODEL_DIR/bus/ so that we don't have name collisions */
-		
-		/* Testing if a directory exists and creating it otherwise 
+
+		/* Testing if a directory exists and creating it otherwise
 		   may not be safe: it is possible that the permission are
 		   may not be safe: it is possible that the permission are
 		   changed in between. Instead, we create it and check if
 		   changed in between. Instead, we create it and check if
 		   it already existed before */
 		   it already existed before */
@@ -295,14 +508,21 @@ void _starpu_create_sampling_directory_if_needed(void)
 
 
 		if (ret == -1)
 		if (ret == -1)
 		{
 		{
-			STARPU_ASSERT(errno == EEXIST);
-	
+			if (errno != EEXIST) {
+				fprintf(stderr,"Error making starpu directory %s:\n", perf_model_dir);
+				perror("mkdir");
+				STARPU_ASSERT(0);
+			}
+
 			/* make sure that it is actually a directory */
 			/* make sure that it is actually a directory */
 			struct stat sb;
 			struct stat sb;
 			stat(perf_model_dir, &sb);
 			stat(perf_model_dir, &sb);
-			STARPU_ASSERT(S_ISDIR(sb.st_mode));
+			if (!S_ISDIR(sb.st_mode)) {
+				fprintf(stderr,"Error: %s is not a directory:\n", perf_model_dir);
+				STARPU_ASSERT(0);
+			}
 		}
 		}
-	
+
 		/* Per-task performance models */
 		/* Per-task performance models */
 		char perf_model_dir_codelets[256];
 		char perf_model_dir_codelets[256];
 		_starpu_get_perf_model_dir_codelets(perf_model_dir_codelets, 256);
 		_starpu_get_perf_model_dir_codelets(perf_model_dir_codelets, 256);
@@ -310,14 +530,22 @@ void _starpu_create_sampling_directory_if_needed(void)
 		ret = _starpu_mkpath(perf_model_dir_codelets, S_IRWXU);
 		ret = _starpu_mkpath(perf_model_dir_codelets, S_IRWXU);
 		if (ret == -1)
 		if (ret == -1)
 		{
 		{
-			STARPU_ASSERT(errno == EEXIST);
-	
+			if (errno != EEXIST) {
+				fprintf(stderr,"Error making starpu directory %s:\n", perf_model_dir);
+				perror("mkdir");
+				STARPU_ASSERT(0);
+			}
+
+
 			/* make sure that it is actually a directory */
 			/* make sure that it is actually a directory */
 			struct stat sb;
 			struct stat sb;
 			stat(perf_model_dir_codelets, &sb);
 			stat(perf_model_dir_codelets, &sb);
-			STARPU_ASSERT(S_ISDIR(sb.st_mode));
+			if (!S_ISDIR(sb.st_mode)) {
+				fprintf(stderr,"Error: %s is not a directory:\n", perf_model_dir);
+				STARPU_ASSERT(0);
+			}
 		}
 		}
-	
+
 		/* Performance of the memory subsystem */
 		/* Performance of the memory subsystem */
 		char perf_model_dir_bus[256];
 		char perf_model_dir_bus[256];
 		_starpu_get_perf_model_dir_bus(perf_model_dir_bus, 256);
 		_starpu_get_perf_model_dir_bus(perf_model_dir_bus, 256);
@@ -325,14 +553,21 @@ void _starpu_create_sampling_directory_if_needed(void)
 		ret = _starpu_mkpath(perf_model_dir_bus, S_IRWXU);
 		ret = _starpu_mkpath(perf_model_dir_bus, S_IRWXU);
 		if (ret == -1)
 		if (ret == -1)
 		{
 		{
-			STARPU_ASSERT(errno == EEXIST);
-	
+			if (errno != EEXIST) {
+				fprintf(stderr,"Error making starpu directory %s:\n", perf_model_dir);
+				perror("mkdir");
+				STARPU_ASSERT(0);
+			}
+
 			/* make sure that it is actually a directory */
 			/* make sure that it is actually a directory */
 			struct stat sb;
 			struct stat sb;
 			stat(perf_model_dir_bus, &sb);
 			stat(perf_model_dir_bus, &sb);
-			STARPU_ASSERT(S_ISDIR(sb.st_mode));
+			if (!S_ISDIR(sb.st_mode)) {
+				fprintf(stderr,"Error: %s is not a directory:\n", perf_model_dir);
+				STARPU_ASSERT(0);
+			}
 		}
 		}
-	
+
 		/* Performance debug measurements */
 		/* Performance debug measurements */
 		char perf_model_dir_debug[256];
 		char perf_model_dir_debug[256];
 		_starpu_get_perf_model_dir_debug(perf_model_dir_debug, 256);
 		_starpu_get_perf_model_dir_debug(perf_model_dir_debug, 256);
@@ -340,14 +575,22 @@ void _starpu_create_sampling_directory_if_needed(void)
 		ret = _starpu_mkpath(perf_model_dir_debug, S_IRWXU);
 		ret = _starpu_mkpath(perf_model_dir_debug, S_IRWXU);
 		if (ret == -1)
 		if (ret == -1)
 		{
 		{
-			STARPU_ASSERT(errno == EEXIST);
-	
+			if (errno != EEXIST) {
+				fprintf(stderr,"Error making starpu directory %s:\n", perf_model_dir);
+				perror("mkdir");
+				STARPU_ASSERT(0);
+			}
+
+
 			/* make sure that it is actually a directory */
 			/* make sure that it is actually a directory */
 			struct stat sb;
 			struct stat sb;
 			stat(perf_model_dir_debug, &sb);
 			stat(perf_model_dir_debug, &sb);
-			STARPU_ASSERT(S_ISDIR(sb.st_mode));
+			if (!S_ISDIR(sb.st_mode)) {
+				fprintf(stderr,"Error: %s is not a directory:\n", perf_model_dir);
+				STARPU_ASSERT(0);
+			}
 		}
 		}
-	
+
 		directory_existence_was_tested = 1;
 		directory_existence_was_tested = 1;
 	}
 	}
 }
 }

+ 23 - 61
src/core/perfmodel/perfmodel.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  Télécom-SudParis
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -24,67 +24,22 @@
 #include <starpu_perfmodel.h>
 #include <starpu_perfmodel.h>
 //#include <core/jobs.h>
 //#include <core/jobs.h>
 #include <common/htable32.h>
 #include <common/htable32.h>
+#include <core/task_bundle.h>
 //#include <core/workers.h>
 //#include <core/workers.h>
 #include <pthread.h>
 #include <pthread.h>
 #include <stdio.h>
 #include <stdio.h>
 
 
-struct starpu_buffer_descr_t;
-struct starpu_jobq_s;
-struct starpu_job_s;
+struct starpu_buffer_descr;
+struct _starpu_job;
 enum starpu_perf_archtype;
 enum starpu_perf_archtype;
 
 
-struct starpu_history_entry_t {
-	//double measured;
-	
-	/* mean_n = 1/n sum */
-	double mean;
-
-	/* n dev_n = sum2 - 1/n (sum)^2 */
-	double deviation;
-
-	/* sum of samples */
-	double sum;
-
-	/* sum of samples^2 */
-	double sum2;
-
-//	/* sum of ln(measured) */
-//	double sumlny;
-//
-//	/* sum of ln(size) */
-//	double sumlnx;
-//	double sumlnx2;
-//
-//	/* sum of ln(size) ln(measured) */
-//	double sumlnxlny;
-//
-	unsigned nsample;
-
-	uint32_t footprint;
-#ifdef STARPU_HAVE_WINDOWS
-	unsigned size; /* in bytes */
-#else
-	size_t size; /* in bytes */
-#endif
-};
-
-struct starpu_history_list_t {
-	struct starpu_history_list_t *next;
-	struct starpu_history_entry_t *entry;
-};
-
-struct starpu_model_list_t {
-	struct starpu_model_list_t *next;
-	struct starpu_perfmodel_t *model;
-};
-
-//
 ///* File format */
 ///* File format */
-//struct model_file_format {
+//struct model_file_format
+// {
 //	unsigned ncore_entries;
 //	unsigned ncore_entries;
 //	unsigned ncuda_entries;
 //	unsigned ncuda_entries;
 //	/* contains core entries, then cuda ones */
 //	/* contains core entries, then cuda ones */
-//	struct starpu_history_entry_t entries[];
+//	struct starpu_history_entry entries[];
 //}
 //}
 
 
 void _starpu_get_perf_model_dir(char *path, size_t maxlen);
 void _starpu_get_perf_model_dir(char *path, size_t maxlen);
@@ -92,18 +47,18 @@ void _starpu_get_perf_model_dir_codelets(char *path, size_t maxlen);
 void _starpu_get_perf_model_dir_bus(char *path, size_t maxlen);
 void _starpu_get_perf_model_dir_bus(char *path, size_t maxlen);
 void _starpu_get_perf_model_dir_debug(char *path, size_t maxlen);
 void _starpu_get_perf_model_dir_debug(char *path, size_t maxlen);
 
 
-double _starpu_history_based_job_expected_perf(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct starpu_job_s *j, unsigned nimpl);
-void _starpu_register_model(struct starpu_perfmodel_t *model);
-void _starpu_load_history_based_model(struct starpu_perfmodel_t *model, unsigned scan_history);
-void _starpu_load_perfmodel(struct starpu_perfmodel_t *model);
+double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, struct _starpu_job *j, unsigned nimpl);
+int _starpu_register_model(struct starpu_perfmodel *model);
+void _starpu_load_history_based_model(struct starpu_perfmodel *model, unsigned scan_history);
+void _starpu_load_perfmodel(struct starpu_perfmodel *model);
 void _starpu_initialize_registered_performance_models(void);
 void _starpu_initialize_registered_performance_models(void);
 void _starpu_deinitialize_registered_performance_models(void);
 void _starpu_deinitialize_registered_performance_models(void);
 
 
-double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel_t *model,
-					enum starpu_perf_archtype arch, struct starpu_job_s *j, unsigned nimpl);
-double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfmodel_t *model,
-					enum starpu_perf_archtype arch, struct starpu_job_s *j, unsigned nimpl);
-void _starpu_update_perfmodel_history(struct starpu_job_s *j, struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch,
+double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel *model,
+					enum starpu_perf_archtype arch, struct _starpu_job *j, unsigned nimpl);
+double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfmodel *model,
+					enum starpu_perf_archtype arch, struct _starpu_job *j, unsigned nimpl);
+void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfmodel *model, enum starpu_perf_archtype arch,
 				unsigned cpuid, double measured, unsigned nimpl);
 				unsigned cpuid, double measured, unsigned nimpl);
 
 
 void _starpu_create_sampling_directory_if_needed(void);
 void _starpu_create_sampling_directory_if_needed(void);
@@ -111,6 +66,13 @@ void _starpu_create_sampling_directory_if_needed(void);
 void _starpu_load_bus_performance_files(void);
 void _starpu_load_bus_performance_files(void);
 double _starpu_predict_transfer_time(unsigned src_node, unsigned dst_node, size_t size);
 double _starpu_predict_transfer_time(unsigned src_node, unsigned dst_node, size_t size);
 
 
+/* Return the expected duration of the entire task bundle in µs. */
+double _starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, enum starpu_perf_archtype arch, unsigned nimpl);
+/* Return the time (in µs) expected to transfer all data used within the bundle */
+double _starpu_task_bundle_expected_data_transfer_time(starpu_task_bundle_t bundle, unsigned memory_node);
+/* Return the expected power consumption of the entire task bundle in J. */
+double _starpu_task_bundle_expected_power(starpu_task_bundle_t bundle, enum starpu_perf_archtype arch, unsigned nimpl);
+
 void _starpu_set_calibrate_flag(unsigned val);
 void _starpu_set_calibrate_flag(unsigned val);
 unsigned _starpu_get_calibrate_flag(void);
 unsigned _starpu_get_calibrate_flag(void);
 
 

+ 232 - 114
src/core/perfmodel/perfmodel_bus.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -46,14 +46,16 @@
 
 
 #define MAXCPUS	32
 #define MAXCPUS	32
 
 
-struct dev_timing {
+/* timing is in µs per byte (i.e. slowness, inverse of bandwidth) */
+struct dev_timing
+{
 	int cpu_id;
 	int cpu_id;
 	double timing_htod;
 	double timing_htod;
 	double timing_dtoh;
 	double timing_dtoh;
 };
 };
 
 
-static double bandwidth_matrix[STARPU_MAXNODES][STARPU_MAXNODES] = {{-1.0}};
-static double latency_matrix[STARPU_MAXNODES][STARPU_MAXNODES] = {{ -1.0}};
+static double bandwidth_matrix[STARPU_MAXNODES][STARPU_MAXNODES] = {{NAN}};
+static double latency_matrix[STARPU_MAXNODES][STARPU_MAXNODES] = {{NAN}};
 static unsigned was_benchmarked = 0;
 static unsigned was_benchmarked = 0;
 static unsigned ncpus = 0;
 static unsigned ncpus = 0;
 static int ncuda = 0;
 static int ncuda = 0;
@@ -65,15 +67,16 @@ static int nopencl = 0;
 static int cuda_affinity_matrix[STARPU_MAXCUDADEVS][MAXCPUS];
 static int cuda_affinity_matrix[STARPU_MAXCUDADEVS][MAXCPUS];
 static double cudadev_timing_htod[STARPU_MAXNODES] = {0.0};
 static double cudadev_timing_htod[STARPU_MAXNODES] = {0.0};
 static double cudadev_timing_dtoh[STARPU_MAXNODES] = {0.0};
 static double cudadev_timing_dtoh[STARPU_MAXNODES] = {0.0};
+#ifdef HAVE_CUDA_MEMCPY_PEER
+static double cudadev_timing_dtod[STARPU_MAXNODES][STARPU_MAXNODES] = {{0.0}};
+#endif
 static struct dev_timing cudadev_timing_per_cpu[STARPU_MAXNODES*MAXCPUS];
 static struct dev_timing cudadev_timing_per_cpu[STARPU_MAXNODES*MAXCPUS];
-static size_t cuda_size = SIZE;
 #endif
 #endif
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
 static int opencl_affinity_matrix[STARPU_MAXOPENCLDEVS][MAXCPUS];
 static int opencl_affinity_matrix[STARPU_MAXOPENCLDEVS][MAXCPUS];
 static double opencldev_timing_htod[STARPU_MAXNODES] = {0.0};
 static double opencldev_timing_htod[STARPU_MAXNODES] = {0.0};
 static double opencldev_timing_dtoh[STARPU_MAXNODES] = {0.0};
 static double opencldev_timing_dtoh[STARPU_MAXNODES] = {0.0};
 static struct dev_timing opencldev_timing_per_cpu[STARPU_MAXNODES*MAXCPUS];
 static struct dev_timing opencldev_timing_per_cpu[STARPU_MAXNODES*MAXCPUS];
-static size_t opencl_size = SIZE;
 #endif
 #endif
 
 
 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
@@ -85,8 +88,9 @@ static hwloc_topology_t hwtopology;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int cpu, struct dev_timing *dev_timing_per_cpu)
 static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int cpu, struct dev_timing *dev_timing_per_cpu)
 {
 {
-	struct starpu_machine_config_s *config = _starpu_get_machine_config();
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
 	_starpu_bind_thread_on_cpu(config, cpu);
 	_starpu_bind_thread_on_cpu(config, cpu);
+	size_t size = SIZE;
 
 
 	/* Initiliaze CUDA context on the device */
 	/* Initiliaze CUDA context on the device */
 	cudaSetDevice(dev);
 	cudaSetDevice(dev);
@@ -105,34 +109,31 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 	cudaError_t cures;
 	cudaError_t cures;
 	cures = cudaGetDeviceProperties(&prop, dev);
 	cures = cudaGetDeviceProperties(&prop, dev);
 	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
 	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
-        if (cuda_size > prop.totalGlobalMem/4) cuda_size = prop.totalGlobalMem/4;
+        if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;
 
 
 	/* Allocate a buffer on the device */
 	/* Allocate a buffer on the device */
 	unsigned char *d_buffer;
 	unsigned char *d_buffer;
-	cudaMalloc((void **)&d_buffer, cuda_size);
-	assert(d_buffer);
+	cudaMalloc((void **)&d_buffer, size);
+	STARPU_ASSERT(d_buffer);
 
 
 	/* hack to avoid third party libs to rebind threads */
 	/* hack to avoid third party libs to rebind threads */
 	_starpu_bind_thread_on_cpu(config, cpu);
 	_starpu_bind_thread_on_cpu(config, cpu);
 
 
-
 	/* Allocate a buffer on the host */
 	/* Allocate a buffer on the host */
 	unsigned char *h_buffer;
 	unsigned char *h_buffer;
-	cudaHostAlloc((void **)&h_buffer, cuda_size, 0);
-	assert(h_buffer);
+	cures = cudaHostAlloc((void **)&h_buffer, size, 0);
+	STARPU_ASSERT(cures == cudaSuccess);
 
 
 	/* hack to avoid third party libs to rebind threads */
 	/* hack to avoid third party libs to rebind threads */
 	_starpu_bind_thread_on_cpu(config, cpu);
 	_starpu_bind_thread_on_cpu(config, cpu);
 
 
-
 	/* Fill them */
 	/* Fill them */
-	memset(h_buffer, 0, cuda_size);
-	cudaMemset(d_buffer, 0, cuda_size);
+	memset(h_buffer, 0, size);
+	cudaMemset(d_buffer, 0, size);
 
 
 	/* hack to avoid third party libs to rebind threads */
 	/* hack to avoid third party libs to rebind threads */
 	_starpu_bind_thread_on_cpu(config, cpu);
 	_starpu_bind_thread_on_cpu(config, cpu);
 
 
-
 	unsigned iter;
 	unsigned iter;
 	double timing;
 	double timing;
 	struct timeval start;
 	struct timeval start;
@@ -142,25 +143,25 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 	gettimeofday(&start, NULL);
 	gettimeofday(&start, NULL);
 	for (iter = 0; iter < NITER; iter++)
 	for (iter = 0; iter < NITER; iter++)
 	{
 	{
-		cudaMemcpy(d_buffer, h_buffer, cuda_size, cudaMemcpyHostToDevice);
+		cudaMemcpy(d_buffer, h_buffer, size, cudaMemcpyHostToDevice);
 		cudaThreadSynchronize();
 		cudaThreadSynchronize();
 	}
 	}
 	gettimeofday(&end, NULL);
 	gettimeofday(&end, NULL);
 	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 
 
-	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod = timing/NITER;
+	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod = timing/NITER/size;
 
 
 	/* Measure download bandwidth */
 	/* Measure download bandwidth */
 	gettimeofday(&start, NULL);
 	gettimeofday(&start, NULL);
 	for (iter = 0; iter < NITER; iter++)
 	for (iter = 0; iter < NITER; iter++)
 	{
 	{
-		cudaMemcpy(h_buffer, d_buffer, cuda_size, cudaMemcpyDeviceToHost);
+		cudaMemcpy(h_buffer, d_buffer, size, cudaMemcpyDeviceToHost);
 		cudaThreadSynchronize();
 		cudaThreadSynchronize();
 	}
 	}
 	gettimeofday(&end, NULL);
 	gettimeofday(&end, NULL);
 	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 
 
-	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh = timing/NITER;
+	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh = timing/NITER/size;
 
 
 	/* Free buffers */
 	/* Free buffers */
 	cudaFreeHost(h_buffer);
 	cudaFreeHost(h_buffer);
@@ -168,6 +169,65 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 
 
 	cudaThreadExit();
 	cudaThreadExit();
 }
 }
+
+#ifdef HAVE_CUDA_MEMCPY_PEER
+static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
+{
+	size_t size = SIZE;
+
+        /* Get the maximum size which can be allocated on the device */
+	struct cudaDeviceProp prop;
+	cudaError_t cures;
+	cures = cudaGetDeviceProperties(&prop, src);
+	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
+        if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;
+	cures = cudaGetDeviceProperties(&prop, dst);
+	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
+        if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;
+
+	/* Initiliaze CUDA context on the source */
+	cudaSetDevice(src);
+
+	/* Allocate a buffer on the device */
+	unsigned char *s_buffer;
+	cudaMalloc((void **)&s_buffer, size);
+	STARPU_ASSERT(s_buffer);
+	cudaMemset(s_buffer, 0, size);
+
+	/* Initiliaze CUDA context on the destination */
+	cudaSetDevice(dst);
+
+	/* Allocate a buffer on the device */
+	unsigned char *d_buffer;
+	cudaMalloc((void **)&d_buffer, size);
+	STARPU_ASSERT(d_buffer);
+	cudaMemset(d_buffer, 0, size);
+
+	unsigned iter;
+	double timing;
+	struct timeval start;
+	struct timeval end;
+
+	/* Measure upload bandwidth */
+	gettimeofday(&start, NULL);
+	for (iter = 0; iter < NITER; iter++)
+	{
+		cudaMemcpyPeer(d_buffer, dst, s_buffer, src, size);
+		cudaThreadSynchronize();
+	}
+	gettimeofday(&end, NULL);
+	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+
+	cudadev_timing_dtod[src+1][dst+1] = timing/NITER/size;
+
+	/* Free buffers */
+	cudaFree(d_buffer);
+	cudaSetDevice(src);
+	cudaFree(s_buffer);
+
+	cudaThreadExit();
+}
+#endif
 #endif
 #endif
 
 
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
@@ -176,8 +236,9 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(int dev, i
         cl_context context;
         cl_context context;
         cl_command_queue queue;
         cl_command_queue queue;
         cl_int err=0;
         cl_int err=0;
+	size_t size = SIZE;
 
 
-        struct starpu_machine_config_s *config = _starpu_get_machine_config();
+        struct _starpu_machine_config *config = _starpu_get_machine_config();
 	_starpu_bind_thread_on_cpu(config, cpu);
 	_starpu_bind_thread_on_cpu(config, cpu);
 
 
 	/* Initialize OpenCL context on the device */
 	/* Initialize OpenCL context on the device */
@@ -191,28 +252,28 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(int dev, i
         starpu_opencl_get_device(dev, &device);
         starpu_opencl_get_device(dev, &device);
 	err = clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(maxMemAllocSize), &maxMemAllocSize, NULL);
 	err = clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(maxMemAllocSize), &maxMemAllocSize, NULL);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
-        if (opencl_size > (size_t)maxMemAllocSize/4) opencl_size = maxMemAllocSize/4;
+        if (size > (size_t)maxMemAllocSize/4) size = maxMemAllocSize/4;
 
 
 	/* hack to avoid third party libs to rebind threads */
 	/* hack to avoid third party libs to rebind threads */
 	_starpu_bind_thread_on_cpu(config, cpu);
 	_starpu_bind_thread_on_cpu(config, cpu);
 
 
 	/* Allocate a buffer on the device */
 	/* Allocate a buffer on the device */
 	cl_mem d_buffer;
 	cl_mem d_buffer;
-	d_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, opencl_size, NULL, &err);
+	d_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err);
 	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
 
 	/* hack to avoid third party libs to rebind threads */
 	/* hack to avoid third party libs to rebind threads */
 	_starpu_bind_thread_on_cpu(config, cpu);
 	_starpu_bind_thread_on_cpu(config, cpu);
         /* Allocate a buffer on the host */
         /* Allocate a buffer on the host */
 	unsigned char *h_buffer;
 	unsigned char *h_buffer;
-        h_buffer = malloc(opencl_size);
-	assert(h_buffer);
+        h_buffer = (unsigned char *)malloc(size);
+	STARPU_ASSERT(h_buffer);
 
 
 	/* hack to avoid third party libs to rebind threads */
 	/* hack to avoid third party libs to rebind threads */
 	_starpu_bind_thread_on_cpu(config, cpu);
 	_starpu_bind_thread_on_cpu(config, cpu);
         /* Fill them */
         /* Fill them */
-	memset(h_buffer, 0, opencl_size);
-        err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, opencl_size, h_buffer, 0, NULL, NULL);
+	memset(h_buffer, 0, size);
+        err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, size, h_buffer, 0, NULL, NULL);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 	/* hack to avoid third party libs to rebind threads */
 	/* hack to avoid third party libs to rebind threads */
 	_starpu_bind_thread_on_cpu(config, cpu);
 	_starpu_bind_thread_on_cpu(config, cpu);
@@ -226,25 +287,25 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(int dev, i
 	gettimeofday(&start, NULL);
 	gettimeofday(&start, NULL);
 	for (iter = 0; iter < NITER; iter++)
 	for (iter = 0; iter < NITER; iter++)
 	{
 	{
-                err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, opencl_size, h_buffer, 0, NULL, NULL);
+                err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, size, h_buffer, 0, NULL, NULL);
                 if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
                 if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 	}
 	}
 	gettimeofday(&end, NULL);
 	gettimeofday(&end, NULL);
 	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 
 
-	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod = timing/NITER;
+	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod = timing/NITER/size;
 
 
 	/* Measure download bandwidth */
 	/* Measure download bandwidth */
 	gettimeofday(&start, NULL);
 	gettimeofday(&start, NULL);
 	for (iter = 0; iter < NITER; iter++)
 	for (iter = 0; iter < NITER; iter++)
 	{
 	{
-                err = clEnqueueReadBuffer(queue, d_buffer, CL_TRUE, 0, opencl_size, h_buffer, 0, NULL, NULL);
+                err = clEnqueueReadBuffer(queue, d_buffer, CL_TRUE, 0, size, h_buffer, 0, NULL, NULL);
                 if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
                 if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 	}
 	}
 	gettimeofday(&end, NULL);
 	gettimeofday(&end, NULL);
 	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 
 
-	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh = timing/NITER;
+	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh = timing/NITER/size;
 
 
 	/* Free buffers */
 	/* Free buffers */
 	clReleaseMemObject(d_buffer);
 	clReleaseMemObject(d_buffer);
@@ -258,8 +319,8 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(int dev, i
 /* NB: we want to sort the bandwidth by DECREASING order */
 /* NB: we want to sort the bandwidth by DECREASING order */
 static int compar_dev_timing(const void *left_dev_timing, const void *right_dev_timing)
 static int compar_dev_timing(const void *left_dev_timing, const void *right_dev_timing)
 {
 {
-	const struct dev_timing *left = left_dev_timing;
-	const struct dev_timing *right = right_dev_timing;
+	const struct dev_timing *left = (const struct dev_timing *)left_dev_timing;
+	const struct dev_timing *right = (const struct dev_timing *)right_dev_timing;
 
 
 	double left_dtoh = left->timing_dtoh;
 	double left_dtoh = left->timing_dtoh;
 	double left_htod = left->timing_htod;
 	double left_htod = left->timing_htod;
@@ -291,7 +352,7 @@ static int find_numa_node(hwloc_obj_t obj)
 
 
 	STARPU_ASSERT(current->depth == HWLOC_OBJ_NODE);
 	STARPU_ASSERT(current->depth == HWLOC_OBJ_NODE);
 
 
-	return current->logical_index; 
+	return current->logical_index;
 }
 }
 #endif
 #endif
 
 
@@ -308,12 +369,24 @@ static void measure_bandwidth_between_cpus_and_dev(int dev, struct dev_timing *d
 	/* If no NUMA node was found, we assume that we have a single memory
 	/* If no NUMA node was found, we assume that we have a single memory
 	 * bank. */
 	 * bank. */
 	const unsigned no_node_obj_was_found = (nnuma_nodes == 0);
 	const unsigned no_node_obj_was_found = (nnuma_nodes == 0);
-	
-	unsigned is_available_per_numa_node[nnuma_nodes];
-	double dev_timing_htod_per_numa_node[nnuma_nodes];
-	double dev_timing_dtoh_per_numa_node[nnuma_nodes];
 
 
-	memset(is_available_per_numa_node, 0, nnuma_nodes*sizeof(unsigned));
+	unsigned *is_available_per_numa_node = NULL;
+	double *dev_timing_htod_per_numa_node = NULL;
+	double *dev_timing_dtoh_per_numa_node = NULL;
+
+	if (!no_node_obj_was_found)
+	{
+		is_available_per_numa_node = (unsigned *)malloc(nnuma_nodes * sizeof(unsigned));
+		STARPU_ASSERT(is_available_per_numa_node);
+
+		dev_timing_htod_per_numa_node = (double *)malloc(nnuma_nodes * sizeof(double));
+		STARPU_ASSERT(dev_timing_htod_per_numa_node);
+
+		dev_timing_dtoh_per_numa_node = (double *)malloc(nnuma_nodes * sizeof(double));
+		STARPU_ASSERT(dev_timing_dtoh_per_numa_node);
+
+		memset(is_available_per_numa_node, 0, nnuma_nodes*sizeof(unsigned));
+	}
 #endif
 #endif
 
 
 	unsigned cpu;
 	unsigned cpu;
@@ -327,9 +400,9 @@ static void measure_bandwidth_between_cpus_and_dev(int dev, struct dev_timing *d
 		if (!no_node_obj_was_found)
 		if (!no_node_obj_was_found)
 		{
 		{
 			hwloc_obj_t obj = hwloc_get_obj_by_depth(hwtopology, cpu_depth, cpu);
 			hwloc_obj_t obj = hwloc_get_obj_by_depth(hwtopology, cpu_depth, cpu);
-	
+
 			numa_id = find_numa_node(obj);
 			numa_id = find_numa_node(obj);
-	
+
 			if (is_available_per_numa_node[numa_id])
 			if (is_available_per_numa_node[numa_id])
 			{
 			{
 				/* We reuse the previous numbers for that NUMA node */
 				/* We reuse the previous numbers for that NUMA node */
@@ -364,6 +437,15 @@ static void measure_bandwidth_between_cpus_and_dev(int dev, struct dev_timing *d
 		}
 		}
 #endif
 #endif
         }
         }
+
+#ifdef STARPU_HAVE_HWLOC
+	if (!no_node_obj_was_found)
+	{
+		free(is_available_per_numa_node);
+		free(dev_timing_htod_per_numa_node);
+		free(dev_timing_dtoh_per_numa_node);
+	}
+#endif /* STARPU_HAVE_HWLOC */
 }
 }
 
 
 static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_htod, double *dev_timing_dtoh,
 static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_htod, double *dev_timing_dtoh,
@@ -386,7 +468,7 @@ static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_h
 
 
 		double bandwidth_sum2 = bandwidth_dtoh*bandwidth_dtoh + bandwidth_htod*bandwidth_htod;
 		double bandwidth_sum2 = bandwidth_dtoh*bandwidth_dtoh + bandwidth_htod*bandwidth_htod;
 
 
-		_STARPU_DISP("BANDWIDTH GPU %d CPU %u - htod %lf - dtoh %lf - %lf\n", dev, current_cpu, bandwidth_htod, bandwidth_dtoh, sqrt(bandwidth_sum2));
+		_STARPU_DISP("BANDWIDTH GPU %d CPU %u - htod %f - dtoh %f - %f\n", dev, current_cpu, bandwidth_htod, bandwidth_dtoh, sqrt(bandwidth_sum2));
 	}
 	}
 
 
 	unsigned best_cpu = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].cpu_id;
 	unsigned best_cpu = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].cpu_id;
@@ -405,6 +487,9 @@ static void benchmark_all_gpu_devices(void)
 {
 {
 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
 	int i;
 	int i;
+#ifdef HAVE_CUDA_MEMCPY_PEER
+	int j;
+#endif
 
 
 	_STARPU_DEBUG("Benchmarking the speed of the bus\n");
 	_STARPU_DEBUG("Benchmarking the speed of the bus\n");
 
 
@@ -428,21 +513,33 @@ static void benchmark_all_gpu_devices(void)
 #warning Missing binding support, StarPU will not be able to properly benchmark NUMA topology
 #warning Missing binding support, StarPU will not be able to properly benchmark NUMA topology
 #endif
 #endif
 
 
-	struct starpu_machine_config_s *config = _starpu_get_machine_config();
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
 	ncpus = _starpu_topology_get_nhwcpu(config);
 	ncpus = _starpu_topology_get_nhwcpu(config);
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-        cudaGetDeviceCount(&ncuda);
+	ncuda = _starpu_get_cuda_device_count();
 	for (i = 0; i < ncuda; i++)
 	for (i = 0; i < ncuda; i++)
 	{
 	{
+		fprintf(stderr," CUDA %d...", i);
 		/* measure bandwidth between Host and Device i */
 		/* measure bandwidth between Host and Device i */
 		measure_bandwidth_between_host_and_dev(i, cudadev_timing_htod, cudadev_timing_dtoh, cudadev_timing_per_cpu, 'C');
 		measure_bandwidth_between_host_and_dev(i, cudadev_timing_htod, cudadev_timing_dtoh, cudadev_timing_per_cpu, 'C');
 	}
 	}
+#ifdef HAVE_CUDA_MEMCPY_PEER
+	for (i = 0; i < ncuda; i++)
+		for (j = 0; j < ncuda; j++)
+			if (i != j)
+			{
+				fprintf(stderr," CUDA %d -> %d...", i, j);
+				/* measure bandwidth between Host and Device i */
+				measure_bandwidth_between_dev_and_dev_cuda(i, j);
+			}
+#endif
 #endif
 #endif
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
         nopencl = _starpu_opencl_get_device_count();
         nopencl = _starpu_opencl_get_device_count();
 	for (i = 0; i < nopencl; i++)
 	for (i = 0; i < nopencl; i++)
 	{
 	{
+		fprintf(stderr," OpenCL %d...", i);
 		/* measure bandwith between Host and Device i */
 		/* measure bandwith between Host and Device i */
 		measure_bandwidth_between_host_and_dev(i, opencldev_timing_htod, opencldev_timing_dtoh, opencldev_timing_per_cpu, 'O');
 		measure_bandwidth_between_host_and_dev(i, opencldev_timing_htod, opencldev_timing_dtoh, opencldev_timing_per_cpu, 'O');
 	}
 	}
@@ -477,7 +574,7 @@ static void get_bus_path(const char *type, char *path, size_t maxlen)
 	char hostname[32];
 	char hostname[32];
 	char *forced_hostname = getenv("STARPU_HOSTNAME");
 	char *forced_hostname = getenv("STARPU_HOSTNAME");
 	if (forced_hostname && forced_hostname[0])
 	if (forced_hostname && forced_hostname[0])
-		snprintf(hostname, sizeof(hostname), forced_hostname);
+		snprintf(hostname, sizeof(hostname), "%s", forced_hostname);
 	else
 	else
 		gethostname(hostname, sizeof(hostname));
 		gethostname(hostname, sizeof(hostname));
 	strncat(path, ".", maxlen);
 	strncat(path, ".", maxlen);
@@ -495,6 +592,7 @@ static void get_affinity_path(char *path, size_t maxlen)
 
 
 static void load_bus_affinity_file_content(void)
 static void load_bus_affinity_file_content(void)
 {
 {
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
 	FILE *f;
 	FILE *f;
 
 
 	char path[256];
 	char path[256];
@@ -503,13 +601,12 @@ static void load_bus_affinity_file_content(void)
 	f = fopen(path, "r");
 	f = fopen(path, "r");
 	STARPU_ASSERT(f);
 	STARPU_ASSERT(f);
 
 
-#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
-	struct starpu_machine_config_s *config = _starpu_get_machine_config();
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
 	ncpus = _starpu_topology_get_nhwcpu(config);
 	ncpus = _starpu_topology_get_nhwcpu(config);
         int gpu;
         int gpu;
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-        cudaGetDeviceCount(&ncuda);
+	ncuda = _starpu_get_cuda_device_count();
 	for (gpu = 0; gpu < ncuda; gpu++)
 	for (gpu = 0; gpu < ncuda; gpu++)
 	{
 	{
 		int ret;
 		int ret;
@@ -532,7 +629,7 @@ static void load_bus_affinity_file_content(void)
 		ret = fscanf(f, "\n");
 		ret = fscanf(f, "\n");
 		STARPU_ASSERT(ret == 0);
 		STARPU_ASSERT(ret == 0);
 	}
 	}
-#endif
+#endif /* !STARPU_USE_CUDA */
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
         nopencl = _starpu_opencl_get_device_count();
         nopencl = _starpu_opencl_get_device_count();
 	for (gpu = 0; gpu < nopencl; gpu++)
 	for (gpu = 0; gpu < nopencl; gpu++)
@@ -557,21 +654,21 @@ static void load_bus_affinity_file_content(void)
 		ret = fscanf(f, "\n");
 		ret = fscanf(f, "\n");
 		STARPU_ASSERT(ret == 0);
 		STARPU_ASSERT(ret == 0);
 	}
 	}
-#endif
-#endif
+#endif /* !STARPU_USE_OPENCL */
 
 
 	fclose(f);
 	fclose(f);
+#endif /* !(STARPU_USE_CUDA_ || STARPU_USE_OPENCL */
+
 }
 }
 
 
 static void write_bus_affinity_file_content(void)
 static void write_bus_affinity_file_content(void)
 {
 {
-	FILE *f;
-
 	STARPU_ASSERT(was_benchmarked);
 	STARPU_ASSERT(was_benchmarked);
 
 
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
+	FILE *f;
 	char path[256];
 	char path[256];
 	get_affinity_path(path, 256);
 	get_affinity_path(path, 256);
-
 	f = fopen(path, "w+");
 	f = fopen(path, "w+");
 	if (!f)
 	if (!f)
 	{
 	{
@@ -581,7 +678,6 @@ static void write_bus_affinity_file_content(void)
 		STARPU_ABORT();
 		STARPU_ABORT();
 	}
 	}
 
 
-#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
 	unsigned cpu;
 	unsigned cpu;
         int gpu;
         int gpu;
 
 
@@ -689,12 +785,14 @@ static int load_bus_latency_file_content(void)
 			double latency;
 			double latency;
 
 
 			n = fscanf(f, "%lf", &latency);
 			n = fscanf(f, "%lf", &latency);
-			if (n != 1) {
+			if (n != 1)
+			{
 				fclose(f);
 				fclose(f);
 				return 0;
 				return 0;
 			}
 			}
 			n = getc(f);
 			n = getc(f);
-			if (n != '\t') {
+			if (n != '\t')
+			{
 				fclose(f);
 				fclose(f);
 				return 0;
 				return 0;
 			}
 			}
@@ -703,7 +801,8 @@ static int load_bus_latency_file_content(void)
 		}
 		}
 
 
 		n = getc(f);
 		n = getc(f);
-		if (n != '\n') {
+		if (n != '\n')
+		{
 			fclose(f);
 			fclose(f);
 			return 0;
 			return 0;
 		}
 		}
@@ -750,17 +849,19 @@ static void write_bus_latency_file_content(void)
 			if ((src > maxnode) || (dst > maxnode))
 			if ((src > maxnode) || (dst > maxnode))
 			{
 			{
 				/* convention */
 				/* convention */
-				latency = -1.0;
+				latency = NAN;
 			}
 			}
 			else if (src == dst)
 			else if (src == dst)
 			{
 			{
 				latency = 0.0;
 				latency = 0.0;
 			}
 			}
-			else {
+			else
+			{
+				/* µs */
                                 latency = ((src && dst)?2000.0:500.0);
                                 latency = ((src && dst)?2000.0:500.0);
 			}
 			}
 
 
-			fprintf(f, "%lf\t", latency);
+			fprintf(f, "%f\t", latency);
 		}
 		}
 
 
 		fprintf(f, "\n");
 		fprintf(f, "\n");
@@ -828,13 +929,15 @@ static int load_bus_bandwidth_file_content(void)
 			double bandwidth;
 			double bandwidth;
 
 
 			n = fscanf(f, "%lf", &bandwidth);
 			n = fscanf(f, "%lf", &bandwidth);
-			if (n != 1) {
+			if (n != 1)
+			{
 				fprintf(stderr,"didn't get a number\n");
 				fprintf(stderr,"didn't get a number\n");
 				fclose(f);
 				fclose(f);
 				return 0;
 				return 0;
 			}
 			}
 			n = getc(f);
 			n = getc(f);
-			if (n != '\t') {
+			if (n != '\t')
+			{
 				fclose(f);
 				fclose(f);
 				return 0;
 				return 0;
 			}
 			}
@@ -843,7 +946,8 @@ static int load_bus_bandwidth_file_content(void)
 		}
 		}
 
 
 		n = getc(f);
 		n = getc(f);
-		if (n != '\n') {
+		if (n != '\n')
+		{
 			fclose(f);
 			fclose(f);
 			return 0;
 			return 0;
 		}
 		}
@@ -883,36 +987,43 @@ static void write_bus_bandwidth_file_content(void)
 
 
 			if ((src > maxnode) || (dst > maxnode))
 			if ((src > maxnode) || (dst > maxnode))
 			{
 			{
-				bandwidth = -1.0;
+				bandwidth = NAN;
 			}
 			}
 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
 			else if (src != dst)
 			else if (src != dst)
 			{
 			{
-                                double time_src_to_ram=0.0, time_ram_to_dst=0.0;
-                                double timing;
-                                /* Bandwidth = (SIZE)/(time i -> ram + time ram -> j)*/
+				double slowness = 0.0;
+				/* Total bandwidth is the harmonic mean of bandwidths */
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-				time_src_to_ram = (src==0)?0.0:cudadev_timing_dtoh[src];
-                                time_ram_to_dst = (dst==0)?0.0:cudadev_timing_htod[dst];
-				timing =time_src_to_ram + time_ram_to_dst;
-				bandwidth = 1.0*cuda_size/timing;
+#ifdef HAVE_CUDA_MEMCPY_PEER
+				if (src && src <= ncuda && dst && dst <= ncuda)
+					/* Direct GPU-GPU transfert */
+					slowness = cudadev_timing_dtod[src][dst];
+				else
+#endif
+				{
+					if (src && src <= ncuda)
+						slowness += cudadev_timing_dtoh[src];
+					if (dst && dst <= ncuda)
+						slowness += cudadev_timing_htod[dst];
+				}
 #endif
 #endif
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
-                                if (src > ncuda)
-                                        time_src_to_ram = (src==0)?0.0:opencldev_timing_dtoh[src-ncuda];
-                                if (dst > ncuda)
-                                        time_ram_to_dst = (dst==0)?0.0:opencldev_timing_htod[dst-ncuda];
-				timing =time_src_to_ram + time_ram_to_dst;
-				bandwidth = 1.0*opencl_size/timing;
+				if (src > ncuda)
+					slowness += opencldev_timing_dtoh[src-ncuda];
+				if (dst > ncuda)
+					slowness += opencldev_timing_htod[dst-ncuda];
 #endif
 #endif
+				bandwidth = 1.0/slowness;
 			}
 			}
 #endif
 #endif
-			else {
+			else
+			{
 			        /* convention */
 			        /* convention */
 			        bandwidth = 0.0;
 			        bandwidth = 0.0;
 			}
 			}
 
 
-			fprintf(f, "%lf\t", bandwidth);
+			fprintf(f, "%f\t", bandwidth);
 		}
 		}
 
 
 		fprintf(f, "\n");
 		fprintf(f, "\n");
@@ -921,37 +1032,38 @@ static void write_bus_bandwidth_file_content(void)
 	fclose(f);
 	fclose(f);
 }
 }
 
 
-void starpu_print_bus_bandwidth(FILE *f)
+void starpu_bus_print_bandwidth(FILE *f)
 {
 {
-  int src, dst, maxnode;
+	int src, dst, maxnode;
 
 
-  maxnode = ncuda;
+        maxnode = ncuda;
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
-  maxnode += nopencl;
+        maxnode += nopencl;
 #endif
 #endif
 
 
-  fprintf(f, "from\t");
-  fprintf(f, "to RAM\t\t");
-  for (dst = 0; dst < ncuda; dst++)
-    fprintf(f, "to CUDA %d\t", dst);
-  for (dst = 0; dst < nopencl; dst++)
-    fprintf(f, "to OpenCL %d\t", dst);
-  fprintf(f, "\n");
-
-  for (src = 0; src <= maxnode; src++)
-    {
-      if (!src)
-	fprintf(f, "RAM\t");
-      else if (src <= ncuda)
-	fprintf(f, "CUDA %d\t", src-1);
-      else
-	fprintf(f, "OpenCL%d\t", src-ncuda-1);
-      for (dst = 0; dst <= maxnode; dst++)
-	fprintf(f, "%f\t", bandwidth_matrix[src][dst]);
-
-      fprintf(f, "\n");
-    }
+	fprintf(f, "from\t");
+	fprintf(f, "to RAM\t\t");
+	for (dst = 0; dst < ncuda; dst++)
+		fprintf(f, "to CUDA %d\t", dst);
+	for (dst = 0; dst < nopencl; dst++)
+		fprintf(f, "to OpenCL %d\t", dst);
+	fprintf(f, "\n");
+
+	for (src = 0; src <= maxnode; src++)
+	{
+		if (!src)
+			fprintf(f, "RAM\t");
+		else if (src <= ncuda)
+			fprintf(f, "CUDA %d\t", src-1);
+		else
+			fprintf(f, "OpenCL%d\t", src-ncuda-1);
+		for (dst = 0; dst <= maxnode; dst++)
+			fprintf(f, "%f\t", bandwidth_matrix[src][dst]);
+
+		fprintf(f, "\n");
+	}
 }
 }
+
 static void generate_bus_bandwidth_file(void)
 static void generate_bus_bandwidth_file(void)
 {
 {
 	if (!was_benchmarked)
 	if (!was_benchmarked)
@@ -990,16 +1102,18 @@ static void check_bus_config_file()
 
 
         get_config_path(path, 256);
         get_config_path(path, 256);
         res = access(path, F_OK);
         res = access(path, F_OK);
-        if (res) {
+        if (res)
+	{
 		fprintf(stderr, "No performance model for the bus, calibrating...");
 		fprintf(stderr, "No performance model for the bus, calibrating...");
 		starpu_force_bus_sampling();
 		starpu_force_bus_sampling();
 		fprintf(stderr, "done\n");
 		fprintf(stderr, "done\n");
         }
         }
-        else {
+        else
+	{
                 FILE *f;
                 FILE *f;
                 int ret, read_cuda, read_opencl;
                 int ret, read_cuda, read_opencl;
                 unsigned read_cpus;
                 unsigned read_cpus;
-                struct starpu_machine_config_s *config = _starpu_get_machine_config();
+                struct _starpu_machine_config *config = _starpu_get_machine_config();
 
 
                 // Loading configuration from file
                 // Loading configuration from file
                 f = fopen(path, "r");
                 f = fopen(path, "r");
@@ -1019,24 +1133,27 @@ static void check_bus_config_file()
                 // Loading current configuration
                 // Loading current configuration
                 ncpus = _starpu_topology_get_nhwcpu(config);
                 ncpus = _starpu_topology_get_nhwcpu(config);
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-                cudaGetDeviceCount(&ncuda);
+		ncuda = _starpu_get_cuda_device_count();
 #endif
 #endif
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
                 nopencl = _starpu_opencl_get_device_count();
                 nopencl = _starpu_opencl_get_device_count();
 #endif
 #endif
 
 
                 // Checking if both configurations match
                 // Checking if both configurations match
-                if (read_cpus != ncpus) {
+                if (read_cpus != ncpus)
+		{
 			fprintf(stderr, "Current configuration does not match the bus performance model (CPUS: (stored) %u != (current) %u), recalibrating...", read_cpus, ncpus);
 			fprintf(stderr, "Current configuration does not match the bus performance model (CPUS: (stored) %u != (current) %u), recalibrating...", read_cpus, ncpus);
                         starpu_force_bus_sampling();
                         starpu_force_bus_sampling();
 			fprintf(stderr, "done\n");
 			fprintf(stderr, "done\n");
                 }
                 }
-                else if (read_cuda != ncuda) {
+                else if (read_cuda != ncuda)
+		{
                         fprintf(stderr, "Current configuration does not match the bus performance model (CUDA: (stored) %d != (current) %d), recalibrating...", read_cuda, ncuda);
                         fprintf(stderr, "Current configuration does not match the bus performance model (CUDA: (stored) %d != (current) %d), recalibrating...", read_cuda, ncuda);
                         starpu_force_bus_sampling();
                         starpu_force_bus_sampling();
 			fprintf(stderr, "done\n");
 			fprintf(stderr, "done\n");
                 }
                 }
-                else if (read_opencl != nopencl) {
+                else if (read_opencl != nopencl)
+		{
                         fprintf(stderr, "Current configuration does not match the bus performance model (OpenCL: (stored) %d != (current) %d), recalibrating...", read_opencl, nopencl);
                         fprintf(stderr, "Current configuration does not match the bus performance model (OpenCL: (stored) %d != (current) %d), recalibrating...", read_opencl, nopencl);
                         starpu_force_bus_sampling();
                         starpu_force_bus_sampling();
 			fprintf(stderr, "done\n");
 			fprintf(stderr, "done\n");
@@ -1094,11 +1211,12 @@ void _starpu_load_bus_performance_files(void)
 	load_bus_bandwidth_file();
 	load_bus_bandwidth_file();
 }
 }
 
 
+/* (in µs) */
 double _starpu_predict_transfer_time(unsigned src_node, unsigned dst_node, size_t size)
 double _starpu_predict_transfer_time(unsigned src_node, unsigned dst_node, size_t size)
 {
 {
 	double bandwidth = bandwidth_matrix[src_node][dst_node];
 	double bandwidth = bandwidth_matrix[src_node][dst_node];
 	double latency = latency_matrix[src_node][dst_node];
 	double latency = latency_matrix[src_node][dst_node];
-	struct starpu_machine_topology_s *topology = &_starpu_get_machine_config()->topology;
+	struct starpu_machine_topology *topology = &_starpu_get_machine_config()->topology;
 
 
 	return latency + (size/bandwidth)*2*(topology->ncudagpus+topology->nopenclgpus);
 	return latency + (size/bandwidth)*2*(topology->ncudagpus+topology->nopenclgpus);
 }
 }

文件差異過大導致無法顯示
+ 366 - 221
src/core/perfmodel/perfmodel_history.c


+ 16 - 17
src/core/perfmodel/regression.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -112,12 +112,13 @@ static double test_r(double c, unsigned n, unsigned *x, double *y)
 	return r;
 	return r;
 }
 }
 
 
-static unsigned find_list_size(struct starpu_history_list_t *list_history)
+static unsigned find_list_size(struct starpu_history_list *list_history)
 {
 {
 	unsigned cnt = 0;
 	unsigned cnt = 0;
 
 
-	struct starpu_history_list_t *ptr = list_history;
-	while (ptr) {
+	struct starpu_history_list *ptr = list_history;
+	while (ptr)
+	{
 		cnt++;
 		cnt++;
 		ptr = ptr->next;
 		ptr = ptr->next;
 	}
 	}
@@ -138,12 +139,13 @@ static double find_list_min(double *y, unsigned n)
 	return min;
 	return min;
 }
 }
 
 
-static void dump_list(unsigned *x, double *y, struct starpu_history_list_t *list_history)
+static void dump_list(unsigned *x, double *y, struct starpu_history_list *list_history)
 {
 {
-	struct starpu_history_list_t *ptr = list_history;
+	struct starpu_history_list *ptr = list_history;
 	unsigned i = 0;
 	unsigned i = 0;
 
 
-	while (ptr) {
+	while (ptr)
+	{
 		x[i] = ptr->entry->size;
 		x[i] = ptr->entry->size;
 		y[i] = ptr->entry->mean;
 		y[i] = ptr->entry->mean;
 
 
@@ -153,11 +155,11 @@ static void dump_list(unsigned *x, double *y, struct starpu_history_list_t *list
 }
 }
 
 
 
 
-/* y = ax^b + c 
+/* y = ax^b + c
  * 	return 0 if success, -1 otherwise
  * 	return 0 if success, -1 otherwise
  * 	if success, a, b and c are modified
  * 	if success, a, b and c are modified
  * */
  * */
-int _starpu_regression_non_linear_power(struct starpu_history_list_t *ptr, double *a, double *b, double *c)
+int _starpu_regression_non_linear_power(struct starpu_history_list *ptr, double *a, double *b, double *c)
 {
 {
 	unsigned n = find_list_size(ptr);
 	unsigned n = find_list_size(ptr);
 
 
@@ -171,7 +173,7 @@ int _starpu_regression_non_linear_power(struct starpu_history_list_t *ptr, doubl
 
 
 	double cmin = 0.0;
 	double cmin = 0.0;
 	double cmax = find_list_min(y, n);
 	double cmax = find_list_min(y, n);
-	
+
 	unsigned iter;
 	unsigned iter;
 
 
 	double err = 100000.0;
 	double err = 100000.0;
@@ -180,7 +182,7 @@ int _starpu_regression_non_linear_power(struct starpu_history_list_t *ptr, doubl
 	{
 	{
 		double c1, c2;
 		double c1, c2;
 		double r1, r2;
 		double r1, r2;
-		
+
 		double radius = 0.01;
 		double radius = 0.01;
 
 
 		c1 = cmin + (0.5-radius)*(cmax - cmin);
 		c1 = cmin + (0.5-radius)*(cmax - cmin);
@@ -197,23 +199,21 @@ int _starpu_regression_non_linear_power(struct starpu_history_list_t *ptr, doubl
 		{
 		{
 			cmax = (cmin + cmax)/2;
 			cmax = (cmin + cmax)/2;
 		}
 		}
-		else {
+		else
+		{
 			/* 2 is better */
 			/* 2 is better */
 			cmin = (cmin + cmax)/2;
 			cmin = (cmin + cmax)/2;
 		}
 		}
 
 
 		if (fabs(err - STARPU_MIN(err1, err2)) < EPS)
 		if (fabs(err - STARPU_MIN(err1, err2)) < EPS)
-		{
-			err = STARPU_MIN(err1, err2);
 			break;
 			break;
-		}
 
 
 		err = STARPU_MIN(err1, err2);
 		err = STARPU_MIN(err1, err2);
 	}
 	}
 
 
 	*c = (cmin + cmax)/2;
 	*c = (cmin + cmax)/2;
 
 
-	*b = compute_b(*c, n, x, y); 
+	*b = compute_b(*c, n, x, y);
 	*a = exp(compute_a(*c, *b, n, x, y));
 	*a = exp(compute_a(*c, *b, n, x, y));
 
 
 	free(x);
 	free(x);
@@ -221,4 +221,3 @@ int _starpu_regression_non_linear_power(struct starpu_history_list_t *ptr, doubl
 
 
 	return 0;
 	return 0;
 }
 }
-

+ 1 - 1
src/core/perfmodel/regression.h

@@ -24,6 +24,6 @@
 #include <core/perfmodel/perfmodel.h>
 #include <core/perfmodel/perfmodel.h>
 #include <starpu.h>
 #include <starpu.h>
 
 
-int _starpu_regression_non_linear_power(struct starpu_history_list_t *ptr, double *a, double *b, double *c);
+int _starpu_regression_non_linear_power(struct starpu_history_list *ptr, double *a, double *b, double *c);
 
 
 #endif // __REGRESSION_H__ 
 #endif // __REGRESSION_H__ 

+ 13 - 12
src/core/progress_hook.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010  Université de Bordeaux 1
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -21,7 +21,8 @@
 
 
 #define NMAXHOOKS	16
 #define NMAXHOOKS	16
 
 
-struct progression_hook {
+struct progression_hook
+{
 	unsigned (*func)(void *arg);
 	unsigned (*func)(void *arg);
 	void *arg;
 	void *arg;
 	unsigned active;
 	unsigned active;
@@ -36,7 +37,7 @@ static int active_hook_cnt = 0;
 int starpu_progression_hook_register(unsigned (*func)(void *arg), void *arg)
 int starpu_progression_hook_register(unsigned (*func)(void *arg), void *arg)
 {
 {
 	int hook;
 	int hook;
-	PTHREAD_RWLOCK_WRLOCK(&progression_hook_rwlock);
+	_STARPU_PTHREAD_RWLOCK_WRLOCK(&progression_hook_rwlock);
 	for (hook = 0; hook < NMAXHOOKS; hook++)
 	for (hook = 0; hook < NMAXHOOKS; hook++)
 	{
 	{
 		if (!hooks[hook].active)
 		if (!hooks[hook].active)
@@ -47,13 +48,13 @@ int starpu_progression_hook_register(unsigned (*func)(void *arg), void *arg)
 			hooks[hook].active = 1;
 			hooks[hook].active = 1;
 			active_hook_cnt++;
 			active_hook_cnt++;
 
 
-			PTHREAD_RWLOCK_UNLOCK(&progression_hook_rwlock);
-			
+			_STARPU_PTHREAD_RWLOCK_UNLOCK(&progression_hook_rwlock);
+
 			return hook;
 			return hook;
 		}
 		}
 	}
 	}
 
 
-	PTHREAD_RWLOCK_UNLOCK(&progression_hook_rwlock);
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&progression_hook_rwlock);
 
 
 	starpu_wake_all_blocked_workers();
 	starpu_wake_all_blocked_workers();
 
 
@@ -63,22 +64,22 @@ int starpu_progression_hook_register(unsigned (*func)(void *arg), void *arg)
 
 
 void starpu_progression_hook_deregister(int hook_id)
 void starpu_progression_hook_deregister(int hook_id)
 {
 {
-	PTHREAD_RWLOCK_WRLOCK(&progression_hook_rwlock);
+	_STARPU_PTHREAD_RWLOCK_WRLOCK(&progression_hook_rwlock);
 
 
 	if (hooks[hook_id].active)
 	if (hooks[hook_id].active)
 		active_hook_cnt--;
 		active_hook_cnt--;
 
 
 	hooks[hook_id].active = 0;
 	hooks[hook_id].active = 0;
 
 
-	PTHREAD_RWLOCK_UNLOCK(&progression_hook_rwlock);
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&progression_hook_rwlock);
 }
 }
 
 
 unsigned _starpu_execute_registered_progression_hooks(void)
 unsigned _starpu_execute_registered_progression_hooks(void)
 {
 {
 	/* If there is no hook registered, we short-cut loop. */
 	/* If there is no hook registered, we short-cut loop. */
-	PTHREAD_RWLOCK_RDLOCK(&progression_hook_rwlock);
+	_STARPU_PTHREAD_RWLOCK_RDLOCK(&progression_hook_rwlock);
 	int no_hook = (active_hook_cnt == 0);
 	int no_hook = (active_hook_cnt == 0);
-	PTHREAD_RWLOCK_UNLOCK(&progression_hook_rwlock);
+	_STARPU_PTHREAD_RWLOCK_UNLOCK(&progression_hook_rwlock);
 
 
 	if (no_hook)
 	if (no_hook)
 		return 1;
 		return 1;
@@ -92,9 +93,9 @@ unsigned _starpu_execute_registered_progression_hooks(void)
 	{
 	{
 		unsigned active;
 		unsigned active;
 
 
-		PTHREAD_RWLOCK_RDLOCK(&progression_hook_rwlock);
+		_STARPU_PTHREAD_RWLOCK_RDLOCK(&progression_hook_rwlock);
 		active = hooks[hook].active;
 		active = hooks[hook].active;
-		PTHREAD_RWLOCK_UNLOCK(&progression_hook_rwlock);
+		_STARPU_PTHREAD_RWLOCK_UNLOCK(&progression_hook_rwlock);
 
 
 		unsigned may_block_hook = 1;
 		unsigned may_block_hook = 1;
 
 

+ 211 - 72
src/core/sched_policy.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010-2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  INRIA
  * Copyright (C) 2011  INRIA
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -36,17 +36,17 @@ int starpu_get_prefetch_flag(void)
  *	Predefined policies
  *	Predefined policies
  */
  */
 
 
-/* extern struct starpu_sched_policy_s _starpu_sched_ws_policy; */
-/* extern struct starpu_sched_policy_s _starpu_sched_prio_policy; */
-/* extern struct starpu_sched_policy_s _starpu_sched_random_policy; */
-/* extern struct starpu_sched_policy_s _starpu_sched_dm_policy; */
-/* extern struct starpu_sched_policy_s _starpu_sched_dmda_policy; */
-/* extern struct starpu_sched_policy_s _starpu_sched_dmda_ready_policy; */
-/* extern struct starpu_sched_policy_s _starpu_sched_dmda_sorted_policy; */
-/* extern struct starpu_sched_policy_s _starpu_sched_eager_policy; */
-/* extern struct starpu_sched_policy_s _starpu_sched_parallel_heft_policy; */
-/* extern struct starpu_sched_policy_s _starpu_sched_pgreedy_policy; */
-extern struct starpu_sched_policy_s heft_policy;
+/* extern struct starpu_sched_policy _starpu_sched_ws_policy; */
+/* extern struct starpu_sched_policy _starpu_sched_prio_policy; */
+/* extern struct starpu_sched_policy _starpu_sched_random_policy; */
+/* extern struct starpu_sched_policy _starpu_sched_dm_policy; */
+/* extern struct starpu_sched_policy _starpu_sched_dmda_policy; */
+/* extern struct starpu_sched_policy _starpu_sched_dmda_ready_policy; */
+/* extern struct starpu_sched_policy _starpu_sched_dmda_sorted_policy; */
+/* extern struct starpu_sched_policy _starpu_sched_eager_policy; */
+/* extern struct starpu_sched_policy _starpu_sched_parallel_heft_policy; */
+/* extern struct starpu_sched_policy _starpu_sched_pgreedy_policy; */
+extern struct starpu_sched_policy heft_policy;
 
 
 static struct starpu_sched_policy_s *predefined_policies[] = {
 static struct starpu_sched_policy_s *predefined_policies[] = {
 	/* &_starpu_sched_ws_policy, */
 	/* &_starpu_sched_ws_policy, */
@@ -62,7 +62,7 @@ static struct starpu_sched_policy_s *predefined_policies[] = {
 	/* &_starpu_sched_pgreedy_policy */
 	/* &_starpu_sched_pgreedy_policy */
 };
 };
 
 
-struct starpu_sched_policy_s *_starpu_get_sched_policy(struct starpu_sched_ctx *sched_ctx)
+struct starpu_sched_policy *_starpu_get_sched_policy(struct starpu_sched_ctx *sched_ctx)
 {
 {
 	return sched_ctx->sched_policy;
 	return sched_ctx->sched_policy;
 }
 }
@@ -71,7 +71,7 @@ struct starpu_sched_policy_s *_starpu_get_sched_policy(struct starpu_sched_ctx *
  *	Methods to initialize the scheduling policy
  *	Methods to initialize the scheduling policy
  */
  */
 
 
-static void load_sched_policy(struct starpu_sched_policy_s *sched_policy, struct starpu_sched_ctx *sched_ctx)
+static void load_sched_policy(struct starpu_sched_policy *sched_policy, struct starpu_sched_ctx *sched_ctx)
 {
 {
 	STARPU_ASSERT(sched_policy);
 	STARPU_ASSERT(sched_policy);
 
 
@@ -91,6 +91,7 @@ static void load_sched_policy(struct starpu_sched_policy_s *sched_policy, struct
 	policy->deinit_sched = sched_policy->deinit_sched;
 	policy->deinit_sched = sched_policy->deinit_sched;
 	policy->push_task = sched_policy->push_task;
 	policy->push_task = sched_policy->push_task;
 	policy->pop_task = sched_policy->pop_task;
 	policy->pop_task = sched_policy->pop_task;
+	policy->pre_exec_hook = sched_policy->pre_exec_hook;
 	policy->post_exec_hook = sched_policy->post_exec_hook;
 	policy->post_exec_hook = sched_policy->post_exec_hook;
 	policy->pop_every_task = sched_policy->pop_every_task;
 	policy->pop_every_task = sched_policy->pop_every_task;
 	policy->push_task_notify = sched_policy->push_task_notify;
 	policy->push_task_notify = sched_policy->push_task_notify;
@@ -99,20 +100,20 @@ static void load_sched_policy(struct starpu_sched_policy_s *sched_policy, struct
 	policy->remove_workers = sched_policy->remove_workers;
 	policy->remove_workers = sched_policy->remove_workers;
 }
 }
 
 
-static struct starpu_sched_policy_s *find_sched_policy_from_name(const char *policy_name)
+static struct starpu_sched_policy *find_sched_policy_from_name(const char *policy_name)
 {
 {
-
 	if (!policy_name)
 	if (!policy_name)
 		return NULL;
 		return NULL;
 
 
 	unsigned i;
 	unsigned i;
 	for (i = 0; i < sizeof(predefined_policies)/sizeof(predefined_policies[0]); i++)
 	for (i = 0; i < sizeof(predefined_policies)/sizeof(predefined_policies[0]); i++)
 	{
 	{
-		struct starpu_sched_policy_s *p;
+		struct starpu_sched_policy *p;
 		p = predefined_policies[i];
 		p = predefined_policies[i];
 		if (p->policy_name)
 		if (p->policy_name)
 		{
 		{
-			if (strcmp(policy_name, p->policy_name) == 0) {
+			if (strcmp(policy_name, p->policy_name) == 0)
+			{
 				/* we found a policy with the requested name */
 				/* we found a policy with the requested name */
 				return p;
 				return p;
 			}
 			}
@@ -127,23 +128,24 @@ static struct starpu_sched_policy_s *find_sched_policy_from_name(const char *pol
 static void display_sched_help_message(void)
 static void display_sched_help_message(void)
 {
 {
 	const char *sched_env = getenv("STARPU_SCHED");
 	const char *sched_env = getenv("STARPU_SCHED");
-	if (sched_env && (strcmp(sched_env, "help") == 0)) {
+	if (sched_env && (strcmp(sched_env, "help") == 0))
+	{
 		fprintf(stderr, "STARPU_SCHED can be either of\n");
 		fprintf(stderr, "STARPU_SCHED can be either of\n");
 
 
 		/* display the description of all predefined policies */
 		/* display the description of all predefined policies */
 		unsigned i;
 		unsigned i;
 		for (i = 0; i < sizeof(predefined_policies)/sizeof(predefined_policies[0]); i++)
 		for (i = 0; i < sizeof(predefined_policies)/sizeof(predefined_policies[0]); i++)
 		{
 		{
-			struct starpu_sched_policy_s *p;
+			struct starpu_sched_policy *p;
 			p = predefined_policies[i];
 			p = predefined_policies[i];
 			fprintf(stderr, "%s\t-> %s\n", p->policy_name, p->policy_description);
 			fprintf(stderr, "%s\t-> %s\n", p->policy_name, p->policy_description);
 		}
 		}
 	 }
 	 }
 }
 }
 
 
-static struct starpu_sched_policy_s *select_sched_policy(struct starpu_machine_config_s *config, const char *policy_name)
+static struct starpu_sched_policy *select_sched_policy(struct _starpu_machine_config *config)
 {
 {
-	struct starpu_sched_policy_s *selected_policy = NULL;
+	struct starpu_sched_policy *selected_policy = NULL;
 	struct starpu_conf *user_conf = config->user_conf;
 	struct starpu_conf *user_conf = config->user_conf;
 
 
 	/* First, we check whether the application explicitely gave a scheduling policy or not */
 	/* First, we check whether the application explicitely gave a scheduling policy or not */
@@ -152,19 +154,12 @@ static struct starpu_sched_policy_s *select_sched_policy(struct starpu_machine_c
 
 
 	/* Otherwise, we look if the application specified the name of a policy to load */
 	/* Otherwise, we look if the application specified the name of a policy to load */
 	const char *sched_pol_name;
 	const char *sched_pol_name;
-	if (user_conf && (user_conf->sched_policy_name))
-	{
+	sched_pol_name = getenv("STARPU_SCHED");
+	if (sched_pol_name == NULL && user_conf && user_conf->sched_policy_name)
 		sched_pol_name = user_conf->sched_policy_name;
 		sched_pol_name = user_conf->sched_policy_name;
-	}
-	else {
-		sched_pol_name = getenv("STARPU_SCHED");
-	}
 
 
 	if (sched_pol_name)
 	if (sched_pol_name)
 		selected_policy = find_sched_policy_from_name(sched_pol_name);
 		selected_policy = find_sched_policy_from_name(sched_pol_name);
-	else
-		if(policy_name)
-			selected_policy = find_sched_policy_from_name(policy_name);
 
 
 	/* Perhaps there was no policy that matched the name */
 	/* Perhaps there was no policy that matched the name */
 	if (selected_policy)
 	if (selected_policy)
@@ -175,7 +170,7 @@ static struct starpu_sched_policy_s *select_sched_policy(struct starpu_machine_c
 	return &heft_policy;
 	return &heft_policy;
 }
 }
 
 
-void _starpu_init_sched_policy(struct starpu_machine_config_s *config, struct starpu_sched_ctx *sched_ctx, const char *policy_name)
+void _starpu_init_sched_policy(struct starpu_machine_config *config, struct starpu_sched_ctx *sched_ctx)
 {
 {
 	/* Perhaps we have to display some help */
 	/* Perhaps we have to display some help */
 	display_sched_help_message();
 	display_sched_help_message();
@@ -187,19 +182,16 @@ void _starpu_init_sched_policy(struct starpu_machine_config_s *config, struct st
 
 
 	/* By default, we don't calibrate */
 	/* By default, we don't calibrate */
 	unsigned do_calibrate = 0;
 	unsigned do_calibrate = 0;
-	if (config->user_conf && (config->user_conf->calibrate != -1))
-	{
-		do_calibrate = config->user_conf->calibrate;
-	}
-	else {
-		int res = starpu_get_env_number("STARPU_CALIBRATE");
-		do_calibrate =  (res < 0)?0:(unsigned)res;
-	}
+	int res = starpu_get_env_number("STARPU_CALIBRATE");
+	if (res == -1 && config->user_conf)
+		res = config->user_conf->calibrate;
+
+	do_calibrate = (res < 0)?0:(unsigned)res;
 
 
 	_starpu_set_calibrate_flag(do_calibrate);
 	_starpu_set_calibrate_flag(do_calibrate);
 
 
-	struct starpu_sched_policy_s *selected_policy;
-	selected_policy = select_sched_policy(config, policy_name);
+	struct starpu_sched_policy *selected_policy;
+	selected_policy = select_sched_policy(config);
 
 
 	load_sched_policy(selected_policy, sched_ctx);
 	load_sched_policy(selected_policy, sched_ctx);
 
 
@@ -208,7 +200,7 @@ void _starpu_init_sched_policy(struct starpu_machine_config_s *config, struct st
 
 
 void _starpu_deinit_sched_policy(struct starpu_sched_ctx *sched_ctx)
 void _starpu_deinit_sched_policy(struct starpu_sched_ctx *sched_ctx)
 {
 {
-        struct starpu_sched_policy_s *policy = sched_ctx->sched_policy;
+	struct starpu_sched_policy_s *policy = sched_ctx->sched_policy;
 	if (policy->deinit_sched)
 	if (policy->deinit_sched)
 		policy->deinit_sched(sched_ctx->id);
 		policy->deinit_sched(sched_ctx->id);
 }
 }
@@ -224,9 +216,9 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 	/* Is this a basic worker or a combined worker ? */
 	/* Is this a basic worker or a combined worker ? */
 	int is_basic_worker = (workerid < nbasic_workers);
 	int is_basic_worker = (workerid < nbasic_workers);
 
 
-	unsigned memory_node; 
-	struct starpu_worker_s *worker = NULL;
-	struct starpu_combined_worker_s *combined_worker = NULL;
+	unsigned memory_node;
+	struct _starpu_worker *worker = NULL;
+	struct _starpu_combined_worker *combined_worker = NULL;
 
 
 	if (is_basic_worker)
 	if (is_basic_worker)
 	{
 	{
@@ -254,9 +246,34 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 	
 	
 	if (is_basic_worker)
 	if (is_basic_worker)
 	{
 	{
+		unsigned node = starpu_worker_get_memory_node(workerid);
+		if (_starpu_task_uses_multiformat_handles(task))
+		{
+			unsigned i;
+			for (i = 0; i < task->cl->nbuffers; i++)
+			{
+				struct starpu_task *conversion_task;
+				starpu_data_handle_t handle;
+
+				handle = task->handles[i];
+				if (!_starpu_handle_needs_conversion_task(handle, node))
+					continue;
+
+				conversion_task = _starpu_create_conversion_task(handle, node);
+				conversion_task->mf_skip = 1;
+				conversion_task->execute_on_a_specific_worker = 1;
+				conversion_task->workerid = workerid;
+				_starpu_task_submit_conversion_task(conversion_task, workerid);
+				//_STARPU_DEBUG("Pushing a conversion task\n");
+			}
+
+			for (i = 0; i < task->cl->nbuffers; i++)
+				task->handles[i]->mf_node = node;
+		}
 		return _starpu_push_local_task(worker, task, 0);
 		return _starpu_push_local_task(worker, task, 0);
 	}
 	}
-	else {
+	else
+	{
 		/* This is a combined worker so we create task aliases */
 		/* This is a combined worker so we create task aliases */
 		int worker_size = combined_worker->worker_size;
 		int worker_size = combined_worker->worker_size;
 		int *combined_workerid = combined_worker->combined_workerid;
 		int *combined_workerid = combined_worker->combined_workerid;
@@ -264,13 +281,13 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 		int ret = 0;
 		int ret = 0;
 		int i;
 		int i;
 
 
-		starpu_job_t j = _starpu_get_job_associated_to_task(task);
+		struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
 		j->task_size = worker_size;
 		j->task_size = worker_size;
 		j->combined_workerid = workerid;
 		j->combined_workerid = workerid;
 		j->active_task_alias_count = 0;
 		j->active_task_alias_count = 0;
 
 
-		PTHREAD_BARRIER_INIT(&j->before_work_barrier, NULL, worker_size);
-		PTHREAD_BARRIER_INIT(&j->after_work_barrier, NULL, worker_size);
+		_STARPU_PTHREAD_BARRIER_INIT(&j->before_work_barrier, NULL, worker_size);
+		_STARPU_PTHREAD_BARRIER_INIT(&j->after_work_barrier, NULL, worker_size);
 
 
 		for (i = 0; i < worker_size; i++)
 		for (i = 0; i < worker_size; i++)
 		{
 		{
@@ -304,7 +321,7 @@ static int _starpu_nworkers_able_to_execute_task(struct starpu_task *task, struc
 }
 }
 
 
 /* the generic interface that call the proper underlying implementation */
 /* the generic interface that call the proper underlying implementation */
-int _starpu_push_task(starpu_job_t j, unsigned job_is_already_locked)
+int _starpu_push_task(struct _starpu_job *j)
 {
 {
 	struct starpu_task *task = j->task;
 	struct starpu_task *task = j->task;
 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
@@ -340,6 +357,7 @@ int _starpu_push_task(starpu_job_t j, unsigned job_is_already_locked)
 
 
         _STARPU_LOG_IN();
         _STARPU_LOG_IN();
 
 
+	_starpu_increment_nready_tasks();
 	task->status = STARPU_TASK_READY;
 	task->status = STARPU_TASK_READY;
 	_starpu_profiling_set_task_push_start_time(task);
 	_starpu_profiling_set_task_push_start_time(task);
 
 
@@ -348,7 +366,7 @@ int _starpu_push_task(starpu_job_t j, unsigned job_is_already_locked)
 	 * corresponding dependencies */
 	 * corresponding dependencies */
 	if (task->cl == NULL)
 	if (task->cl == NULL)
 	{
 	{
-		_starpu_handle_job_termination(j, job_is_already_locked, -1);
+		_starpu_handle_job_termination(j, -1);
                 _STARPU_LOG_OUT_TAG("handle_job_termination");
                 _STARPU_LOG_OUT_TAG("handle_job_termination");
 		return 0;
 		return 0;
 	}
 	}
@@ -358,7 +376,7 @@ int _starpu_push_task(starpu_job_t j, unsigned job_is_already_locked)
 	{
 	{
 		ret = _starpu_push_task_on_specific_worker(task, task->workerid);
 		ret = _starpu_push_task_on_specific_worker(task, task->workerid);
 	}
 	}
-	else 
+	else
 	{
 	{
 		STARPU_ASSERT(sched_ctx->sched_policy->push_task);
 		STARPU_ASSERT(sched_ctx->sched_policy->push_task);
 
 
@@ -366,7 +384,7 @@ int _starpu_push_task(starpu_job_t j, unsigned job_is_already_locked)
 		if(ret == -1)
 		if(ret == -1)
 		{
 		{
 			printf("repush task \n");
 			printf("repush task \n");
-			ret = _starpu_push_task(j, job_is_already_locked);
+			ret = _starpu_push_task(j);
 		}
 		}
 	}
 	}
 
 
@@ -376,21 +394,88 @@ int _starpu_push_task(starpu_job_t j, unsigned job_is_already_locked)
         return ret;
         return ret;
 }
 }
 
 
-struct starpu_task *_starpu_pop_task(struct starpu_worker_s *worker)
+/*
+ * Given a handle that needs to be converted in order to be used on the given
+ * node, returns a task that takes care of the conversion.
+ */
+struct starpu_task *_starpu_create_conversion_task(starpu_data_handle_t handle,
+						   unsigned int node)
+{
+	struct starpu_task *conversion_task;
+	struct starpu_multiformat_interface *format_interface;
+	enum starpu_node_kind node_kind;
+
+	conversion_task = starpu_task_create();
+	conversion_task->synchronous = 0;
+	conversion_task->handles[0] = handle;
+
+	/* The node does not really matter here */
+	format_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, 0);
+	node_kind = starpu_node_get_kind(node);
+
+	handle->refcnt++;
+	handle->busy_count++;
+
+	struct starpu_multiformat_data_interface_ops *mf_ops;
+	mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
+	switch(node_kind)
+	{
+	case STARPU_CPU_RAM:
+		switch (starpu_node_get_kind(handle->mf_node))
+		{
+		case STARPU_CPU_RAM:
+			STARPU_ASSERT(0);
+#ifdef STARPU_USE_CUDA
+		case STARPU_CUDA_RAM:
+			conversion_task->cl = mf_ops->cuda_to_cpu_cl;
+			break;
+#endif
+#ifdef STARPU_USE_OPENCL
+		case STARPU_OPENCL_RAM:
+			conversion_task->cl = mf_ops->opencl_to_cpu_cl;
+			break;
+#endif
+		default:
+			fprintf(stderr, "Oops : %u\n", handle->mf_node);
+			STARPU_ASSERT(0);
+		}
+		break;
+#ifdef STARPU_USE_CUDA
+	case STARPU_CUDA_RAM:
+		conversion_task->cl = mf_ops->cpu_to_cuda_cl;
+		break;
+#endif
+#ifdef STARPU_USE_OPENCL
+	case STARPU_OPENCL_RAM:
+		conversion_task->cl = mf_ops->cpu_to_opencl_cl;
+		break;
+#endif
+	case STARPU_SPU_LS: /* Not supported */
+	default:
+		STARPU_ASSERT(0);
+	}
+
+	conversion_task->cl->modes[0] = STARPU_RW;
+	return conversion_task;
+}
+
+struct starpu_task *_starpu_pop_task(struct _starpu_worker *worker)
 {
 {
 	struct starpu_task *task;
 	struct starpu_task *task;
+	int worker_id;
+	unsigned node;
 
 
 	/* We can't tell in advance which task will be picked up, so we measure
 	/* We can't tell in advance which task will be picked up, so we measure
 	 * a timestamp, and will attribute it afterwards to the task. */
 	 * a timestamp, and will attribute it afterwards to the task. */
 	int profiling = starpu_profiling_status_get();
 	int profiling = starpu_profiling_status_get();
 	struct timespec pop_start_time;
 	struct timespec pop_start_time;
 	if (profiling)
 	if (profiling)
-		starpu_clock_gettime(&pop_start_time);
-	
-	PTHREAD_MUTEX_LOCK(&worker->sched_mutex);
+		_starpu_clock_gettime(&pop_start_time);
+pick:
+	_STARPU_PTHREAD_MUTEX_LOCK(&worker->sched_mutex);
 	/* perhaps there is some local task to be executed first */
 	/* perhaps there is some local task to be executed first */
 	task = _starpu_pop_local_task(worker);
 	task = _starpu_pop_local_task(worker);
-	PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex);
 	
 	
 	
 	
 	/* get tasks from the stacks of the strategy */
 	/* get tasks from the stacks of the strategy */
@@ -409,21 +494,68 @@ struct starpu_task *_starpu_pop_task(struct starpu_worker_s *worker)
 				sched_ctx_mutex = _starpu_get_sched_mutex(sched_ctx, worker->workerid);
 				sched_ctx_mutex = _starpu_get_sched_mutex(sched_ctx, worker->workerid);
 				if(sched_ctx_mutex != NULL)
 				if(sched_ctx_mutex != NULL)
 				{
 				{
-					PTHREAD_MUTEX_LOCK(sched_ctx_mutex);
+					_STARPU_PTHREAD_MUTEX_LOCK(sched_ctx_mutex);
 					if (sched_ctx->sched_policy && sched_ctx->sched_policy->pop_task)
 					if (sched_ctx->sched_policy && sched_ctx->sched_policy->pop_task)
 					{
 					{
 						task = sched_ctx->sched_policy->pop_task();
 						task = sched_ctx->sched_policy->pop_task();
-						PTHREAD_MUTEX_UNLOCK(sched_ctx_mutex);
+						_STARPU_PTHREAD_MUTEX_UNLOCK(sched_ctx_mutex);
 						break;
 						break;
 					}
 					}
-					PTHREAD_MUTEX_UNLOCK(sched_ctx_mutex);
+					_STARPU_PTHREAD_MUTEX_UNLOCK(sched_ctx_mutex);
 				}
 				}
 			}
 			}
 		}
 		}
 	  }
 	  }
 
 
-	/* Note that we may get a NULL task in case the scheduler was unlocked
-	 * for some reason. */
+	if (!task)
+		goto profiling;
+
+	/* Make sure we do not bother with all the multiformat-specific code if 
+	 * it is not necessary. */
+	if (!_starpu_task_uses_multiformat_handles(task))
+		goto profiling;
+
+
+	/* This is either a conversion task, or a regular task for which the
+	 * conversion tasks have already been created and submitted */
+	if (task->mf_skip)
+		goto profiling;
+
+	worker_id = starpu_worker_get_id();
+	if (!starpu_worker_can_execute_task(worker_id, task, 0))
+		return task;
+
+	node = starpu_worker_get_memory_node(worker_id);
+
+	/*
+	 * We do have a task that uses multiformat handles. Let's create the 
+	 * required conversion tasks.
+	 */
+	unsigned i;
+	for (i = 0; i < task->cl->nbuffers; i++)
+	{
+		struct starpu_task *conversion_task;
+		starpu_data_handle_t handle;
+
+		handle = task->handles[i];
+		if (!_starpu_handle_needs_conversion_task(handle, node))
+			continue;
+		conversion_task = _starpu_create_conversion_task(handle, node);
+		conversion_task->mf_skip = 1;
+		conversion_task->execute_on_a_specific_worker = 1;
+		conversion_task->workerid = worker_id;
+		/*
+		 * Next tasks will need to know where these handles have gone.
+		 */
+		handle->mf_node = node;
+		_starpu_task_submit_conversion_task(conversion_task, worker_id);
+	}
+
+	task->mf_skip = 1;
+	starpu_task_list_push_front(&worker->local_tasks, task);
+	goto pick;
+
+profiling:
 	if (profiling && task)
 	if (profiling && task)
 	{
 	{
 		struct starpu_task_profiling_info *profiling_info;
 		struct starpu_task_profiling_info *profiling_info;
@@ -436,7 +568,7 @@ struct starpu_task *_starpu_pop_task(struct starpu_worker_s *worker)
 		{
 		{
 			memcpy(&profiling_info->pop_start_time,
 			memcpy(&profiling_info->pop_start_time,
 				&pop_start_time, sizeof(struct timespec));
 				&pop_start_time, sizeof(struct timespec));
-			starpu_clock_gettime(&profiling_info->pop_end_time);
+			_starpu_clock_gettime(&profiling_info->pop_end_time);
 		}
 		}
 	}
 	}
 
 
@@ -473,6 +605,13 @@ struct starpu_task *_starpu_pop_every_task(struct starpu_sched_ctx *sched_ctx)
 	return sched_ctx->sched_policy->pop_every_task();
 	return sched_ctx->sched_policy->pop_every_task();
 }
 }
 
 
+void _starpu_sched_pre_exec_hook(struct starpu_task *task)
+{
+	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
+	if (sched_ctx->sched_policy->pre_exec_hook)
+		sched_ctx->sched_policy->pre_exec_hook(task);
+}
+
 void _starpu_sched_post_exec_hook(struct starpu_task *task)
 void _starpu_sched_post_exec_hook(struct starpu_task *task)
 {
 {
 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
@@ -489,20 +628,21 @@ void _starpu_sched_post_exec_hook(struct starpu_task *task)
 
 
 void _starpu_wait_on_sched_event(void)
 void _starpu_wait_on_sched_event(void)
 {
 {
- 	struct starpu_worker_s *worker = _starpu_get_local_worker_key();
+	struct _starpu_worker *worker = _starpu_get_local_worker_key();
 
 
-	PTHREAD_MUTEX_LOCK(&worker->sched_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(worker->sched_mutex);
 
 
 	_starpu_handle_all_pending_node_data_requests(worker->memory_node);
 	_starpu_handle_all_pending_node_data_requests(worker->memory_node);
 
 
 	if (_starpu_machine_is_running())
 	if (_starpu_machine_is_running())
 	{
 	{
 #ifndef STARPU_NON_BLOCKING_DRIVERS
 #ifndef STARPU_NON_BLOCKING_DRIVERS
-		pthread_cond_wait(&worker->sched_cond, &worker->sched_mutex);
+		_STARPU_PTHREAD_COND_WAIT(worker->sched_cond,
+					  worker->sched_mutex);
 #endif
 #endif
 	}
 	}
 
 
-	PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(worker->sched_mutex);
 }
 }
 
 
 /* The scheduling policy may put tasks directly into a worker's local queue so
 /* The scheduling policy may put tasks directly into a worker's local queue so
@@ -512,9 +652,8 @@ void _starpu_wait_on_sched_event(void)
  * a FIFO ordering. */
  * a FIFO ordering. */
 int starpu_push_local_task(int workerid, struct starpu_task *task, int back)
 int starpu_push_local_task(int workerid, struct starpu_task *task, int back)
 {
 {
-	struct starpu_worker_s *worker = _starpu_get_worker_struct(workerid);
+	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
 
 
 	return _starpu_push_local_task(worker, task, back);
 	return _starpu_push_local_task(worker, task, back);
 }
 }
 
 
-

+ 12 - 7
src/core/sched_policy.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
  * Copyright (C) 2011  INRIA
  * Copyright (C) 2011  INRIA
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -23,21 +23,26 @@
 #include <core/sched_ctx.h>
 #include <core/sched_ctx.h>
 #include <starpu_scheduler.h>
 #include <starpu_scheduler.h>
 
 
-struct starpu_machine_config_s;
-struct starpu_sched_policy_s *_starpu_get_sched_policy( struct starpu_sched_ctx *sched_ctx);
+struct starpu_machine_config;
+struct starpu_sched_policy *_starpu_get_sched_policy( struct starpu_sched_ctx *sched_ctx);
 
 
-void _starpu_init_sched_policy(struct starpu_machine_config_s *config, 
-			       struct starpu_sched_ctx *sched_ctx, const char *policy_name);
+void _starpu_init_sched_policy(struct starpu_machine_config *config, 
+			       struct starpu_sched_ctx *sched_ctx);
 
 
 void _starpu_deinit_sched_policy(struct starpu_sched_ctx *sched_ctx);
 void _starpu_deinit_sched_policy(struct starpu_sched_ctx *sched_ctx);
 
 
-int _starpu_push_task(starpu_job_t task, unsigned job_is_already_locked);
+int _starpu_push_task(struct _starpu_job *task);
 /* pop a task that can be executed on the worker */
 /* pop a task that can be executed on the worker */
-struct starpu_task *_starpu_pop_task(struct starpu_worker_s *worker);
+struct starpu_task *_starpu_pop_task(struct _starpu_worker *worker);
 /* pop every task that can be executed on the worker */
 /* pop every task that can be executed on the worker */
 struct starpu_task *_starpu_pop_every_task(struct starpu_sched_ctx *sched_ctx);
 struct starpu_task *_starpu_pop_every_task(struct starpu_sched_ctx *sched_ctx);
 void _starpu_sched_post_exec_hook(struct starpu_task *task);
 void _starpu_sched_post_exec_hook(struct starpu_task *task);
 
 
 void _starpu_wait_on_sched_event(void);
 void _starpu_wait_on_sched_event(void);
 
 
+struct starpu_task *_starpu_create_conversion_task(starpu_data_handle_t handle,
+						   unsigned int node);
+
+void _starpu_sched_pre_exec_hook(struct starpu_task *task);
+
 #endif // __SCHED_POLICY_H__
 #endif // __SCHED_POLICY_H__

+ 426 - 78
src/core/task.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  INRIA
  * Copyright (C) 2011  INRIA
  *
  *
@@ -19,22 +19,24 @@
 
 
 #include <starpu.h>
 #include <starpu.h>
 #include <starpu_profiling.h>
 #include <starpu_profiling.h>
-#include <starpu_task_bundle.h>
 #include <core/workers.h>
 #include <core/workers.h>
 #include <core/sched_ctx.h>
 #include <core/sched_ctx.h>
 #include <core/jobs.h>
 #include <core/jobs.h>
 #include <core/task.h>
 #include <core/task.h>
+#include <core/task_bundle.h>
 #include <common/config.h>
 #include <common/config.h>
 #include <common/utils.h>
 #include <common/utils.h>
 #include <profiling/profiling.h>
 #include <profiling/profiling.h>
 #include <profiling/bound.h>
 #include <profiling/bound.h>
+#include <math.h>
+#include <string.h>
 
 
 /* XXX this should be reinitialized when StarPU is shutdown (or we should make
 /* XXX this should be reinitialized when StarPU is shutdown (or we should make
  * sure that no task remains !) */
  * sure that no task remains !) */
 /* TODO we could make this hierarchical to avoid contention ? */
 /* TODO we could make this hierarchical to avoid contention ? */
 static pthread_cond_t submitted_cond = PTHREAD_COND_INITIALIZER;
 static pthread_cond_t submitted_cond = PTHREAD_COND_INITIALIZER;
 static pthread_mutex_t submitted_mutex = PTHREAD_MUTEX_INITIALIZER;
 static pthread_mutex_t submitted_mutex = PTHREAD_MUTEX_INITIALIZER;
-static long int nsubmitted = 0;
+static long int nsubmitted = 0, nready = 0;
 
 
 static void _starpu_increment_nsubmitted_tasks(void);
 static void _starpu_increment_nsubmitted_tasks(void);
 
 
@@ -76,10 +78,11 @@ void starpu_task_init(struct starpu_task *task)
 
 
 	task->profiling_info = NULL;
 	task->profiling_info = NULL;
 
 
-	task->predicted = -1.0;
+	task->predicted = NAN;
+	task->predicted_transfer = NAN;
 
 
 	task->starpu_private = NULL;
 	task->starpu_private = NULL;
-
+	task->magic = 42;
 	task->sched_ctx = _starpu_get_initial_sched_ctx()->id;
 	task->sched_ctx = _starpu_get_initial_sched_ctx()->id;
 	
 	
 	task->control_task = 0;
 	task->control_task = 0;
@@ -103,19 +106,11 @@ void starpu_task_deinit(struct starpu_task *task)
 	}
 	}
 
 
 	/* If case the task is (still) part of a bundle */
 	/* If case the task is (still) part of a bundle */
-	struct starpu_task_bundle *bundle = task->bundle;
+	starpu_task_bundle_t bundle = task->bundle;
 	if (bundle)
 	if (bundle)
-	{
-		PTHREAD_MUTEX_LOCK(&bundle->mutex);
-		int ret = starpu_task_bundle_remove(bundle, task);
-
-		/* Perhaps the bundle was destroyed when removing the last
-		 * entry */
-		if (ret != 1)
-			PTHREAD_MUTEX_UNLOCK(&bundle->mutex);
-	}
+		starpu_task_bundle_remove(bundle, task);
 
 
-	starpu_job_t j = (struct starpu_job_s *)task->starpu_private;
+	struct _starpu_job *j = (struct _starpu_job *)task->starpu_private;
 
 
 	if (j)
 	if (j)
 		_starpu_job_destroy(j);
 		_starpu_job_destroy(j);
@@ -140,100 +135,220 @@ struct starpu_task * __attribute__((malloc)) starpu_task_create(void)
  * called automatically after the execution of a task by setting the "destroy"
  * called automatically after the execution of a task by setting the "destroy"
  * flag of the starpu_task structure (default behaviour). Calling this function
  * flag of the starpu_task structure (default behaviour). Calling this function
  * on a statically allocated task results in an undefined behaviour. */
  * on a statically allocated task results in an undefined behaviour. */
-void starpu_task_destroy(struct starpu_task *task)
+void _starpu_task_destroy(struct starpu_task *task)
 {
 {
-	STARPU_ASSERT(task);
 
 
    /* If starpu_task_destroy is called in a callback, we just set the destroy
    /* If starpu_task_destroy is called in a callback, we just set the destroy
       flag. The task will be destroyed after the callback returns */
       flag. The task will be destroyed after the callback returns */
-   if (task == starpu_get_current_task()
-       && _starpu_get_local_worker_status() == STATUS_CALLBACK) {
+   if (task == starpu_task_get_current()
+       && _starpu_get_local_worker_status() == STATUS_CALLBACK)
+   {
 
 
-      task->destroy = 1;
+	   task->destroy = 1;
 
 
-   } else {
-
-      starpu_task_deinit(task);
-
-      /* TODO handle the case of task with detach = 1 and destroy = 1 */
-      /* TODO handle the case of non terminated tasks -> return -EINVAL */
-	
-      free(task);
+   }
+   else
+   {
+	   starpu_task_deinit(task);
+	   /* TODO handle the case of task with detach = 1 and destroy = 1 */
+	   /* TODO handle the case of non terminated tasks -> return -EINVAL */
+	   free(task);
    }
    }
 }
 }
 
 
+void starpu_task_destroy(struct starpu_task *task)
+{
+	STARPU_ASSERT(task);
+	STARPU_ASSERT_MSG(!task->destroy || !task->detach, "starpu_task_destroy must not be called for task with destroy = 1 and detach = 1");
+	_starpu_task_destroy(task);
+}
+
 int starpu_task_wait(struct starpu_task *task)
 int starpu_task_wait(struct starpu_task *task)
 {
 {
         _STARPU_LOG_IN();
         _STARPU_LOG_IN();
 	STARPU_ASSERT(task);
 	STARPU_ASSERT(task);
 
 
-	if (task->detach || task->synchronous) {
+	STARPU_ASSERT_MSG(!task->detach, "starpu_task_wait can only be called on tasks with detach = 0");
+
+	if (task->detach || task->synchronous)
+	{
 		_STARPU_DEBUG("Task is detached or asynchronous. Waiting returns immediately\n");
 		_STARPU_DEBUG("Task is detached or asynchronous. Waiting returns immediately\n");
 		_STARPU_LOG_OUT_TAG("einval");
 		_STARPU_LOG_OUT_TAG("einval");
 		return -EINVAL;
 		return -EINVAL;
 	}
 	}
 
 
-	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls())) {
+	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
+	{
 		_STARPU_LOG_OUT_TAG("edeadlk");
 		_STARPU_LOG_OUT_TAG("edeadlk");
 		return -EDEADLK;
 		return -EDEADLK;
 	}
 	}
 
 
-	starpu_job_t j = (struct starpu_job_s *)task->starpu_private;
+	struct _starpu_job *j = (struct _starpu_job *)task->starpu_private;
 
 
 	_starpu_wait_job(j);
 	_starpu_wait_job(j);
 
 
 	/* as this is a synchronous task, the liberation of the job
 	/* as this is a synchronous task, the liberation of the job
 	   structure was deferred */
 	   structure was deferred */
 	if (task->destroy)
 	if (task->destroy)
-		free(task);
+		_starpu_task_destroy(task);
 
 
         _STARPU_LOG_OUT();
         _STARPU_LOG_OUT();
 	return 0;
 	return 0;
 }
 }
 
 
-starpu_job_t _starpu_get_job_associated_to_task(struct starpu_task *task)
+struct _starpu_job *_starpu_get_job_associated_to_task(struct starpu_task *task)
 {
 {
 	STARPU_ASSERT(task);
 	STARPU_ASSERT(task);
 
 
 	if (!task->starpu_private)
 	if (!task->starpu_private)
 	{
 	{
-		starpu_job_t j = _starpu_job_create(task);
+		struct _starpu_job *j = _starpu_job_create(task);
 		task->starpu_private = j;
 		task->starpu_private = j;
 	}
 	}
 
 
-	return (struct starpu_job_s *)task->starpu_private;
+	return (struct _starpu_job *)task->starpu_private;
 }
 }
 
 
 /* NB in case we have a regenerable task, it is possible that the job was
 /* NB in case we have a regenerable task, it is possible that the job was
  * already counted. */
  * already counted. */
-int _starpu_submit_job(starpu_job_t j, unsigned do_not_increment_nsubmitted)
+int _starpu_submit_job(struct _starpu_job *j)
 {
 {
         _STARPU_LOG_IN();
         _STARPU_LOG_IN();
 	/* notify bound computation of a new task */
 	/* notify bound computation of a new task */
 	_starpu_bound_record(j);
 	_starpu_bound_record(j);
 
 
-	j->terminated = 0;
+	_starpu_increment_nsubmitted_tasks();
+	_starpu_increment_nsubmitted_tasks_of_sched_ctx(j->task->sched_ctx);
 
 
-	if (!do_not_increment_nsubmitted){
-		_starpu_increment_nsubmitted_tasks();
-		_starpu_increment_nsubmitted_tasks_of_sched_ctx(j->task->sched_ctx);
-	}
+	_STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
 
 
-	PTHREAD_MUTEX_LOCK(&j->sync_mutex);
-	
+	/* Need to atomically set submitted to 1 and check dependencies, since
+	 * this is concucrent with _starpu_notify_cg */
+	j->terminated = 0;
 	j->submitted = 1;
 	j->submitted = 1;
-       
-	int ret = _starpu_enforce_deps_and_schedule(j, 1);
 
 
+	int ret = _starpu_enforce_deps_and_schedule(j);
 	PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
 	PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
 
 
         _STARPU_LOG_OUT();
         _STARPU_LOG_OUT();
         return ret;
         return ret;
 }
 }
 
 
+void _starpu_codelet_check_deprecated_fields(struct starpu_codelet *cl)
+{
+	if (!cl)
+		return;
+
+	int is_where_unset = cl->where == 0;
+
+	/* Check deprecated and unset fields (where, <device>_func,
+ 	 * <device>_funcs) */
+
+	/* CPU */
+	if (cl->cpu_func && cl->cpu_func != STARPU_MULTIPLE_CPU_IMPLEMENTATIONS && cl->cpu_funcs[0])
+	{
+		fprintf(stderr, "[warning] [struct starpu_codelet] both cpu_func and cpu_funcs are set. Ignoring cpu_func.\n");
+		cl->cpu_func = STARPU_MULTIPLE_CPU_IMPLEMENTATIONS;
+	}
+	if (cl->cpu_func && cl->cpu_func != STARPU_MULTIPLE_CPU_IMPLEMENTATIONS)
+	{
+		cl->cpu_funcs[0] = cl->cpu_func;
+		cl->cpu_func = STARPU_MULTIPLE_CPU_IMPLEMENTATIONS;
+	}
+	if (cl->cpu_funcs[0] && cl->cpu_func == 0)
+	{
+		cl->cpu_func = STARPU_MULTIPLE_CPU_IMPLEMENTATIONS;
+	}
+	if (cl->cpu_funcs[0] && is_where_unset)
+	{
+		cl->where |= STARPU_CPU;
+	}
+
+	/* CUDA */
+	if (cl->cuda_func && cl->cuda_func != STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS && cl->cuda_funcs[0])
+	{
+		fprintf(stderr, "[warning] [struct starpu_codelet] both cuda_func and cuda_funcs are set. Ignoring cuda_func.\n");
+		cl->cuda_func = STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS;
+	}
+	if (cl->cuda_func && cl->cuda_func != STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS)
+	{
+		cl->cuda_funcs[0] = cl->cuda_func;
+		cl->cuda_func = STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS;
+	}
+	if (cl->cuda_funcs[0] && cl->cuda_func == 0)
+	{
+		cl->cuda_func = STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS;
+	}
+	if (cl->cuda_funcs[0] && is_where_unset)
+	{
+		cl->where |= STARPU_CUDA;
+	}
+
+	/* OpenCL */
+	if (cl->opencl_func && cl->opencl_func != STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS && cl->opencl_funcs[0])
+	{
+		fprintf(stderr, "[warning] [struct starpu_codelet] both opencl_func and opencl_funcs are set. Ignoring opencl_func.\n");
+		cl->opencl_func = STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS;
+	}
+	if (cl->opencl_func && cl->opencl_func != STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS)
+	{
+		cl->opencl_funcs[0] = cl->opencl_func;
+		cl->opencl_func = STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS;
+	}
+	if (cl->opencl_funcs[0] && cl->opencl_func == 0)
+	{
+		cl->opencl_func = STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS;
+	}
+	if (cl->opencl_funcs[0] && is_where_unset)
+	{
+		cl->where |= STARPU_OPENCL;
+	}
+
+	/* Gordon */
+	if (cl->gordon_func && cl->gordon_func != STARPU_MULTIPLE_GORDON_IMPLEMENTATIONS)
+	{
+		cl->gordon_funcs[0] = cl->gordon_func;
+		cl->gordon_func = STARPU_MULTIPLE_GORDON_IMPLEMENTATIONS;
+	}
+	if (cl->gordon_funcs[0] && cl->gordon_func == 0)
+	{
+		cl->gordon_func = STARPU_MULTIPLE_GORDON_IMPLEMENTATIONS;
+	}
+	if (cl->gordon_funcs[0] && is_where_unset)
+	{
+		cl->where = STARPU_GORDON;
+	}
+}
+
+void _starpu_task_check_deprecated_fields(struct starpu_task *task)
+{
+	if (task->cl)
+	{
+		unsigned i;
+		for(i=0; i<task->cl->nbuffers ; i++)
+		{
+			if (task->buffers[i].handle && task->handles[i])
+			{
+				fprintf(stderr, "[warning][struct starpu_task] task->buffers[%u] and task->handles[%u] both set. Ignoring task->buffers[%u] ?\n", i, i, i);
+				STARPU_ASSERT(task->buffers[i].mode == task->cl->modes[i]);
+				STARPU_ABORT();
+			}
+			if (task->buffers[i].handle)
+			{
+				task->handles[i] = task->buffers[i].handle;
+				task->cl->modes[i] = task->buffers[i].mode;
+			}
+			task->buffers[i].handle = NULL;
+			task->buffers[i].mode = STARPU_NONE;
+		}
+	}
+}
+
 /* application should submit new tasks to StarPU through this function */
 /* application should submit new tasks to StarPU through this function */
 int starpu_task_submit(struct starpu_task *task)
 int starpu_task_submit(struct starpu_task *task)
 {
 {
+	STARPU_ASSERT(task);
+	STARPU_ASSERT(task->magic == 42);
 	unsigned nsched_ctxs = _starpu_get_nsched_ctxs();
 	unsigned nsched_ctxs = _starpu_get_nsched_ctxs();
 
 
 	task->sched_ctx = (nsched_ctxs == 1 || task->control_task) ? 
 	task->sched_ctx = (nsched_ctxs == 1 || task->control_task) ? 
@@ -246,7 +361,8 @@ int starpu_task_submit(struct starpu_task *task)
 	{
 	{
 		/* Perhaps it is not possible to submit a synchronous
 		/* Perhaps it is not possible to submit a synchronous
 		 * (blocking) task */
 		 * (blocking) task */
-                if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls())) {
+                if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
+		{
                         _STARPU_LOG_OUT_TAG("EDEADLK");
                         _STARPU_LOG_OUT_TAG("EDEADLK");
 			return -EDEADLK;
 			return -EDEADLK;
                 }
                 }
@@ -254,26 +370,33 @@ int starpu_task_submit(struct starpu_task *task)
 		task->detach = 0;
 		task->detach = 0;
 	}
 	}
 
 
-	STARPU_ASSERT(task);
+	_starpu_task_check_deprecated_fields(task);
+	_starpu_codelet_check_deprecated_fields(task->cl);
 
 
 	if (task->cl)
 	if (task->cl)
 	{
 	{
-		uint32_t where = task->cl->where;
 		unsigned i;
 		unsigned i;
-		if (!_starpu_worker_exists(where)) {
+
+		/* Check the type of worker(s) required by the task exist */
+		if (!_starpu_worker_exists(task))
+		{
                         _STARPU_LOG_OUT_TAG("ENODEV");
                         _STARPU_LOG_OUT_TAG("ENODEV");
 			return -ENODEV;
 			return -ENODEV;
                 }
                 }
-		assert(task->cl->nbuffers <= STARPU_NMAXBUFS);
-		for (i = 0; i < task->cl->nbuffers; i++) {
+
+		/* Check buffers */
+		STARPU_ASSERT(task->cl->nbuffers <= STARPU_NMAXBUFS);
+		for (i = 0; i < task->cl->nbuffers; i++)
+		{
 			/* Make sure handles are not partitioned */
 			/* Make sure handles are not partitioned */
-			assert(task->buffers[i].handle->nchildren == 0);
+			STARPU_ASSERT(task->handles[i]->nchildren == 0);
 		}
 		}
 
 
 		/* In case we require that a task should be explicitly
 		/* In case we require that a task should be explicitly
 		 * executed on a specific worker, we make sure that the worker
 		 * executed on a specific worker, we make sure that the worker
 		 * is able to execute this task.  */
 		 * is able to execute this task.  */
-		if (task->execute_on_a_specific_worker && !starpu_combined_worker_may_execute_task(task->workerid, task, 0)) {
+		if (task->execute_on_a_specific_worker && !starpu_combined_worker_can_execute_task(task->workerid, task, 0))
+		{
                         _STARPU_LOG_OUT_TAG("ENODEV");
                         _STARPU_LOG_OUT_TAG("ENODEV");
 			return -ENODEV;
 			return -ENODEV;
                 }
                 }
@@ -300,17 +423,21 @@ int starpu_task_submit(struct starpu_task *task)
 
 
 
 
 	if (profiling)
 	if (profiling)
-		starpu_clock_gettime(&info->submit_time);
+		_starpu_clock_gettime(&info->submit_time);
 
 
-	/* internally, StarPU manipulates a starpu_job_t which is a wrapper around a
+	/* internally, StarPU manipulates a struct _starpu_job * which is a wrapper around a
 	* task structure, it is possible that this job structure was already
 	* task structure, it is possible that this job structure was already
 	* allocated, for instance to enforce task dependencies. */
 	* allocated, for instance to enforce task dependencies. */
-	starpu_job_t j = _starpu_get_job_associated_to_task(task);
+	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
 
 
-	ret = _starpu_submit_job(j, 0);
+	ret = _starpu_submit_job(j);
 
 
 	if (is_sync)
 	if (is_sync)
+	{
 		_starpu_wait_job(j);
 		_starpu_wait_job(j);
+		if (task->destroy)
+		     _starpu_task_destroy(task);
+	}
 
 
         _STARPU_LOG_OUT();
         _STARPU_LOG_OUT();
 	return ret;
 	return ret;
@@ -322,16 +449,115 @@ int _starpu_task_submit_internal(struct starpu_task *task)
 	return starpu_task_submit(task);
 	return starpu_task_submit(task);
 }
 }
 
 
-void starpu_display_codelet_stats(struct starpu_codelet_t *cl)
+/* Submission path reserved for the StarPU core: the task is handed directly
+ * to _starpu_push_task() and dependencies are skipped completely (the caller
+ * is expected to know what it is doing).
+ *
+ * Returns whatever _starpu_push_task() returns. */
+int _starpu_task_submit_nodeps(struct starpu_task *task)
+{
+	struct _starpu_job *job;
+	unsigned buf;
+
+	_starpu_task_check_deprecated_fields(task);
+	_starpu_codelet_check_deprecated_fields(task->cl);
+
+	if (task->cl && task->cl->model)
+		_starpu_load_perfmodel(task->cl->model);
+
+	if (task->cl && task->cl->power_model)
+		_starpu_load_perfmodel(task->cl->power_model);
+
+	job = _starpu_get_job_associated_to_task(task);
+	_starpu_increment_nsubmitted_tasks();
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&job->sync_mutex);
+
+	job->submitted = 1;
+
+	if (task->cl)
+	{
+		/* Data dependency checking would normally fill this in;
+		 * since it is skipped here, copy handles and modes by hand */
+		for (buf = 0; buf < task->cl->nbuffers; buf++)
+		{
+			job->ordered_buffers[buf].handle = job->task->handles[buf];
+			job->ordered_buffers[buf].mode = job->task->cl->modes[buf];
+		}
+	}
+
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&job->sync_mutex);
+
+	return _starpu_push_task(job);
+}
+
+/*
+ * Queue a conversion task at the front of the local task list of worker
+ * 'workerid'. The task must have a codelet and must have
+ * execute_on_a_specific_worker set (both are asserted). Always returns 0.
+ *
+ * worker->sched_mutex must be locked when calling this function.
+ */
+int _starpu_task_submit_conversion_task(struct starpu_task *task,
+					unsigned int workerid)
+{
+	struct _starpu_job *job;
+	unsigned buf;
+	unsigned memnode;
+
+	STARPU_ASSERT(task->cl);
+	STARPU_ASSERT(task->execute_on_a_specific_worker);
+
+	_starpu_task_check_deprecated_fields(task);
+	_starpu_codelet_check_deprecated_fields(task->cl);
+
+	/* We should factorize that */
+	if (task->cl->model)
+		_starpu_load_perfmodel(task->cl->model);
+
+	if (task->cl->power_model)
+		_starpu_load_perfmodel(task->cl->power_model);
+
+	job = _starpu_get_job_associated_to_task(task);
+	_starpu_increment_nsubmitted_tasks();
+	_STARPU_PTHREAD_MUTEX_LOCK(&job->sync_mutex);
+	job->submitted = 1;
+	_starpu_increment_nready_tasks();
+
+	/* Copy handles and access modes into the job's ordered buffers */
+	for (buf = 0; buf < task->cl->nbuffers; buf++)
+	{
+		job->ordered_buffers[buf].handle = job->task->handles[buf];
+		job->ordered_buffers[buf].mode = job->task->cl->modes[buf];
+	}
+
+        _STARPU_LOG_IN();
+
+	task->status = STARPU_TASK_READY;
+	_starpu_profiling_set_task_push_start_time(task);
+
+	/* Prefetch the task input on the worker's memory node if enabled */
+	memnode = starpu_worker_get_memory_node(workerid);
+	if (starpu_get_prefetch_flag())
+		starpu_prefetch_task_input_on_node(task, memnode);
+
+	starpu_task_list_push_front(&_starpu_get_worker_struct(workerid)->local_tasks, task);
+
+	_starpu_profiling_set_task_push_end_time(task);
+
+        _STARPU_LOG_OUT();
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&job->sync_mutex);
+
+	return 0;
+}
+
+/* Reset every byte of the codelet structure pointed to by cl to zero. */
+void starpu_codelet_init(struct starpu_codelet *cl)
+{
+	memset(cl, 0, sizeof(*cl));
+}
+
+void starpu_display_codelet_stats(struct starpu_codelet *cl)
 {
 {
 	unsigned worker;
 	unsigned worker;
 	unsigned nworkers = starpu_worker_get_count();
 	unsigned nworkers = starpu_worker_get_count();
 
 
-	if (cl->model && cl->model->symbol)
+	if (cl->name)
+		fprintf(stderr, "Statistics for codelet %s\n", cl->name);
+	else if (cl->model && cl->model->symbol)
 		fprintf(stderr, "Statistics for codelet %s\n", cl->model->symbol);
 		fprintf(stderr, "Statistics for codelet %s\n", cl->model->symbol);
 
 
 	unsigned long total = 0;
 	unsigned long total = 0;
-	
+
 	for (worker = 0; worker < nworkers; worker++)
 	for (worker = 0; worker < nworkers; worker++)
 		total += cl->per_worker_stats[worker];
 		total += cl->per_worker_stats[worker];
 
 
@@ -355,42 +581,72 @@ int starpu_task_wait_for_all(void)
 	unsigned sched_ctx = nsched_ctxs == 1 ? 0 : starpu_get_sched_ctx();
 	unsigned sched_ctx = nsched_ctxs == 1 ? 0 : starpu_get_sched_ctx();
 	starpu_wait_for_all_tasks_of_sched_ctx(sched_ctx);
 	starpu_wait_for_all_tasks_of_sched_ctx(sched_ctx);
 
 
-/* 	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls())) */
-/* 		return -EDEADLK; */
+	return 0;
+}
 
 
-/* 	PTHREAD_MUTEX_LOCK(&submitted_mutex); */
+/*
+ * We wait until there is no ready task any more (i.e. StarPU will not be able
+ * to progress any more).
+ */
+int starpu_task_wait_for_no_ready(void)
+{
+	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
+		return -EDEADLK;
 
 
-/* 	STARPU_TRACE_TASK_WAIT_FOR_ALL; */
+	_STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
+
+	_STARPU_TRACE_TASK_WAIT_FOR_ALL;
+
+	while (nready > 0)
+		_STARPU_PTHREAD_COND_WAIT(&submitted_cond, &submitted_mutex);
+
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
 
 
-/* 	while (nsubmitted > 0) */
-/* 		PTHREAD_COND_WAIT(&submitted_cond, &submitted_mutex); */
-	
-/* 	PTHREAD_MUTEX_UNLOCK(&submitted_mutex); */
 	return 0;
 	return 0;
 }
 }
 
 
 void _starpu_decrement_nsubmitted_tasks(void)
 void _starpu_decrement_nsubmitted_tasks(void)
 {
 {
-	PTHREAD_MUTEX_LOCK(&submitted_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
 
 
 	if (--nsubmitted == 0)
 	if (--nsubmitted == 0)
-		PTHREAD_COND_BROADCAST(&submitted_cond);
+		_STARPU_PTHREAD_COND_BROADCAST(&submitted_cond);
 
 
-	STARPU_TRACE_UPDATE_TASK_CNT(nsubmitted);
+	_STARPU_TRACE_UPDATE_TASK_CNT(nsubmitted);
 
 
-	PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
 
 
 }
 }
 
 
 static void _starpu_increment_nsubmitted_tasks(void)
 static void _starpu_increment_nsubmitted_tasks(void)
 {
 {
-	PTHREAD_MUTEX_LOCK(&submitted_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
 
 
 	nsubmitted++;
 	nsubmitted++;
 
 
-	STARPU_TRACE_UPDATE_TASK_CNT(nsubmitted);
+	_STARPU_TRACE_UPDATE_TASK_CNT(nsubmitted);
+
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
+}
+
+/* Count one more ready task. The nready counter is only touched while
+ * submitted_mutex is held; nothing is signalled when it goes up. */
+void _starpu_increment_nready_tasks(void)
+{
+	_STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
+	nready += 1;
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
+}
+
+void _starpu_decrement_nready_tasks(void)
+{
+	_STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
+
+	if (--nready == 0)
+		_STARPU_PTHREAD_COND_BROADCAST(&submitted_cond);
+
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
 
 
-	PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
 }
 }
 
 
 void _starpu_initialize_current_task_key(void)
 void _starpu_initialize_current_task_key(void)
@@ -401,7 +657,7 @@ void _starpu_initialize_current_task_key(void)
 /* Return the task currently executed by the worker, or NULL if this is called
 /* Return the task currently executed by the worker, or NULL if this is called
  * either from a thread that is not a task or simply because there is no task
  * either from a thread that is not a task or simply because there is no task
  * being executed at the moment. */
  * being executed at the moment. */
-struct starpu_task *starpu_get_current_task(void)
+struct starpu_task *starpu_task_get_current(void)
 {
 {
 	return (struct starpu_task *) pthread_getspecific(current_task_key);
 	return (struct starpu_task *) pthread_getspecific(current_task_key);
 }
 }
@@ -410,3 +666,95 @@ void _starpu_set_current_task(struct starpu_task *task)
 {
 {
 	pthread_setspecific(current_task_key, task);
 	pthread_setspecific(current_task_key, task);
 }
 }
+
+/*
+ * Returns 1 as soon as one of the task's handles is reported as a multiformat
+ * handle by _starpu_data_is_multiformat_handle(), 0 if none is.
+ * task->cl is dereferenced without being checked.
+ */
+int
+_starpu_task_uses_multiformat_handles(struct starpu_task *task)
+{
+	unsigned idx = 0;
+
+	while (idx < task->cl->nbuffers)
+	{
+		if (_starpu_data_is_multiformat_handle(task->handles[idx]))
+			return 1;
+
+		idx++;
+	}
+
+	return 0;
+}
+
+/*
+ * Checks whether the given handle needs to be converted in order to be used on
+ * the node given as the second argument.
+ *
+ * Returns 1 when a conversion is needed and 0 when it is not. Unsupported
+ * node kinds trip STARPU_ASSERT(0); if that assertion does not stop the
+ * program, -EINVAL is returned so that the caller never reads an undefined
+ * value.
+ */
+int
+_starpu_handle_needs_conversion_task(starpu_data_handle_t handle,
+				     unsigned int node)
+{
+	enum starpu_node_kind node_kind;
+
+	node_kind = starpu_node_get_kind(node);
+
+	/*
+	 * Here, we assume that CUDA devices and OpenCL devices use the
+	 * same data structure. A conversion is only needed when moving
+	 * data from a CPU to a GPU, or the other way around.
+	 */
+	switch (node_kind)
+	{
+		case STARPU_CPU_RAM:
+			switch(starpu_node_get_kind(handle->mf_node))
+			{
+				case STARPU_CPU_RAM:
+					return 0;
+				case STARPU_CUDA_RAM:      /* Fall through */
+				case STARPU_OPENCL_RAM:
+					return 1;
+				case STARPU_SPU_LS: /* Not supported */
+				default:
+					STARPU_ASSERT(0);
+					break;
+			}
+			break;
+		case STARPU_CUDA_RAM:    /* Fall through */
+		case STARPU_OPENCL_RAM:
+			switch(starpu_node_get_kind(handle->mf_node))
+			{
+				case STARPU_CPU_RAM:
+					return 1;
+				case STARPU_CUDA_RAM:
+				case STARPU_OPENCL_RAM:
+					return 0;
+				case STARPU_SPU_LS: /* Not supported */
+				default:
+					STARPU_ASSERT(0);
+					break;
+			}
+			break;
+		case STARPU_SPU_LS:            /* Not supported */
+		default:
+			STARPU_ASSERT(0);
+			break;
+	}
+
+	/* Only reached for an unsupported node kind, after the assertions
+	 * above: return a definite error code instead of running off the end
+	 * of a non-void function. */
+	return -EINVAL;
+}
+
+/* Return entry nimpl of cl->cpu_funcs. Asserts that cl->cpu_func holds
+ * STARPU_MULTIPLE_CPU_IMPLEMENTATIONS; nimpl itself is not range-checked. */
+starpu_cpu_func_t _starpu_task_get_cpu_nth_implementation(struct starpu_codelet *cl, unsigned nimpl)
+{
+	starpu_cpu_func_t impl;
+
+	STARPU_ASSERT(cl->cpu_func == STARPU_MULTIPLE_CPU_IMPLEMENTATIONS);
+	impl = cl->cpu_funcs[nimpl];
+
+	return impl;
+}
+
+/* Return entry nimpl of cl->cuda_funcs. Asserts that cl->cuda_func holds
+ * STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS; nimpl itself is not range-checked. */
+starpu_cuda_func_t _starpu_task_get_cuda_nth_implementation(struct starpu_codelet *cl, unsigned nimpl)
+{
+	starpu_cuda_func_t impl;
+
+	STARPU_ASSERT(cl->cuda_func == STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS);
+	impl = cl->cuda_funcs[nimpl];
+
+	return impl;
+}
+
+/* Return entry nimpl of cl->opencl_funcs. Asserts that cl->opencl_func holds
+ * STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS; nimpl itself is not range-checked. */
+starpu_opencl_func_t _starpu_task_get_opencl_nth_implementation(struct starpu_codelet *cl, unsigned nimpl)
+{
+	starpu_opencl_func_t impl;
+
+	STARPU_ASSERT(cl->opencl_func == STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS);
+	impl = cl->opencl_funcs[nimpl];
+
+	return impl;
+}
+
+/* Return entry nimpl of cl->gordon_funcs. Asserts that cl->gordon_func holds
+ * STARPU_MULTIPLE_GORDON_IMPLEMENTATIONS; nimpl itself is not range-checked. */
+starpu_gordon_func_t _starpu_task_get_gordon_nth_implementation(struct starpu_codelet *cl, unsigned nimpl)
+{
+	starpu_gordon_func_t impl;
+
+	STARPU_ASSERT(cl->gordon_func == STARPU_MULTIPLE_GORDON_IMPLEMENTATIONS);
+	impl = cl->gordon_funcs[nimpl];
+
+	return impl;
+}

+ 32 - 4
src/core/task.h

@@ -1,7 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011 INRIA
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -22,9 +23,16 @@
 #include <common/config.h>
 #include <common/config.h>
 #include <core/jobs.h>
 #include <core/jobs.h>
 
 
+/* Internal version of starpu_task_destroy: don't check task->destroy flag */
+void _starpu_task_destroy(struct starpu_task *task);
+
 /* In order to implement starpu_task_wait_for_all, we keep track of the number of
 /* In order to implement starpu_task_wait_for_all, we keep track of the number of
  * task currently submitted */
  * task currently submitted */
 void _starpu_decrement_nsubmitted_tasks(void);
 void _starpu_decrement_nsubmitted_tasks(void);
+/* In order to implement starpu_task_wait_for_no_ready, we keep track of the number of
+ * tasks currently ready */
+void _starpu_increment_nready_tasks(void);
+void _starpu_decrement_nready_tasks(void);
 
 
 /* A pthread key is used to store the task currently executed on the thread.
 /* A pthread key is used to store the task currently executed on the thread.
  * _starpu_initialize_current_task_key initializes this pthread key and
  * _starpu_initialize_current_task_key initializes this pthread key and
@@ -34,15 +42,35 @@ void _starpu_set_current_task(struct starpu_task *task);
 
 
 /* NB the second argument makes it possible to count regenerable tasks only
 /* NB the second argument makes it possible to count regenerable tasks only
  * once. */
  * once. */
-int _starpu_submit_job(starpu_job_t j, unsigned do_not_increment_nsubmitted);
+int _starpu_submit_job(struct _starpu_job *j);
+
+int _starpu_task_submit_nodeps(struct starpu_task *task);
+
+void _starpu_task_declare_deps_array(struct starpu_task *task, unsigned ndeps, struct starpu_task *task_array[], int check);
 
 
 /* Returns the job structure (which is the internal data structure associated
 /* Returns the job structure (which is the internal data structure associated
  * to a task). */
  * to a task). */
-starpu_job_t _starpu_get_job_associated_to_task(struct starpu_task *task);
+struct _starpu_job *_starpu_get_job_associated_to_task(struct starpu_task *task);
 
 
 struct starpu_task *_starpu_create_task_alias(struct starpu_task *task);
 struct starpu_task *_starpu_create_task_alias(struct starpu_task *task);
 
 
 /* Submits starpu internal tasks to the initial context */
 /* Submits starpu internal tasks to the initial context */
 int _starpu_task_submit_internal(struct starpu_task *task);
 int _starpu_task_submit_internal(struct starpu_task *task);
 
 
+int _starpu_handle_needs_conversion_task(starpu_data_handle_t handle,
+					 unsigned int node);
+
+int _starpu_task_uses_multiformat_handles(struct starpu_task *task);
+
+int _starpu_task_submit_conversion_task(struct starpu_task *task,
+					unsigned int workerid);
+
+void _starpu_task_check_deprecated_fields(struct starpu_task *task);
+void _starpu_codelet_check_deprecated_fields(struct starpu_codelet *cl);
+
+starpu_cpu_func_t _starpu_task_get_cpu_nth_implementation(struct starpu_codelet *cl, unsigned nimpl);
+starpu_cuda_func_t _starpu_task_get_cuda_nth_implementation(struct starpu_codelet *cl, unsigned nimpl);
+starpu_opencl_func_t _starpu_task_get_opencl_nth_implementation(struct starpu_codelet *cl, unsigned nimpl);
+starpu_gordon_func_t _starpu_task_get_gordon_nth_implementation(struct starpu_codelet *cl, unsigned nimpl);
+
 #endif // __CORE_TASK_H__
 #endif // __CORE_TASK_H__

+ 77 - 181
src/core/task_bundle.c

@@ -2,6 +2,7 @@
  *
  *
  * Copyright (C) 2011  Université de Bordeaux 1
  * Copyright (C) 2011  Université de Bordeaux 1
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  Télécom-SudParis
+ * Copyright (C) 2012  Inria
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -17,69 +18,50 @@
 
 
 #include <starpu.h>
 #include <starpu.h>
 #include <starpu_task_bundle.h>
 #include <starpu_task_bundle.h>
+#include <core/task_bundle.h>
 #include <starpu_scheduler.h>
 #include <starpu_scheduler.h>
 #include <common/config.h>
 #include <common/config.h>
 #include <common/utils.h>
 #include <common/utils.h>
 #include <common/list.h>
 #include <common/list.h>
 
 
 /* Initialize a task bundle */
 /* Initialize a task bundle */
-void starpu_task_bundle_init(struct starpu_task_bundle *bundle)
+void starpu_task_bundle_create(starpu_task_bundle_t *bundle)
 {
 {
-	STARPU_ASSERT(bundle);
+	*bundle = (starpu_task_bundle_t) malloc(sizeof(struct _starpu_task_bundle));
+	STARPU_ASSERT(*bundle);
 
 
-	PTHREAD_MUTEX_INIT(&bundle->mutex, NULL);
-	bundle->closed = 0;
+	_STARPU_PTHREAD_MUTEX_INIT(&(*bundle)->mutex, NULL);
+	/* Of course at the beginning a bundle is open,
+	 * user can insert and remove tasks from it */
+	(*bundle)->closed = 0;
 
 
 	/* Start with an empty list */
 	/* Start with an empty list */
-	bundle->previous_workerid = -1;
-	bundle->list = NULL;
+	(*bundle)->list = NULL;
 
 
-	/* By default, bundle are destroyed */
-	bundle->destroy = 1;
-
-}
-
-/* Deinitialize a bundle. In case the destroy flag is set, the bundle structure
- * is freed too. */
-void starpu_task_bundle_deinit(struct starpu_task_bundle *bundle)
-{
-	/* Remove all entries from the bundle (which is likely to be empty) */
-	while (bundle->list)
-	{
-		struct starpu_task_bundle_entry *entry = bundle->list;
-		bundle->list = bundle->list->next;
-		free(entry);
-	}
-
-	PTHREAD_MUTEX_DESTROY(&bundle->mutex);
-
-	if (bundle->destroy)
-		free(bundle);
 }
 }
 
 
-/* Insert a task into a bundle. */
-int starpu_task_bundle_insert(struct starpu_task_bundle *bundle, struct starpu_task *task)
+int starpu_task_bundle_insert(starpu_task_bundle_t bundle, struct starpu_task *task)
 {
 {
-	PTHREAD_MUTEX_LOCK(&bundle->mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&bundle->mutex);
 
 
 	if (bundle->closed)
 	if (bundle->closed)
 	{
 	{
-		/* The bundle is closed, we cannot add tasks anymore */
-		PTHREAD_MUTEX_UNLOCK(&bundle->mutex);
+		/* The bundle is closed, we cannot add tasks anymore */
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&bundle->mutex);
 		return -EPERM;
 		return -EPERM;
 	}
 	}
 
 
 	if (task->status != STARPU_TASK_INVALID)
 	if (task->status != STARPU_TASK_INVALID)
 	{
 	{
-		/* the task has already been submitted, it's too late to put it
+		/* The task has already been submitted, it's too late to put it
 		 * into a bundle now. */
 		 * into a bundle now. */
-		PTHREAD_MUTEX_UNLOCK(&bundle->mutex);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&bundle->mutex);
 		return -EINVAL;
 		return -EINVAL;
 	}
 	}
 
 
 	/* Insert a task at the end of the bundle */
 	/* Insert a task at the end of the bundle */
-	struct starpu_task_bundle_entry *entry;
-	entry = (struct starpu_task_bundle_entry *) malloc(sizeof(struct starpu_task_bundle_entry));
+	struct _starpu_task_bundle_entry *entry;
+	entry = (struct _starpu_task_bundle_entry *) malloc(sizeof(struct _starpu_task_bundle_entry));
 	STARPU_ASSERT(entry);
 	STARPU_ASSERT(entry);
 	entry->task = task;
 	entry->task = task;
 	entry->next = NULL;
 	entry->next = NULL;
@@ -88,8 +70,9 @@ int starpu_task_bundle_insert(struct starpu_task_bundle *bundle, struct starpu_t
 	{
 	{
 		bundle->list = entry;
 		bundle->list = entry;
 	}
 	}
-	else {
-		struct starpu_task_bundle_entry *item;
+	else
+	{
+		struct _starpu_task_bundle_entry *item;
 		item = bundle->list;
 		item = bundle->list;
 		while (item->next)
 		while (item->next)
 			item = item->next;
 			item = item->next;
@@ -97,24 +80,28 @@ int starpu_task_bundle_insert(struct starpu_task_bundle *bundle, struct starpu_t
 		item->next = entry;
 		item->next = entry;
 	}
 	}
 
 
+	/* Mark the task as belonging to the bundle */
 	task->bundle = bundle;
 	task->bundle = bundle;
 
 
-	PTHREAD_MUTEX_UNLOCK(&bundle->mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&bundle->mutex);
 	return 0;
 	return 0;
 }
 }
 
 
-/* Remove a task from a bundle. This method must be called with bundle->mutex
- * hold. This function returns 0 if the task was found, -ENOENT if the element
- * was not found, 1 if the element is found and if the list was deinitialized
- * because it became empty. */
-int starpu_task_bundle_remove(struct starpu_task_bundle *bundle, struct starpu_task *task)
+int starpu_task_bundle_remove(starpu_task_bundle_t bundle, struct starpu_task *task)
 {
 {
-	struct starpu_task_bundle_entry *item;
+	struct _starpu_task_bundle_entry *item;
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&bundle->mutex);
 
 
 	item = bundle->list;
 	item = bundle->list;
 
 
+	/* List is empty, there is no way the task
+	 * belongs to it */
 	if (!item)
 	if (!item)
+	{
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&bundle->mutex);
 		return -ENOENT;
 		return -ENOENT;
+	}
 
 
 	STARPU_ASSERT(task->bundle == bundle);
 	STARPU_ASSERT(task->bundle == bundle);
 	task->bundle = NULL;
 	task->bundle = NULL;
@@ -128,23 +115,27 @@ int starpu_task_bundle_remove(struct starpu_task_bundle *bundle, struct starpu_t
 		/* If the list is now empty, deinitialize the bundle */
 		/* If the list is now empty, deinitialize the bundle */
 		if (bundle->closed && bundle->list == NULL)
 		if (bundle->closed && bundle->list == NULL)
 		{
 		{
-			PTHREAD_MUTEX_UNLOCK(&bundle->mutex);
-			starpu_task_bundle_deinit(bundle);
-			return 1;
+			_STARPU_PTHREAD_MUTEX_UNLOCK(&bundle->mutex);
+			_starpu_task_bundle_destroy(bundle);
+			return 0;
 		}
 		}
 
 
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&bundle->mutex);
 		return 0;
 		return 0;
 	}
 	}
 
 
+	/* Go through the list until we find the right task,
+	 * then we delete it */
 	while (item->next)
 	while (item->next)
 	{
 	{
-		struct starpu_task_bundle_entry *next;
+		struct _starpu_task_bundle_entry *next;
 		next = item->next;
 		next = item->next;
 
 
 		if (next->task == task)
 		if (next->task == task)
 		{
 		{
 			/* Remove the next element */
 			/* Remove the next element */
 			item->next = next->next;
 			item->next = next->next;
+			_STARPU_PTHREAD_MUTEX_UNLOCK(&bundle->mutex);
 			free(next);
 			free(next);
 			return 0;
 			return 0;
 		}
 		}
@@ -152,101 +143,61 @@ int starpu_task_bundle_remove(struct starpu_task_bundle *bundle, struct starpu_t
 		item = next;
 		item = next;
 	}
 	}
 
 
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&bundle->mutex);
+
 	/* We could not find the task in the bundle */
 	/* We could not find the task in the bundle */
 	return -ENOENT;
 	return -ENOENT;
 }
 }
 
 
-/* Close a bundle. No task can be added to a closed bundle. A closed bundle
- * automatically gets deinitialized when it becomes empty. */
-void starpu_task_bundle_close(struct starpu_task_bundle *bundle)
+/* Close a bundle. No task can be added to a closed bundle. Tasks can still be
+ * removed from a closed bundle. A closed bundle automatically gets
+ * deinitialized when it becomes empty. A closed bundle cannot be reopened. */
+void starpu_task_bundle_close(starpu_task_bundle_t bundle)
 {
 {
-	PTHREAD_MUTEX_LOCK(&bundle->mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&bundle->mutex);
 
 
-	/* If the bundle is already empty, we deinitialize it now. */
+	/* If the bundle is already empty, we deinitialize it now as the
+	 * user closed it and thus doesn't intend to insert new tasks in it. */
 	if (bundle->list == NULL)
 	if (bundle->list == NULL)
 	{
 	{
-		PTHREAD_MUTEX_UNLOCK(&bundle->mutex);
-		starpu_task_bundle_deinit(bundle);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&bundle->mutex);
+		_starpu_task_bundle_destroy(bundle);
 		return;
 		return;
 	}
 	}
 
 
 	/* Mark the bundle as closed */
 	/* Mark the bundle as closed */
 	bundle->closed = 1;
 	bundle->closed = 1;
 
 
-	PTHREAD_MUTEX_UNLOCK(&bundle->mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&bundle->mutex);
 
 
 }
 }
 
 
-/* Return the expected duration of the entire task bundle in µs */
-double starpu_task_bundle_expected_length(struct starpu_task_bundle *bundle,  enum starpu_perf_archtype arch, unsigned nimpl)
+void _starpu_task_bundle_destroy(starpu_task_bundle_t bundle)
 {
 {
-	double expected_length = 0.0;
-
-	/* We expect the length of the bundle the be the sum of the different tasks length. */
-	PTHREAD_MUTEX_LOCK(&bundle->mutex);
-
-	struct starpu_task_bundle_entry *entry;
-	entry = bundle->list;
-
-	while (entry) {
-		double task_length = starpu_task_expected_length(entry->task, arch, nimpl);
-
-		/* In case the task is not calibrated, we consider the task
-		 * ends immediately. */
-		if (task_length > 0.0)
-			expected_length += task_length;
-
-		entry = entry->next;
+	/* Remove all entries from the bundle (which is likely to be empty) */
+	while (bundle->list)
+	{
+		struct _starpu_task_bundle_entry *entry = bundle->list;
+		bundle->list = bundle->list->next;
+		free(entry);
 	}
 	}
-	
-	PTHREAD_MUTEX_UNLOCK(&bundle->mutex);
 
 
-	return expected_length;
-}
-
-/* Return the expected power consumption of the entire task bundle in J */
-double starpu_task_bundle_expected_power(struct starpu_task_bundle *bundle,  enum starpu_perf_archtype arch, unsigned nimpl)
-{
-	double expected_power = 0.0;
+	_STARPU_PTHREAD_MUTEX_DESTROY(&bundle->mutex);
 
 
-	/* We expect total consumption of the bundle the be the sum of the different tasks consumption. */
-	PTHREAD_MUTEX_LOCK(&bundle->mutex);
-
-	struct starpu_task_bundle_entry *entry;
-	entry = bundle->list;
-
-	while (entry) {
-		double task_power = starpu_task_expected_power(entry->task, arch, nimpl);
-
-		/* In case the task is not calibrated, we consider the task
-		 * ends immediately. */
-		if (task_power > 0.0)
-			expected_power += task_power;
-
-		entry = entry->next;
-	}
-	
-	PTHREAD_MUTEX_UNLOCK(&bundle->mutex);
-
-	return expected_power;
+	free(bundle);
 }
 }
 
 
-struct handle_list {
-	starpu_data_handle handle;
-	starpu_access_mode mode;
-	struct handle_list *next;
-};
-
-static void insertion_handle_sorted(struct handle_list **listp, starpu_data_handle handle, starpu_access_mode mode)
+void _insertion_handle_sorted(struct _starpu_handle_list **listp, starpu_data_handle_t handle, enum starpu_access_mode mode)
 {
 {
 	STARPU_ASSERT(listp);
 	STARPU_ASSERT(listp);
 
 
-	struct handle_list *list = *listp;
+	struct _starpu_handle_list *list = *listp;
 
 
+	/* If the list is empty or the handle's address is the smallest in the
+	 * list, we insert it as the first element */
 	if (!list || list->handle > handle)
 	if (!list || list->handle > handle)
 	{
 	{
-		/* We insert the first element of the list */
-		struct handle_list *link = (struct handle_list *) malloc(sizeof(struct handle_list));
+		struct _starpu_handle_list *link = (struct _starpu_handle_list *) malloc(sizeof(struct _starpu_handle_list));
 		STARPU_ASSERT(link);
 		STARPU_ASSERT(link);
 		link->handle = handle;
 		link->handle = handle;
 		link->mode = mode;
 		link->mode = mode;
@@ -255,25 +206,26 @@ static void insertion_handle_sorted(struct handle_list **listp, starpu_data_hand
 		return;
 		return;
 	}
 	}
 
 
-	/* Look for the element or a place to insert it. */
-	struct handle_list *prev = list;
+	struct _starpu_handle_list *prev = list;
 
 
-	while (list && (handle > list->handle))
+	/* Look for the same handle if already present in the list.
+	 * Else place it right before the smallest following handle */
+	while (list && (handle >= list->handle))
 	{
 	{
 		prev = list;
 		prev = list;
 		list = list->next;
 		list = list->next;
 	}
 	}
 
 
-	/* The element should be in prev or not in the list */
-
 	if (prev->handle == handle)
 	if (prev->handle == handle)
 	{
 	{
-		/* The handle is already in the list */
-		prev->mode |= mode;
+		/* The handle is already in the list, so merge both access modes */
+		prev->mode = (enum starpu_access_mode) ((int) prev->mode | (int) mode);
 	}
 	}
-	else {
-		/* The handle was not in the list, we insert it after prev */
-		struct handle_list *link = (struct handle_list *) malloc(sizeof(struct handle_list));
+	else
+	{
+		/* The handle was not in the list, we insert it after 'prev', thus right before
+		 * 'list' which is the smallest following handle */
+		struct _starpu_handle_list *link = (struct _starpu_handle_list *) malloc(sizeof(struct _starpu_handle_list));
 		STARPU_ASSERT(link);
 		STARPU_ASSERT(link);
 		link->handle = handle;
 		link->handle = handle;
 		link->mode = mode;
 		link->mode = mode;
@@ -281,59 +233,3 @@ static void insertion_handle_sorted(struct handle_list **listp, starpu_data_hand
 		prev->next = link;
 		prev->next = link;
 	}
 	}
 }
 }
-
-/* Return the time (in µs) expected to transfer all data used within the bundle */
-double starpu_task_bundle_expected_data_transfer_time(struct starpu_task_bundle *bundle, unsigned memory_node)
-{
-	PTHREAD_MUTEX_LOCK(&bundle->mutex);
-
-	struct handle_list *handles = NULL;
-
-	/* We list all the handle that are accessed within the bundle. */
-
-	/* For each task in the bundle */
-	struct starpu_task_bundle_entry *entry = bundle->list;
-	while (entry) {
-		struct starpu_task *task = entry->task;
-
-		if (task->cl)
-		{
-			unsigned b;
-			for (b = 0; b < task->cl->nbuffers; b++)
-			{
-				starpu_data_handle handle = task->buffers[b].handle;
-				starpu_access_mode mode = task->buffers[b].mode;
-
-				if (!(mode & STARPU_R))
-					continue;
-
-				/* Insert the handle in the sorted list in case
-				 * it's not already in that list. */
-				insertion_handle_sorted(&handles, handle, mode);
-			}
-		}
-
-		entry = entry->next;
-	}
-
-	/* Compute the sum of data transfer time, and destroy the list */
-
-	double total_exp = 0.0;
-
-	while (handles)
-	{
-		struct handle_list *current = handles;
-		handles = handles->next;
-
-		double exp;
-		exp = starpu_data_expected_transfer_time(current->handle, memory_node, current->mode);
-
-		total_exp += exp;
-
-		free(current);
-	}
-
-	PTHREAD_MUTEX_UNLOCK(&bundle->mutex);
-
-	return total_exp;
-}

+ 142 - 0
src/core/task_bundle.h

@@ -0,0 +1,142 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2012 Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __CORE_TASK_BUNDLE_H__
+#define __CORE_TASK_BUNDLE_H__
+
+#if ! defined(_MSC_VER)
+#  include <pthread.h>
+#endif
+
+/* struct _starpu_task_bundle_entry
+ * ================================
+ * Purpose
+ * =======
+ * Structure used to describe a linked list containing tasks in _starpu_task_bundle.
+ *
+ * Fields
+ * ======
+ * task			Pointer to the task structure.
+ *
+ * next			Pointer to the next element in the linked list.
+ */
+
+struct _starpu_task_bundle_entry
+{
+	struct starpu_task *task;
+	struct _starpu_task_bundle_entry *next;
+};
+
+/* struct _starpu_task_bundle
+ * ==========================
+ * Purpose
+ * =======
+ * Structure describing a list of tasks that should be scheduled on the same
+ * worker whenever it's possible.
+ * It must be considered as a hint given to the scheduler as there is no guarantee that
+ * they will be executed on the same worker.
+ *
+ * Fields
+ * ======
+ * mutex		Mutex protecting the structure.
+ *
+ * list			Linked list of tasks included in the bundle.
+ *
+ * closed		Used to know if the user is still willing to
+ * 			add/remove some tasks in the bundle. Especially useful for
+ * 			the runtime to know whether it is safe to destroy a bundle.
+ */
+
+struct _starpu_task_bundle
+{
+	/* Mutex protecting the bundle */
+#if defined(_MSC_VER)
+	void *mutex;
+#else
+	pthread_mutex_t mutex;
+#endif
+
+	struct _starpu_task_bundle_entry *list;
+
+	int closed;
+};
+
+/* struct _starpu_handle_list
+ * ==========================
+ * Purpose
+ * =======
+ * Structure describing a list of handles sorted by address to speed up
+ * lookups of an element.
+ * The list cannot contain duplicate handles.
+ *
+ * Fields
+ * ======
+ * handle		Pointer to the handle structure.
+ *
+ * mode			Total access mode over the whole bundle.
+ *
+ * next			Pointer to the next element in the linked list.
+ */
+
+struct _starpu_handle_list
+{
+	starpu_data_handle_t handle;
+	enum starpu_access_mode mode;
+	struct _starpu_handle_list *next;
+};
+
+/* _starpu_task_bundle_destroy
+ * ==========================
+ * Purpose
+ * =======
+ * Destroy and deinitialize a bundle;
+ * previously allocated memory is freed.
+ *
+ * Arguments
+ * =========
+ * bundle		(input)
+ * 			Bundle to destroy.
+ */
+void _starpu_task_bundle_destroy(starpu_task_bundle_t bundle);
+
+/* _insertion_handle_sorted
+ * ========================
+ * Purpose
+ * =======
+ * Insert a handle in a _starpu_handle_list. Elements are sorted
+ * in increasing order, considering their physical address.
+ * As the list doesn't accept duplicate elements, a handle with the
+ * same address as a handle contained in the list is not inserted, but
+ * its access mode is merged with the one of the latter.
+ *
+ * Arguments
+ * =========
+ * listp		(input, output)
+ * 			Pointer to the first element of the list.
+ * 			In the case of an empty list or an inserted handle with small address,
+ * 			it should have changed when the call returns.
+ *
+ * handle		(input)
+ * 			Handle to insert in the list.
+ *
+ * mode			(input)
+ * 			Access mode of the handle.
+ */
+void _insertion_handle_sorted(struct _starpu_handle_list **listp, starpu_data_handle_t handle, enum starpu_access_mode mode);
+
+#endif // __CORE_TASK_BUNDLE_H__

+ 240 - 163
src/core/topology.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  INRIA
  * Copyright (C) 2011  INRIA
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -23,7 +23,7 @@
 #include <core/debug.h>
 #include <core/debug.h>
 #include <core/topology.h>
 #include <core/topology.h>
 #include <drivers/cuda/driver_cuda.h>
 #include <drivers/cuda/driver_cuda.h>
-#include <common/hash.h>
+#include <starpu_hash.h>
 #include <profiling/profiling.h>
 #include <profiling/profiling.h>
 
 
 #ifdef STARPU_HAVE_HWLOC
 #ifdef STARPU_HAVE_HWLOC
@@ -43,18 +43,17 @@
 #define hwloc_bitmap_singlify hwloc_cpuset_singlify
 #define hwloc_bitmap_singlify hwloc_cpuset_singlify
 #endif
 #endif
 
 
-		
 static unsigned topology_is_initialized = 0;
 static unsigned topology_is_initialized = 0;
 
 
-static void _starpu_initialize_workers_bindid(struct starpu_machine_config_s *config);
+static void _starpu_initialize_workers_bindid(struct _starpu_machine_config *config);
 
 
 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
 #  ifdef STARPU_USE_CUDA
 #  ifdef STARPU_USE_CUDA
-static void _starpu_initialize_workers_cuda_gpuid(struct starpu_machine_config_s *config);
-static struct starpu_htbl32_node_s *devices_using_cuda = NULL;
+static void _starpu_initialize_workers_cuda_gpuid(struct _starpu_machine_config *config);
+static struct starpu_htbl32_node *devices_using_cuda = NULL;
 #  endif
 #  endif
 #  ifdef STARPU_USE_OPENCL
 #  ifdef STARPU_USE_OPENCL
-static void _starpu_initialize_workers_opencl_gpuid(struct starpu_machine_config_s *config);
+static void _starpu_initialize_workers_opencl_gpuid(struct _starpu_machine_config *config);
 #  endif
 #  endif
 static void _starpu_initialize_workers_gpuid(int use_explicit_workers_gpuid, int *explicit_workers_gpuid,
 static void _starpu_initialize_workers_gpuid(int use_explicit_workers_gpuid, int *explicit_workers_gpuid,
                                              int *current, int *workers_gpuid, const char *varname, unsigned nhwgpus);
                                              int *current, int *workers_gpuid, const char *varname, unsigned nhwgpus);
@@ -66,9 +65,9 @@ static unsigned may_bind_automatically = 0;
  */
  */
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-static void _starpu_initialize_workers_cuda_gpuid(struct starpu_machine_config_s *config)
+static void _starpu_initialize_workers_cuda_gpuid(struct _starpu_machine_config *config)
 {
 {
-	struct starpu_machine_topology_s *topology = &config->topology;
+	struct starpu_machine_topology *topology = &config->topology;
 
 
         _starpu_initialize_workers_gpuid(config->user_conf==NULL?0:config->user_conf->use_explicit_workers_cuda_gpuid,
         _starpu_initialize_workers_gpuid(config->user_conf==NULL?0:config->user_conf->use_explicit_workers_cuda_gpuid,
                                          config->user_conf==NULL?NULL:(int *)config->user_conf->workers_cuda_gpuid,
                                          config->user_conf==NULL?NULL:(int *)config->user_conf->workers_cuda_gpuid,
@@ -78,9 +77,9 @@ static void _starpu_initialize_workers_cuda_gpuid(struct starpu_machine_config_s
 #endif
 #endif
 
 
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
-static void _starpu_initialize_workers_opencl_gpuid(struct starpu_machine_config_s *config)
+static void _starpu_initialize_workers_opencl_gpuid(struct _starpu_machine_config *config)
 {
 {
-	struct starpu_machine_topology_s *topology = &config->topology;
+	struct starpu_machine_topology *topology = &config->topology;
 
 
         _starpu_initialize_workers_gpuid(config->user_conf==NULL?0:config->user_conf->use_explicit_workers_opencl_gpuid,
         _starpu_initialize_workers_gpuid(config->user_conf==NULL?0:config->user_conf->use_explicit_workers_opencl_gpuid,
                                          config->user_conf==NULL?NULL:(int *)config->user_conf->workers_opencl_gpuid,
                                          config->user_conf==NULL?NULL:(int *)config->user_conf->workers_opencl_gpuid,
@@ -93,9 +92,11 @@ static void _starpu_initialize_workers_opencl_gpuid(struct starpu_machine_config
                 unsigned tmp[STARPU_NMAXWORKERS];
                 unsigned tmp[STARPU_NMAXWORKERS];
                 unsigned nb=0;
                 unsigned nb=0;
                 int i;
                 int i;
-                for(i=0 ; i<STARPU_NMAXWORKERS ; i++) {
-                        uint32_t key = _starpu_crc32_be(config->topology.workers_opencl_gpuid[i], 0);
-                        if (_starpu_htbl_search_32(devices_using_cuda, key) == NULL) {
+                for(i=0 ; i<STARPU_NMAXWORKERS ; i++)
+		{
+                        uint32_t key = starpu_crc32_be(config->topology.workers_opencl_gpuid[i], 0);
+                        if (_starpu_htbl_search_32(devices_using_cuda, key) == NULL)
+			{
                                 tmp[nb] = topology->workers_opencl_gpuid[i];
                                 tmp[nb] = topology->workers_opencl_gpuid[i];
                                 nb++;
                                 nb++;
                         }
                         }
@@ -106,14 +107,16 @@ static void _starpu_initialize_workers_opencl_gpuid(struct starpu_machine_config
 #endif /* STARPU_USE_CUDA */
 #endif /* STARPU_USE_CUDA */
         {
         {
                 // Detect identical devices
                 // Detect identical devices
-                struct starpu_htbl32_node_s *devices_already_used = NULL;
+                struct starpu_htbl32_node *devices_already_used = NULL;
                 unsigned tmp[STARPU_NMAXWORKERS];
                 unsigned tmp[STARPU_NMAXWORKERS];
                 unsigned nb=0;
                 unsigned nb=0;
                 int i;
                 int i;
 
 
-                for(i=0 ; i<STARPU_NMAXWORKERS ; i++) {
-                        uint32_t key = _starpu_crc32_be(topology->workers_opencl_gpuid[i], 0);
-                        if (_starpu_htbl_search_32(devices_already_used, key) == NULL) {
+                for(i=0 ; i<STARPU_NMAXWORKERS ; i++)
+		{
+                        uint32_t key = starpu_crc32_be(topology->workers_opencl_gpuid[i], 0);
+                        if (_starpu_htbl_search_32(devices_already_used, key) == NULL)
+			{
                                 _starpu_htbl_insert_32(&devices_already_used, key, config);
                                 _starpu_htbl_insert_32(&devices_already_used, key, config);
                                 tmp[nb] = topology->workers_opencl_gpuid[i];
                                 tmp[nb] = topology->workers_opencl_gpuid[i];
                                 nb ++;
                                 nb ++;
@@ -143,14 +146,7 @@ static void _starpu_initialize_workers_gpuid(int use_explicit_workers_gpuid, int
 	 * cpus. */
 	 * cpus. */
 
 
 	/* what do we use, explicit value, env. variable, or round-robin ? */
 	/* what do we use, explicit value, env. variable, or round-robin ? */
-	if (use_explicit_workers_gpuid)
-	{
-		/* we use the explicit value from the user */
-		memcpy(workers_gpuid,
-                       explicit_workers_gpuid,
-                       STARPU_NMAXWORKERS*sizeof(unsigned));
-	}
-	else if ((strval = getenv(varname)))
+	if ((strval = getenv(varname)))
 	{
 	{
 		/* STARPU_WORKERS_CUDAID certainly contains less entries than
 		/* STARPU_WORKERS_CUDAID certainly contains less entries than
 		 * STARPU_NMAXWORKERS, so we reuse its entries in a round robin
 		 * STARPU_NMAXWORKERS, so we reuse its entries in a round robin
@@ -162,7 +158,8 @@ static void _starpu_initialize_workers_gpuid(int use_explicit_workers_gpuid, int
 		/* we use the content of the STARPU_WORKERS_CUDAID env. variable */
 		/* we use the content of the STARPU_WORKERS_CUDAID env. variable */
 		for (i = 0; i < STARPU_NMAXWORKERS; i++)
 		for (i = 0; i < STARPU_NMAXWORKERS; i++)
 		{
 		{
-			if (!wrap) {
+			if (!wrap)
+			{
 				long int val;
 				long int val;
 				val = strtol(strval, &endptr, 10);
 				val = strtol(strval, &endptr, 10);
 				if (endptr != strval)
 				if (endptr != strval)
@@ -170,22 +167,31 @@ static void _starpu_initialize_workers_gpuid(int use_explicit_workers_gpuid, int
 					workers_gpuid[i] = (unsigned)val;
 					workers_gpuid[i] = (unsigned)val;
 					strval = endptr;
 					strval = endptr;
 				}
 				}
-				else {
+				else
+				{
 					/* there must be at least one entry */
 					/* there must be at least one entry */
 					STARPU_ASSERT(i != 0);
 					STARPU_ASSERT(i != 0);
 					number_of_entries = i;
 					number_of_entries = i;
-	
+
 					/* there is no more values in the string */
 					/* there is no more values in the string */
 					wrap = 1;
 					wrap = 1;
 
 
 					workers_gpuid[i] = workers_gpuid[0];
 					workers_gpuid[i] = workers_gpuid[0];
 				}
 				}
 			}
 			}
-			else {
+			else
+			{
 				workers_gpuid[i] = workers_gpuid[i % number_of_entries];
 				workers_gpuid[i] = workers_gpuid[i % number_of_entries];
 			}
 			}
 		}
 		}
 	}
 	}
+	else if (use_explicit_workers_gpuid)
+	{
+		/* we use the explicit value from the user */
+		memcpy(workers_gpuid,
+                       explicit_workers_gpuid,
+                       STARPU_NMAXWORKERS*sizeof(unsigned));
+	}
 	else
 	else
 	{
 	{
 		/* by default, we take a round robin policy */
 		/* by default, we take a round robin policy */
@@ -200,7 +206,7 @@ static void _starpu_initialize_workers_gpuid(int use_explicit_workers_gpuid, int
 #endif
 #endif
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-static inline int _starpu_get_next_cuda_gpuid(struct starpu_machine_config_s *config)
+static inline int _starpu_get_next_cuda_gpuid(struct _starpu_machine_config *config)
 {
 {
 	unsigned i = ((config->current_cuda_gpuid++) % config->topology.ncudagpus);
 	unsigned i = ((config->current_cuda_gpuid++) % config->topology.ncudagpus);
 
 
@@ -209,7 +215,7 @@ static inline int _starpu_get_next_cuda_gpuid(struct starpu_machine_config_s *co
 #endif
 #endif
 
 
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
-static inline int _starpu_get_next_opencl_gpuid(struct starpu_machine_config_s *config)
+static inline int _starpu_get_next_opencl_gpuid(struct _starpu_machine_config *config)
 {
 {
 	unsigned i = ((config->current_opencl_gpuid++) % config->topology.nopenclgpus);
 	unsigned i = ((config->current_opencl_gpuid++) % config->topology.nopenclgpus);
 
 
@@ -217,9 +223,9 @@ static inline int _starpu_get_next_opencl_gpuid(struct starpu_machine_config_s *
 }
 }
 #endif
 #endif
 
 
-static void _starpu_init_topology(struct starpu_machine_config_s *config)
+static void _starpu_init_topology(struct _starpu_machine_config *config)
 {
 {
-	struct starpu_machine_topology_s *topology = &config->topology;
+	struct starpu_machine_topology *topology = &config->topology;
 
 
 	if (!topology_is_initialized)
 	if (!topology_is_initialized)
 	{
 	{
@@ -260,24 +266,21 @@ static void _starpu_init_topology(struct starpu_machine_config_s *config)
 	}
 	}
 }
 }
 
 
-unsigned _starpu_topology_get_nhwcpu(struct starpu_machine_config_s *config)
+unsigned _starpu_topology_get_nhwcpu(struct _starpu_machine_config *config)
 {
 {
 	_starpu_init_topology(config);
 	_starpu_init_topology(config);
-	
+
 	return config->topology.nhwcpus;
 	return config->topology.nhwcpus;
 }
 }
 
 
-static int _starpu_init_machine_config(struct starpu_machine_config_s *config,
+static int _starpu_init_machine_config(struct _starpu_machine_config *config,
 				struct starpu_conf *user_conf)
 				struct starpu_conf *user_conf)
 {
 {
-	int explicitval STARPU_ATTRIBUTE_UNUSED;
-	unsigned use_accelerator = 0;
-
 	int i;
 	int i;
 	for (i = 0; i < STARPU_NMAXWORKERS; i++)
 	for (i = 0; i < STARPU_NMAXWORKERS; i++)
 		config->workers[i].workerid = i;
 		config->workers[i].workerid = i;
 
 
-	struct starpu_machine_topology_s *topology = &config->topology;
+	struct starpu_machine_topology *topology = &config->topology;
 
 
 	topology->nworkers = 0;
 	topology->nworkers = 0;
 	topology->ncombinedworkers = 0;
 	topology->ncombinedworkers = 0;
@@ -287,36 +290,49 @@ static int _starpu_init_machine_config(struct starpu_machine_config_s *config,
 	_starpu_initialize_workers_bindid(config);
 	_starpu_initialize_workers_bindid(config);
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-	if (user_conf && (user_conf->ncuda == 0))
+	int ncuda = -1;
+	ncuda = starpu_get_env_number("STARPU_NCUDA");
+
+	/* STARPU_NCUDA is not set. Did the user specify anything ? */
+	if (ncuda == -1 && user_conf)
+		ncuda = user_conf->ncuda;
+
+	
+	if (ncuda != 0)
 	{
 	{
-		/* the user explicitely disabled CUDA */
-		topology->ncudagpus = 0;
-	}
-	else {
-		/* we need to initialize CUDA early to count the number of devices */
+		/* The user did not disable CUDA. We need to initialize CUDA
+ 		 * early to count the number of devices */
 		_starpu_init_cuda();
 		_starpu_init_cuda();
 
 
-		if (user_conf && (user_conf->ncuda != -1))
+		if (ncuda == -1)
 		{
 		{
-			explicitval = user_conf->ncuda;
-		}
-		else {
-			explicitval = starpu_get_env_number("STARPU_NCUDA");
+			/* Nothing was specified, so let's choose ! */
+			ncuda = STARPU_MIN(_starpu_get_cuda_device_count(), STARPU_MAXCUDADEVS);
 		}
 		}
+		else
+		{
+			/* Let's make sure this value is OK. */
+			if (ncuda > STARPU_MAXCUDADEVS)
+			{
+				fprintf(stderr,
+					"# Warning: %d CUDA devices requested. Only %d enabled. Use configure option --enable-maxcudadev=xxx to update the maximum value of supported CUDA devices.\n",
+					ncuda, STARPU_MAXCUDADEVS);
+				ncuda = STARPU_MAXCUDADEVS;
+			}
 
 
-		if (explicitval < 0) {
-			config->topology.ncudagpus =
-				STARPU_MIN(_starpu_get_cuda_device_count(), STARPU_MAXCUDADEVS);
-		} else {
-			/* use the specified value */
-			topology->ncudagpus = (unsigned)explicitval;
-			STARPU_ASSERT(topology->ncudagpus <= STARPU_MAXCUDADEVS);
+			if ((unsigned) ncuda > _starpu_get_cuda_device_count())
+			{
+				fprintf(stderr,
+					"# Warning: %d CUDA devices requested. Only %d available.\n",
+					ncuda, _starpu_get_cuda_device_count());
+				ncuda = _starpu_get_cuda_device_count();
+			}
 		}
 		}
-		STARPU_ASSERT(config->topology.ncudagpus + config->topology.nworkers <= STARPU_NMAXWORKERS);
 	}
 	}
 
 
-	if (topology->ncudagpus > 0)
-		use_accelerator = 1;
+	/* Now we know how many CUDA devices will be used */
+	topology->ncudagpus = ncuda;
+	STARPU_ASSERT(topology->ncudagpus <= STARPU_MAXCUDADEVS);
 
 
 	_starpu_initialize_workers_cuda_gpuid(config);
 	_starpu_initialize_workers_cuda_gpuid(config);
 
 
@@ -325,14 +341,14 @@ static int _starpu_init_machine_config(struct starpu_machine_config_s *config,
 	{
 	{
 		config->workers[topology->nworkers + cudagpu].arch = STARPU_CUDA_WORKER;
 		config->workers[topology->nworkers + cudagpu].arch = STARPU_CUDA_WORKER;
 		int devid = _starpu_get_next_cuda_gpuid(config);
 		int devid = _starpu_get_next_cuda_gpuid(config);
-		enum starpu_perf_archtype arch = STARPU_CUDA_DEFAULT + devid;
+		enum starpu_perf_archtype arch = (enum starpu_perf_archtype)((int)STARPU_CUDA_DEFAULT + devid);
 		config->workers[topology->nworkers + cudagpu].devid = devid;
 		config->workers[topology->nworkers + cudagpu].devid = devid;
-		config->workers[topology->nworkers + cudagpu].perf_arch = arch; 
+		config->workers[topology->nworkers + cudagpu].perf_arch = arch;
 		config->workers[topology->nworkers + cudagpu].worker_mask = STARPU_CUDA;
 		config->workers[topology->nworkers + cudagpu].worker_mask = STARPU_CUDA;
 		_starpu_init_sched_ctx_for_worker(config->workers[topology->nworkers + cudagpu].workerid);
 		_starpu_init_sched_ctx_for_worker(config->workers[topology->nworkers + cudagpu].workerid);
 		config->worker_mask |= STARPU_CUDA;
 		config->worker_mask |= STARPU_CUDA;
 
 
-                uint32_t key = _starpu_crc32_be(devid, 0);
+                uint32_t key = starpu_crc32_be(devid, 0);
                 _starpu_htbl_insert_32(&devices_using_cuda, key, config);
                 _starpu_htbl_insert_32(&devices_using_cuda, key, config);
         }
         }
 
 
@@ -340,46 +356,49 @@ static int _starpu_init_machine_config(struct starpu_machine_config_s *config,
 #endif
 #endif
 
 
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
-	if (user_conf && (user_conf->nopencl == 0))
+	int nopencl;
+	nopencl = starpu_get_env_number("STARPU_NOPENCL");
+
+	/* STARPU_NOPENCL is not set. Did the user specify anything ? */
+	if (nopencl == -1 && user_conf)
+		nopencl = user_conf->nopencl;
+
+	if (nopencl != 0)
 	{
 	{
-		/* the user explicitely disabled OpenCL */
-		topology->nopenclgpus = 0;
-	}
-	else {
-		/* we need to initialize OpenCL early to count the number of devices */
-		int nb_devices;
+		/* The user did not disable OPENCL. We need to initialize OpenCL
+ 		 * early to count the number of devices */
 		_starpu_opencl_init();
 		_starpu_opencl_init();
+		int nb_devices;
 		nb_devices = STARPU_MIN(_starpu_opencl_get_device_count(), STARPU_MAXOPENCLDEVS);
 		nb_devices = STARPU_MIN(_starpu_opencl_get_device_count(), STARPU_MAXOPENCLDEVS);
 
 
-		if (user_conf && (user_conf->nopencl != -1))
+		if (nopencl == -1)
 		{
 		{
-			explicitval = user_conf->nopencl;
+			/* Nothing was specified, so let's choose ! */
+			nopencl = nb_devices;
 		}
 		}
-		else {
-			explicitval = starpu_get_env_number("STARPU_NOPENCL");
-		}
-
-
-		if (explicitval < 0) {
-			topology->nopenclgpus = nb_devices;
-		}
-		else {
-			if (explicitval > nb_devices) {
+		else
+		{
+			/* Let's make sure this value is OK. */
+			if (nopencl > nb_devices)
+			{
 				/* The user requires more OpenCL devices than there is available */
 				/* The user requires more OpenCL devices than there is available */
-				topology->nopenclgpus = nb_devices;
+				fprintf(stderr,
+					"# Warning: %d OpenCL devices requested. Only %d available.\n",
+					nopencl, nb_devices);
+					topology->nopenclgpus = nb_devices;
 			}
 			}
-			else {
-				/* use the specified value */
-				topology->nopenclgpus = (unsigned)explicitval;
+			if (nopencl > STARPU_MAXOPENCLDEVS)
+			{
+				fprintf(stderr,
+					"# Warning: %d OpenCL devices requested. Only %d enabled. Use configure option --enable-maxopencldev=xxx to update the maximum value of supported OpenCL devices.\n",
+					nopencl, STARPU_MAXOPENCLDEVS);
+				nopencl = STARPU_MAXOPENCLDEVS;
 			}
 			}
-			STARPU_ASSERT(topology->nopenclgpus <= STARPU_MAXOPENCLDEVS);
 		}
 		}
-		STARPU_ASSERT(topology->nopenclgpus + topology->nworkers <= STARPU_NMAXWORKERS);
 	}
 	}
 
 
-	if (topology->nopenclgpus > 0)
-		use_accelerator = 1;
-	// TODO: use_accelerator pour les OpenCL?
+	topology->nopenclgpus = nopencl;
+	STARPU_ASSERT(topology->nopenclgpus + topology->nworkers <= STARPU_NMAXWORKERS);
 
 
 	_starpu_initialize_workers_opencl_gpuid(config);
 	_starpu_initialize_workers_opencl_gpuid(config);
 
 
@@ -387,14 +406,15 @@ static int _starpu_init_machine_config(struct starpu_machine_config_s *config,
 	for (openclgpu = 0; openclgpu < topology->nopenclgpus; openclgpu++)
 	for (openclgpu = 0; openclgpu < topology->nopenclgpus; openclgpu++)
 	{
 	{
 		int devid = _starpu_get_next_opencl_gpuid(config);
 		int devid = _starpu_get_next_opencl_gpuid(config);
-		if (devid == -1) { // There is no more devices left
+		if (devid == -1)
+		{ // There is no more devices left
 			topology->nopenclgpus = openclgpu;
 			topology->nopenclgpus = openclgpu;
 			break;
 			break;
 		}
 		}
 		config->workers[topology->nworkers + openclgpu].arch = STARPU_OPENCL_WORKER;
 		config->workers[topology->nworkers + openclgpu].arch = STARPU_OPENCL_WORKER;
-		enum starpu_perf_archtype arch = STARPU_OPENCL_DEFAULT + devid;
+		enum starpu_perf_archtype arch = (enum starpu_perf_archtype)((int)STARPU_OPENCL_DEFAULT + devid);
 		config->workers[topology->nworkers + openclgpu].devid = devid;
 		config->workers[topology->nworkers + openclgpu].devid = devid;
-		config->workers[topology->nworkers + openclgpu].perf_arch = arch; 
+		config->workers[topology->nworkers + openclgpu].perf_arch = arch;
 		config->workers[topology->nworkers + openclgpu].worker_mask = STARPU_OPENCL;
 		config->workers[topology->nworkers + openclgpu].worker_mask = STARPU_OPENCL;
 		_starpu_init_sched_ctx_for_worker(config->workers[topology->nworkers + openclgpu].workerid);
 		_starpu_init_sched_ctx_for_worker(config->workers[topology->nworkers + openclgpu].workerid);
 		config->worker_mask |= STARPU_OPENCL;
 		config->worker_mask |= STARPU_OPENCL;
@@ -402,26 +422,37 @@ static int _starpu_init_machine_config(struct starpu_machine_config_s *config,
 
 
 	topology->nworkers += topology->nopenclgpus;
 	topology->nworkers += topology->nopenclgpus;
 #endif
 #endif
-	
+
 #ifdef STARPU_USE_GORDON
 #ifdef STARPU_USE_GORDON
-	if (user_conf && (user_conf->ncuda != -1)) {
-		explicitval = user_conf->ncuda;
-	}
-	else {
-		explicitval = starpu_get_env_number("STARPU_NGORDON");
-	}
+	int ngordon;
+	ngordon = starpu_get_env_number("STARPU_NGORDON");
+
+	/* STARPU_NGORDON is not set. Did the user specify anything ? */
+	if (ngordon == -1 && user_conf)
+		ngordon = user_conf->ngordon;
 
 
-	if (explicitval < 0) {
-		topology->ngordon_spus = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, -1);
-	} else {
-		/* use the specified value */
-		topology->ngordon_spus = (unsigned)explicitval;
-		STARPU_ASSERT(topology->ngordon_spus <= NMAXGORDONSPUS);
+	if (ngordon != 0)
+	{
+		if (ngordon == -1)
+		{
+			/* Nothing was specified, so let's choose ! */
+			ngordon = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, -1);
+		}
+		else
+		{
+			STARPU_ASSERT(ngordon <= NMAXGORDONSPUS);
+			if (ngordon > STARPU_MAXGORDONSPUS);
+			{
+				fprintf(stderr,
+					"# Warning: %d Gordon CPUs devices requested. Only %d supported\n",
+					ngordon, NMAXGORDONSPUS);
+				ngordon = NMAXGORDONSPUS;
+			}
+		}
 	}
 	}
-	STARPU_ASSERT(topology->ngordon_spus + topology->nworkers <= STARPU_NMAXWORKERS);
 
 
-	if (topology->ngordon_spus > 0)
-		use_accelerator = 1;
+	topology->ngordon_spus = ngordon;
+	STARPU_ASSERT(topology->ngordon_spus + topology->nworkers <= STARPU_NMAXWORKERS);
 
 
 	unsigned spu;
 	unsigned spu;
 	for (spu = 0; spu < config->ngordon_spus; spu++)
 	for (spu = 0; spu < config->ngordon_spus; spu++)
@@ -441,24 +472,37 @@ static int _starpu_init_machine_config(struct starpu_machine_config_s *config,
 /* we put the CPU section after the accelerator : in case there was an
 /* we put the CPU section after the accelerator : in case there was an
  * accelerator found, we devote one cpu */
  * accelerator found, we devote one cpu */
 #ifdef STARPU_USE_CPU
 #ifdef STARPU_USE_CPU
-	if (user_conf && (user_conf->ncpus != -1)) {
-		explicitval = user_conf->ncpus;
-	}
-	else {
-		explicitval = starpu_get_env_number("STARPU_NCPUS");
-	}
+	int ncpu;
+	ncpu = starpu_get_env_number("STARPU_NCPUS");
 
 
-	if (explicitval < 0) {
-		unsigned already_busy_cpus = (topology->ngordon_spus?1:0) + topology->ncudagpus + topology->nopenclgpus;
-		long avail_cpus = topology->nhwcpus - (use_accelerator?already_busy_cpus:0);
-		if (avail_cpus < 0)
-			avail_cpus = 0;
-		topology->ncpus = STARPU_MIN(avail_cpus, STARPU_MAXCPUS);
-	} else {
-		/* use the specified value */
-		topology->ncpus = (unsigned)explicitval;
-		STARPU_ASSERT(topology->ncpus <= STARPU_MAXCPUS);
+	/* STARPU_NCPUS is not set. Did the user specify anything ? */
+	if (ncpu == -1 && user_conf)
+		ncpu = user_conf->ncpus;
+
+	if (ncpu != 0)
+	{
+		if (ncpu == -1)
+		{
+			unsigned already_busy_cpus = (topology->ngordon_spus?1:0) + topology->ncudagpus + topology->nopenclgpus;
+			long avail_cpus = topology->nhwcpus - already_busy_cpus;
+			if (avail_cpus < 0)
+				avail_cpus = 0;
+			ncpu = STARPU_MIN(avail_cpus, STARPU_MAXCPUS);
+		}
+		else
+		{
+			if (ncpu > STARPU_MAXCPUS)
+			{
+				fprintf(stderr,
+					"# Warning: %d CPU devices requested. Only %d enabled. Use configure option --enable-maxcpus=xxx to update the maximum value of supported CPU devices.\n",
+					ncpu, STARPU_MAXCPUS);
+				ncpu = STARPU_MAXCPUS;
+			}
+		}
 	}
 	}
+
+
+	topology->ncpus = ncpu;
 	STARPU_ASSERT(topology->ncpus + topology->nworkers <= STARPU_NMAXWORKERS);
 	STARPU_ASSERT(topology->ncpus + topology->nworkers <= STARPU_NMAXWORKERS);
 
 
 	unsigned cpu;
 	unsigned cpu;
@@ -487,12 +531,12 @@ static int _starpu_init_machine_config(struct starpu_machine_config_s *config,
 /*
 /*
  * Bind workers on the different processors
  * Bind workers on the different processors
  */
  */
-static void _starpu_initialize_workers_bindid(struct starpu_machine_config_s *config)
+static void _starpu_initialize_workers_bindid(struct _starpu_machine_config *config)
 {
 {
 	char *strval;
 	char *strval;
 	unsigned i;
 	unsigned i;
 
 
-	struct starpu_machine_topology_s *topology = &config->topology;
+	struct starpu_machine_topology *topology = &config->topology;
 
 
 	config->current_bindid = 0;
 	config->current_bindid = 0;
 
 
@@ -504,14 +548,7 @@ static void _starpu_initialize_workers_bindid(struct starpu_machine_config_s *co
 	 * cpus. */
 	 * cpus. */
 
 
 	/* what do we use, explicit value, env. variable, or round-robin ? */
 	/* what do we use, explicit value, env. variable, or round-robin ? */
-	if (config->user_conf && config->user_conf->use_explicit_workers_bindid)
-	{
-		/* we use the explicit value from the user */
-		memcpy(topology->workers_bindid,
-			config->user_conf->workers_bindid,
-			STARPU_NMAXWORKERS*sizeof(unsigned));
-	}
-	else if ((strval = getenv("STARPU_WORKERS_CPUID")))
+	if ((strval = getenv("STARPU_WORKERS_CPUID")))
 	{
 	{
 		/* STARPU_WORKERS_CPUID certainly contains less entries than
 		/* STARPU_WORKERS_CPUID certainly contains less entries than
 		 * STARPU_NMAXWORKERS, so we reuse its entries in a round robin
 		 * STARPU_NMAXWORKERS, so we reuse its entries in a round robin
@@ -523,7 +560,8 @@ static void _starpu_initialize_workers_bindid(struct starpu_machine_config_s *co
 		/* we use the content of the STARPU_WORKERS_CUDAID env. variable */
 		/* we use the content of the STARPU_WORKERS_CUDAID env. variable */
 		for (i = 0; i < STARPU_NMAXWORKERS; i++)
 		for (i = 0; i < STARPU_NMAXWORKERS; i++)
 		{
 		{
-			if (!wrap) {
+			if (!wrap)
+			{
 				long int val;
 				long int val;
 				val = strtol(strval, &endptr, 10);
 				val = strtol(strval, &endptr, 10);
 				if (endptr != strval)
 				if (endptr != strval)
@@ -531,7 +569,8 @@ static void _starpu_initialize_workers_bindid(struct starpu_machine_config_s *co
 					topology->workers_bindid[i] = (unsigned)(val % topology->nhwcpus);
 					topology->workers_bindid[i] = (unsigned)(val % topology->nhwcpus);
 					strval = endptr;
 					strval = endptr;
 				}
 				}
-				else {
+				else
+				{
 					/* there must be at least one entry */
 					/* there must be at least one entry */
 					STARPU_ASSERT(i != 0);
 					STARPU_ASSERT(i != 0);
 					number_of_entries = i;
 					number_of_entries = i;
@@ -542,11 +581,19 @@ static void _starpu_initialize_workers_bindid(struct starpu_machine_config_s *co
 					topology->workers_bindid[i] = topology->workers_bindid[0];
 					topology->workers_bindid[i] = topology->workers_bindid[0];
 				}
 				}
 			}
 			}
-			else {
+			else
+			{
 				topology->workers_bindid[i] = topology->workers_bindid[i % number_of_entries];
 				topology->workers_bindid[i] = topology->workers_bindid[i % number_of_entries];
 			}
 			}
 		}
 		}
 	}
 	}
+	else if (config->user_conf && config->user_conf->use_explicit_workers_bindid)
+	{
+		/* we use the explicit value from the user */
+		memcpy(topology->workers_bindid,
+			config->user_conf->workers_bindid,
+			STARPU_NMAXWORKERS*sizeof(unsigned));
+	}
 	else
 	else
 	{
 	{
 		/* by default, we take a round robin policy */
 		/* by default, we take a round robin policy */
@@ -559,10 +606,10 @@ static void _starpu_initialize_workers_bindid(struct starpu_machine_config_s *co
  * worker. In case a list of preferred cpus was specified, we look for a an
  * worker. In case a list of preferred cpus was specified, we look for a an
  * available cpu among the list if possible, otherwise a round-robin policy is
  * available cpu among the list if possible, otherwise a round-robin policy is
  * used. */
  * used. */
-static inline int _starpu_get_next_bindid(struct starpu_machine_config_s *config,
+static inline int _starpu_get_next_bindid(struct _starpu_machine_config *config,
 				int *preferred_binding, int npreferred)
 				int *preferred_binding, int npreferred)
 {
 {
-	struct starpu_machine_topology_s *topology = &config->topology;
+	struct starpu_machine_topology *topology = &config->topology;
 
 
 	unsigned found = 0;
 	unsigned found = 0;
 	int current_preferred;
 	int current_preferred;
@@ -600,23 +647,32 @@ static inline int _starpu_get_next_bindid(struct starpu_machine_config_s *config
 	return (int)topology->workers_bindid[i];
 	return (int)topology->workers_bindid[i];
 }
 }
 
 
-void _starpu_bind_thread_on_cpu(struct starpu_machine_config_s *config STARPU_ATTRIBUTE_UNUSED, unsigned cpuid)
+void _starpu_bind_thread_on_cpu(struct _starpu_machine_config *config STARPU_ATTRIBUTE_UNUSED, unsigned cpuid)
 {
 {
+	if (starpu_get_env_number("STARPU_WORKERS_NOBIND") > 0)
+		return;
 #ifdef STARPU_HAVE_HWLOC
 #ifdef STARPU_HAVE_HWLOC
-	int ret;
+	const struct hwloc_topology_support *support;
+
 	_starpu_init_topology(config);
 	_starpu_init_topology(config);
 
 
-	hwloc_obj_t obj = hwloc_get_obj_by_depth(config->topology.hwtopology, config->cpu_depth, cpuid);
-	hwloc_cpuset_t set = obj->cpuset;
-	hwloc_bitmap_singlify(set);
-	ret = hwloc_set_cpubind(config->topology.hwtopology, set, HWLOC_CPUBIND_THREAD);
-	if (ret)
+	support = hwloc_topology_get_support(config->topology.hwtopology);
+	if (support->cpubind->set_thisthread_cpubind)
 	{
 	{
-		perror("binding thread");
-		STARPU_ABORT();
+		hwloc_obj_t obj = hwloc_get_obj_by_depth(config->topology.hwtopology, config->cpu_depth, cpuid);
+		hwloc_cpuset_t set = obj->cpuset;
+		int ret;
+
+		hwloc_bitmap_singlify(set);
+		ret = hwloc_set_cpubind(config->topology.hwtopology, set, HWLOC_CPUBIND_THREAD);
+		if (ret)
+		{
+			perror("binding thread");
+			STARPU_ABORT();
+		}
 	}
 	}
 
 
-#elif defined(HAVE_PTHREAD_SETAFFINITY_NP)
+#elif defined(HAVE_PTHREAD_SETAFFINITY_NP) && defined(__linux__)
 	int ret;
 	int ret;
 	/* fix the thread on the correct cpu */
 	/* fix the thread on the correct cpu */
 	cpu_set_t aff_mask;
 	cpu_set_t aff_mask;
@@ -634,7 +690,8 @@ void _starpu_bind_thread_on_cpu(struct starpu_machine_config_s *config STARPU_AT
 
 
 #elif defined(__MINGW32__) || defined(__CYGWIN__)
 #elif defined(__MINGW32__) || defined(__CYGWIN__)
 	DWORD mask = 1 << cpuid;
 	DWORD mask = 1 << cpuid;
-	if (!SetThreadAffinityMask(GetCurrentThread(), mask)) {
+	if (!SetThreadAffinityMask(GetCurrentThread(), mask))
+	{
 		fprintf(stderr,"SetThreadMaskAffinity(%lx) failed\n", mask);
 		fprintf(stderr,"SetThreadMaskAffinity(%lx) failed\n", mask);
 		STARPU_ABORT();
 		STARPU_ABORT();
 	}
 	}
@@ -664,23 +721,26 @@ static void _starpu_init_workers_binding(struct starpu_machine_config_s *config)
 	{
 	{
 		unsigned memory_node = -1;
 		unsigned memory_node = -1;
 		unsigned is_a_set_of_accelerators = 0;
 		unsigned is_a_set_of_accelerators = 0;
-		struct starpu_worker_s *workerarg = &config->workers[worker];
+		struct _starpu_worker *workerarg = &config->workers[worker];
 
 
 		/* Perhaps the worker has some "favourite" bindings  */
 		/* Perhaps the worker has some "favourite" bindings  */
 		int *preferred_binding = NULL;
 		int *preferred_binding = NULL;
 		int npreferred = 0;
 		int npreferred = 0;
-		
+
 		/* select the memory node that contains worker's memory */
 		/* select the memory node that contains worker's memory */
-		switch (workerarg->arch) {
+		switch (workerarg->arch)
+		{
 			case STARPU_CPU_WORKER:
 			case STARPU_CPU_WORKER:
 			/* "dedicate" a cpu cpu to that worker */
 			/* "dedicate" a cpu cpu to that worker */
 				is_a_set_of_accelerators = 0;
 				is_a_set_of_accelerators = 0;
 				memory_node = ram_memory_node;
 				memory_node = ram_memory_node;
+				_starpu_memory_node_worker_add(ram_memory_node);
 				break;
 				break;
 #ifdef STARPU_USE_GORDON
 #ifdef STARPU_USE_GORDON
 			case STARPU_GORDON_WORKER:
 			case STARPU_GORDON_WORKER:
 				is_a_set_of_accelerators = 1;
 				is_a_set_of_accelerators = 1;
 				memory_node = ram_memory_node;
 				memory_node = ram_memory_node;
+				_starpu_memory_node_worker_add(ram_memory_node);
 				break;
 				break;
 #endif
 #endif
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
@@ -693,9 +753,23 @@ static void _starpu_init_workers_binding(struct starpu_machine_config_s *config)
 				}
 				}
 				is_a_set_of_accelerators = 0;
 				is_a_set_of_accelerators = 0;
 				memory_node = _starpu_register_memory_node(STARPU_CUDA_RAM, workerarg->devid);
 				memory_node = _starpu_register_memory_node(STARPU_CUDA_RAM, workerarg->devid);
+				_starpu_memory_node_worker_add(memory_node);
 
 
 				_starpu_register_bus(0, memory_node);
 				_starpu_register_bus(0, memory_node);
 				_starpu_register_bus(memory_node, 0);
 				_starpu_register_bus(memory_node, 0);
+#ifdef HAVE_CUDA_MEMCPY_PEER
+				unsigned worker2;
+				for (worker2 = 0; worker2 < worker; worker2++)
+				{
+					struct _starpu_worker *workerarg = &config->workers[worker];
+					if (workerarg->arch == STARPU_CUDA_WORKER)
+					{
+						unsigned memory_node2 = starpu_worker_get_memory_node(worker2);
+						_starpu_register_bus(memory_node2, memory_node);
+						_starpu_register_bus(memory_node, memory_node2);
+					}
+				}
+#endif
 				break;
 				break;
 #endif
 #endif
 
 
@@ -709,6 +783,7 @@ static void _starpu_init_workers_binding(struct starpu_machine_config_s *config)
 				}
 				}
 				is_a_set_of_accelerators = 0;
 				is_a_set_of_accelerators = 0;
 				memory_node = _starpu_register_memory_node(STARPU_OPENCL_RAM, workerarg->devid);
 				memory_node = _starpu_register_memory_node(STARPU_OPENCL_RAM, workerarg->devid);
+				_starpu_memory_node_worker_add(memory_node);
 				_starpu_register_bus(0, memory_node);
 				_starpu_register_bus(0, memory_node);
 				_starpu_register_bus(memory_node, 0);
 				_starpu_register_bus(memory_node, 0);
 				break;
 				break;
@@ -718,13 +793,15 @@ static void _starpu_init_workers_binding(struct starpu_machine_config_s *config)
 				STARPU_ABORT();
 				STARPU_ABORT();
 		}
 		}
 
 
-		if (is_a_set_of_accelerators) {
+		if (is_a_set_of_accelerators)
+		{
 			if (accelerator_bindid == -1)
 			if (accelerator_bindid == -1)
 				accelerator_bindid = _starpu_get_next_bindid(config, preferred_binding, npreferred);
 				accelerator_bindid = _starpu_get_next_bindid(config, preferred_binding, npreferred);
 
 
 			workerarg->bindid = accelerator_bindid;
 			workerarg->bindid = accelerator_bindid;
 		}
 		}
-		else {
+		else
+		{
 			workerarg->bindid = _starpu_get_next_bindid(config, preferred_binding, npreferred);
 			workerarg->bindid = _starpu_get_next_bindid(config, preferred_binding, npreferred);
 		}
 		}
 
 
@@ -755,7 +832,7 @@ static void _starpu_init_workers_binding(struct starpu_machine_config_s *config)
 }
 }
 
 
 
 
-int _starpu_build_topology(struct starpu_machine_config_s *config)
+int _starpu_build_topology(struct _starpu_machine_config *config)
 {
 {
 	int ret;
 	int ret;
 
 
@@ -773,7 +850,7 @@ int _starpu_build_topology(struct starpu_machine_config_s *config)
 	return 0;
 	return 0;
 }
 }
 
 
-void _starpu_destroy_topology(struct starpu_machine_config_s *config __attribute__ ((unused)))
+void _starpu_destroy_topology(struct _starpu_machine_config *config __attribute__ ((unused)))
 {
 {
 	/* cleanup StarPU internal data structures */
 	/* cleanup StarPU internal data structures */
 	_starpu_deinit_memory_nodes();
 	_starpu_deinit_memory_nodes();
@@ -782,7 +859,7 @@ void _starpu_destroy_topology(struct starpu_machine_config_s *config __attribute
 	for (worker = 0; worker < config->topology.nworkers; worker++)
 	for (worker = 0; worker < config->topology.nworkers; worker++)
 	{
 	{
 #ifdef STARPU_HAVE_HWLOC
 #ifdef STARPU_HAVE_HWLOC
-		struct starpu_worker_s *workerarg = &config->workers[worker];
+		struct _starpu_worker *workerarg = &config->workers[worker];
 		hwloc_bitmap_free(workerarg->initial_hwloc_cpu_set);
 		hwloc_bitmap_free(workerarg->initial_hwloc_cpu_set);
 		hwloc_bitmap_free(workerarg->current_hwloc_cpu_set);
 		hwloc_bitmap_free(workerarg->current_hwloc_cpu_set);
 #endif
 #endif

+ 10 - 6
src/core/topology.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009-2010, 2012  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -24,20 +24,24 @@
 #include <common/fxt.h>
 #include <common/fxt.h>
 
 
 /* TODO actually move this struct into this header */
 /* TODO actually move this struct into this header */
-struct starpu_machine_config_s;
+struct _starpu_machine_config;
 
 
 /* Detect the number of memory nodes and where to bind the different workers. */
 /* Detect the number of memory nodes and where to bind the different workers. */
-int _starpu_build_topology(struct starpu_machine_config_s *config);
+int _starpu_build_topology(struct _starpu_machine_config *config);
 
 
 /* Destroy all resources used to store the topology of the machine. */
 /* Destroy all resources used to store the topology of the machine. */
-void _starpu_destroy_topology(struct starpu_machine_config_s *config);
+void _starpu_destroy_topology(struct _starpu_machine_config *config);
 
 
 /* returns the number of physical cpus */
 /* returns the number of physical cpus */
-unsigned _starpu_topology_get_nhwcpu(struct starpu_machine_config_s *config);
+unsigned _starpu_topology_get_nhwcpu(struct _starpu_machine_config *config);
 
 
 /* Bind the current thread on the CPU logically identified by "cpuid". The
 /* Bind the current thread on the CPU logically identified by "cpuid". The
  * logical ordering of the processors is either that of hwloc (if available),
  * logical ordering of the processors is either that of hwloc (if available),
  * or the ordering exposed by the OS. */
  * or the ordering exposed by the OS. */
-void _starpu_bind_thread_on_cpu(struct starpu_machine_config_s *config, unsigned cpuid);
+void _starpu_bind_thread_on_cpu(struct _starpu_machine_config *config, unsigned cpuid);
+
+struct _starpu_combined_worker;
+/* Bind the current thread on the set of CPUs for the given combined worker. */
+void _starpu_bind_thread_on_cpus(struct _starpu_machine_config *config STARPU_ATTRIBUTE_UNUSED, struct _starpu_combined_worker *combined_worker);
 
 
 #endif // __TOPOLOGY_H__
 #endif // __TOPOLOGY_H__

+ 225 - 121
src/core/workers.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Institut National de Recherche en Informatique et Automatique
  * Copyright (C) 2010, 2011  Institut National de Recherche en Informatique et Automatique
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  INRIA
  * Copyright (C) 2011  INRIA
@@ -35,90 +35,140 @@
 /* acquire/release semantic for concurrent initialization/de-initialization */
 /* acquire/release semantic for concurrent initialization/de-initialization */
 static pthread_mutex_t init_mutex = PTHREAD_MUTEX_INITIALIZER;
 static pthread_mutex_t init_mutex = PTHREAD_MUTEX_INITIALIZER;
 static pthread_cond_t init_cond = PTHREAD_COND_INITIALIZER;
 static pthread_cond_t init_cond = PTHREAD_COND_INITIALIZER;
-
-static int init_count;
+static int init_count = 0;
 static enum { UNINITIALIZED, CHANGING, INITIALIZED } initialized = UNINITIALIZED;
 static enum { UNINITIALIZED, CHANGING, INITIALIZED } initialized = UNINITIALIZED;
 
 
 static pthread_key_t worker_key;
 static pthread_key_t worker_key;
 
 
-static struct starpu_machine_config_s config;
+static struct _starpu_machine_config config;
+
+int _starpu_is_initialized(void)
+{
+	return initialized == INITIALIZED;
+}
 
 
-struct starpu_machine_config_s *_starpu_get_machine_config(void)
+struct _starpu_machine_config *_starpu_get_machine_config(void)
 {
 {
 	return &config;
 	return &config;
 }
 }
 
 
+/* Makes sure that at least one of the workers of type <arch> can execute
+ * <task>*/
+static uint32_t _starpu_worker_exists_and_can_execute(struct starpu_task *task,
+						      enum starpu_archtype arch)
+{
+	int i;
+	int nworkers = starpu_worker_get_count_by_type(arch);
+	int workers[nworkers];
+	STARPU_ASSERT(nworkers != -EINVAL);
+	(void) starpu_worker_get_ids_by_type(arch, workers, nworkers);
+	for (i = 0; i < nworkers; i++)
+		if (task->cl->can_execute(workers[i], task, 0))
+			return 1;
+	return 0;
+}
+
 /* in case a task is submitted, we may check whether there exists a worker
 /* in case a task is submitted, we may check whether there exists a worker
    that may execute the task or not */
    that may execute the task or not */
-
-uint32_t _starpu_worker_exists(uint32_t task_mask)
+uint32_t _starpu_worker_exists(struct starpu_task *task)
 {
 {
-	return (task_mask & config.worker_mask);
-} 
+	if (!(task->cl->where & config.worker_mask))
+		return 0;
+
+	if (!task->cl->can_execute)
+		return 1;
 
 
-uint32_t _starpu_may_submit_cuda_task(void)
+#ifdef STARPU_USE_CPU
+	if ((task->cl->where & STARPU_CPU) &&
+	    _starpu_worker_exists_and_can_execute(task, STARPU_CPU_WORKER))
+		return 1;
+#endif
+#ifdef STARPU_USE_CUDA
+	if ((task->cl->where & STARPU_CUDA) &&
+	    _starpu_worker_exists_and_can_execute(task, STARPU_CUDA_WORKER))
+		return 1;
+#endif
+#ifdef STARPU_USE_OPENCL
+	if ((task->cl->where & STARPU_OPENCL) &&
+	    _starpu_worker_exists_and_can_execute(task, STARPU_OPENCL_WORKER))
+		return 1;
+#endif
+	return 0;
+}
+
+uint32_t _starpu_can_submit_cuda_task(void)
 {
 {
 	return (STARPU_CUDA & config.worker_mask);
 	return (STARPU_CUDA & config.worker_mask);
 }
 }
 
 
-uint32_t _starpu_may_submit_cpu_task(void)
+uint32_t _starpu_can_submit_cpu_task(void)
 {
 {
 	return (STARPU_CPU & config.worker_mask);
 	return (STARPU_CPU & config.worker_mask);
 }
 }
 
 
-uint32_t _starpu_may_submit_opencl_task(void)
+uint32_t _starpu_can_submit_opencl_task(void)
 {
 {
 	return (STARPU_OPENCL & config.worker_mask);
 	return (STARPU_OPENCL & config.worker_mask);
 }
 }
 
 
-static int _starpu_may_use_nth_implementation(enum starpu_archtype arch, struct starpu_codelet_t *cl, unsigned nimpl)
+static int _starpu_can_use_nth_implementation(enum starpu_archtype arch, struct starpu_codelet *cl, unsigned nimpl)
 {
 {
-	switch(arch) {
+	switch(arch)
+	{
 	case STARPU_CPU_WORKER:
 	case STARPU_CPU_WORKER:
-		return !(cl->cpu_func == STARPU_MULTIPLE_CPU_IMPLEMENTATIONS &&
-			cl->cpu_funcs[nimpl] == NULL);
+	{
+		starpu_cpu_func_t func = _starpu_task_get_cpu_nth_implementation(cl, nimpl);
+		return func != NULL;
+	}
 	case STARPU_CUDA_WORKER:
 	case STARPU_CUDA_WORKER:
-		return !(cl->cuda_func == STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS &&
-			cl->cuda_funcs[nimpl] == NULL);
+	{
+		starpu_cuda_func_t func = _starpu_task_get_cuda_nth_implementation(cl, nimpl);
+		return func != NULL;
+	}
 	case STARPU_OPENCL_WORKER:
 	case STARPU_OPENCL_WORKER:
-		return !(cl->opencl_func == STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS &&
-			cl->opencl_funcs[nimpl] == NULL);
+	{
+		starpu_opencl_func_t func = _starpu_task_get_opencl_nth_implementation(cl, nimpl);
+		return func != NULL;
+	}
 	case STARPU_GORDON_WORKER:
 	case STARPU_GORDON_WORKER:
-		return !(cl->gordon_func == STARPU_MULTIPLE_GORDON_IMPLEMENTATIONS &&
-			cl->gordon_funcs[nimpl] == 0);
+	{
+		starpu_gordon_func_t func = _starpu_task_get_gordon_nth_implementation(cl, nimpl);
+		return func != 0;
+	}
 	default:
 	default:
-		return 0;
+		STARPU_ASSERT_MSG(0, "Unknown arch type");
 	}
 	}
+	return 0;
 }
 }
 
 
 
 
-int starpu_worker_may_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl)
+int starpu_worker_can_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl)
 {
 {
 	/* TODO: check that the task operand sizes will fit on that device */
 	/* TODO: check that the task operand sizes will fit on that device */
-	/* TODO: call application-provided function for various cases like
-	 * double support, shared memory size limit, etc. */
-	return !!((task->cl->where & config.workers[workerid].worker_mask) &&
-		_starpu_may_use_nth_implementation(config.workers[workerid].arch, task->cl, nimpl));
+	return (task->cl->where & config.workers[workerid].worker_mask) &&
+		_starpu_can_use_nth_implementation(config.workers[workerid].arch, task->cl, nimpl) &&
+		(!task->cl->can_execute || task->cl->can_execute(workerid, task, nimpl));
 }
 }
 
 
 
 
 
 
-int starpu_combined_worker_may_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl)
+int starpu_combined_worker_can_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl)
 {
 {
 	/* TODO: check that the task operand sizes will fit on that device */
 	/* TODO: check that the task operand sizes will fit on that device */
 	/* TODO: call application-provided function for various cases like
 	/* TODO: call application-provided function for various cases like
 	 * double support, shared memory size limit, etc. */
 	 * double support, shared memory size limit, etc. */
 
 
-	struct starpu_codelet_t *cl = task->cl;
+	struct starpu_codelet *cl = task->cl;
 	unsigned nworkers = config.topology.nworkers;
 	unsigned nworkers = config.topology.nworkers;
 
 
 	/* Is this a parallel worker ? */
 	/* Is this a parallel worker ? */
 	if (workerid < nworkers)
 	if (workerid < nworkers)
 	{
 	{
 		return !!((task->cl->where & config.workers[workerid].worker_mask) &&
 		return !!((task->cl->where & config.workers[workerid].worker_mask) &&
-				_starpu_may_use_nth_implementation(config.workers[workerid].arch, task->cl, nimpl));
+				_starpu_can_use_nth_implementation(config.workers[workerid].arch, task->cl, nimpl));
 	}
 	}
-	else {
+	else
+	{
 		if ((cl->type == STARPU_SPMD) || (cl->type == STARPU_FORKJOIN))
 		if ((cl->type == STARPU_SPMD) || (cl->type == STARPU_FORKJOIN))
 		{
 		{
 			/* TODO we should add other types of constraints */
 			/* TODO we should add other types of constraints */
@@ -126,7 +176,7 @@ int starpu_combined_worker_may_execute_task(unsigned workerid, struct starpu_tas
 			/* Is the worker larger than requested ? */
 			/* Is the worker larger than requested ? */
 			int worker_size = (int)config.combined_workers[workerid - nworkers].worker_size;
 			int worker_size = (int)config.combined_workers[workerid - nworkers].worker_size;
 			return !!((worker_size <= task->cl->max_parallelism) &&
 			return !!((worker_size <= task->cl->max_parallelism) &&
-				_starpu_may_use_nth_implementation(config.workers[workerid].arch, task->cl, nimpl));
+				_starpu_can_use_nth_implementation(config.workers[workerid].arch, task->cl, nimpl));
 		}
 		}
 		else
 		else
 		{
 		{
@@ -141,11 +191,11 @@ int starpu_combined_worker_may_execute_task(unsigned workerid, struct starpu_tas
  */
  */
 
 
 #ifdef STARPU_USE_GORDON
 #ifdef STARPU_USE_GORDON
-static unsigned gordon_inited = 0;	
-static struct starpu_worker_set_s gordon_worker_set;
+static unsigned gordon_inited = 0;
+static struct _starpu_worker_set gordon_worker_set;
 #endif
 #endif
 
 
-static void _starpu_init_worker_queue(struct starpu_worker_s *workerarg)
+static void _starpu_init_worker_queue(struct _starpu_worker *workerarg)
 {
 {
 	pthread_cond_t *cond = &workerarg->sched_cond;
 	pthread_cond_t *cond = &workerarg->sched_cond;
 	pthread_mutex_t *mutex = &workerarg->sched_mutex;
 	pthread_mutex_t *mutex = &workerarg->sched_mutex;
@@ -155,7 +205,7 @@ static void _starpu_init_worker_queue(struct starpu_worker_s *workerarg)
 	_starpu_memory_node_register_condition(cond, mutex, memory_node);
 	_starpu_memory_node_register_condition(cond, mutex, memory_node);
 }
 }
 
 
-static void _starpu_launch_drivers(struct starpu_machine_config_s *config)
+static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 {
 {
 	config->running = 1;
 	config->running = 1;
 
 
@@ -167,14 +217,14 @@ static void _starpu_launch_drivers(struct starpu_machine_config_s *config)
 	unsigned worker;
 	unsigned worker;
 	for (worker = 0; worker < nworkers; worker++)
 	for (worker = 0; worker < nworkers; worker++)
 	{
 	{
-		struct starpu_worker_s *workerarg = &config->workers[worker];
+		struct _starpu_worker *workerarg = &config->workers[worker];
 
 
 		workerarg->config = config;
 		workerarg->config = config;
 
 
 		_starpu_barrier_counter_init(&workerarg->tasks_barrier, 0);
 		_starpu_barrier_counter_init(&workerarg->tasks_barrier, 0);
 
 
-		PTHREAD_MUTEX_INIT(&workerarg->mutex, NULL);
-		PTHREAD_COND_INIT(&workerarg->ready_cond, NULL);
+		_STARPU_PTHREAD_MUTEX_INIT(&workerarg->mutex, NULL);
+		_STARPU_PTHREAD_COND_INIT(&workerarg->ready_cond, NULL);
 
 
 		workerarg->worker_size = 1;
 		workerarg->worker_size = 1;
 		workerarg->combined_workerid = workerarg->workerid;
 		workerarg->combined_workerid = workerarg->workerid;
@@ -184,30 +234,31 @@ static void _starpu_launch_drivers(struct starpu_machine_config_s *config)
 		/* we have a single local list */
 		/* we have a single local list */
 		/* afterwards there would be a mutex + cond for the list of each strategy */
 		/* afterwards there would be a mutex + cond for the list of each strategy */
 
 
-		PTHREAD_MUTEX_INIT(&workerarg->sched_mutex, NULL);
-		PTHREAD_COND_INIT(&workerarg->sched_cond, NULL);
+		_STARPU_PTHREAD_MUTEX_INIT(&workerarg->sched_mutex, NULL);
+		_STARPU_PTHREAD_COND_INIT(&workerarg->sched_cond, NULL);
 
 
 		/* if some codelet's termination cannot be handled directly :
 		/* if some codelet's termination cannot be handled directly :
 		 * for instance in the Gordon driver, Gordon tasks' callbacks
 		 * for instance in the Gordon driver, Gordon tasks' callbacks
 		 * may be executed by another thread than that of the Gordon
 		 * may be executed by another thread than that of the Gordon
 		 * driver so that we cannot call the push_codelet_output method
 		 * driver so that we cannot call the push_codelet_output method
 		 * directly */
 		 * directly */
-		workerarg->terminated_jobs = starpu_job_list_new();
+		workerarg->terminated_jobs = _starpu_job_list_new();
 
 
 		starpu_task_list_init(&workerarg->local_tasks);
 		starpu_task_list_init(&workerarg->local_tasks);
-	
+
 		workerarg->status = STATUS_INITIALIZING;
 		workerarg->status = STATUS_INITIALIZING;
 
 
 		_STARPU_DEBUG("initialising worker %u\n", worker);
 		_STARPU_DEBUG("initialising worker %u\n", worker);
 
 
 		_starpu_init_worker_queue(workerarg);
 		_starpu_init_worker_queue(workerarg);
 
 
-		switch (workerarg->arch) {
+		switch (workerarg->arch)
+		{
 #ifdef STARPU_USE_CPU
 #ifdef STARPU_USE_CPU
 			case STARPU_CPU_WORKER:
 			case STARPU_CPU_WORKER:
 				workerarg->set = NULL;
 				workerarg->set = NULL;
 				workerarg->worker_is_initialized = 0;
 				workerarg->worker_is_initialized = 0;
-				pthread_create(&workerarg->worker_thread, 
+				pthread_create(&workerarg->worker_thread,
 						NULL, _starpu_cpu_worker, workerarg);
 						NULL, _starpu_cpu_worker, workerarg);
 				break;
 				break;
 #endif
 #endif
@@ -215,7 +266,7 @@ static void _starpu_launch_drivers(struct starpu_machine_config_s *config)
 			case STARPU_CUDA_WORKER:
 			case STARPU_CUDA_WORKER:
 				workerarg->set = NULL;
 				workerarg->set = NULL;
 				workerarg->worker_is_initialized = 0;
 				workerarg->worker_is_initialized = 0;
-				pthread_create(&workerarg->worker_thread, 
+				pthread_create(&workerarg->worker_thread,
 						NULL, _starpu_cuda_worker, workerarg);
 						NULL, _starpu_cuda_worker, workerarg);
 
 
 				break;
 				break;
@@ -224,34 +275,34 @@ static void _starpu_launch_drivers(struct starpu_machine_config_s *config)
 			case STARPU_OPENCL_WORKER:
 			case STARPU_OPENCL_WORKER:
 				workerarg->set = NULL;
 				workerarg->set = NULL;
 				workerarg->worker_is_initialized = 0;
 				workerarg->worker_is_initialized = 0;
-				pthread_create(&workerarg->worker_thread, 
+				pthread_create(&workerarg->worker_thread,
 						NULL, _starpu_opencl_worker, workerarg);
 						NULL, _starpu_opencl_worker, workerarg);
 
 
 				break;
 				break;
 #endif
 #endif
 #ifdef STARPU_USE_GORDON
 #ifdef STARPU_USE_GORDON
 			case STARPU_GORDON_WORKER:
 			case STARPU_GORDON_WORKER:
-				/* we will only launch gordon once, but it will handle 
+				/* we will only launch gordon once, but it will handle
 				 * the different SPU workers */
 				 * the different SPU workers */
 				if (!gordon_inited)
 				if (!gordon_inited)
 				{
 				{
-					gordon_worker_set.nworkers = config->ngordon_spus; 
+					gordon_worker_set.nworkers = config->ngordon_spus;
 					gordon_worker_set.workers = &config->workers[worker];
 					gordon_worker_set.workers = &config->workers[worker];
 
 
 					gordon_worker_set.set_is_initialized = 0;
 					gordon_worker_set.set_is_initialized = 0;
 
 
-					pthread_create(&gordon_worker_set.worker_thread, NULL, 
+					pthread_create(&gordon_worker_set.worker_thread, NULL,
 							_starpu_gordon_worker, &gordon_worker_set);
 							_starpu_gordon_worker, &gordon_worker_set);
 
 
-					PTHREAD_MUTEX_LOCK(&gordon_worker_set.mutex);
+					_STARPU_PTHREAD_MUTEX_LOCK(&gordon_worker_set.mutex);
 					while (!gordon_worker_set.set_is_initialized)
 					while (!gordon_worker_set.set_is_initialized)
-						PTHREAD_COND_WAIT(&gordon_worker_set.ready_cond,
+						_STARPU_PTHREAD_COND_WAIT(&gordon_worker_set.ready_cond,
 									&gordon_worker_set.mutex);
 									&gordon_worker_set.mutex);
-					PTHREAD_MUTEX_UNLOCK(&gordon_worker_set.mutex);
+					_STARPU_PTHREAD_MUTEX_UNLOCK(&gordon_worker_set.mutex);
 
 
 					gordon_inited = 1;
 					gordon_inited = 1;
 				}
 				}
-				
+
 				workerarg->set = &gordon_worker_set;
 				workerarg->set = &gordon_worker_set;
 				gordon_worker_set.joined = 0;
 				gordon_worker_set.joined = 0;
 				workerarg->worker_is_running = 1;
 				workerarg->worker_is_running = 1;
@@ -265,16 +316,17 @@ static void _starpu_launch_drivers(struct starpu_machine_config_s *config)
 
 
 	for (worker = 0; worker < nworkers; worker++)
 	for (worker = 0; worker < nworkers; worker++)
 	{
 	{
-		struct starpu_worker_s *workerarg = &config->workers[worker];
+		struct _starpu_worker *workerarg = &config->workers[worker];
 
 
-		switch (workerarg->arch) {
+		switch (workerarg->arch)
+		{
 			case STARPU_CPU_WORKER:
 			case STARPU_CPU_WORKER:
 			case STARPU_CUDA_WORKER:
 			case STARPU_CUDA_WORKER:
-			case STARPU_OPENCL_WORKER:			  
-				PTHREAD_MUTEX_LOCK(&workerarg->mutex);
+			case STARPU_OPENCL_WORKER:
+				_STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
 				while (!workerarg->worker_is_initialized)
 				while (!workerarg->worker_is_initialized)
-					PTHREAD_COND_WAIT(&workerarg->ready_cond, &workerarg->mutex);
-				PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
+					_STARPU_PTHREAD_COND_WAIT(&workerarg->ready_cond, &workerarg->mutex);
+				_STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
 				break;
 				break;
 #ifdef STARPU_USE_GORDON
 #ifdef STARPU_USE_GORDON
 			case STARPU_GORDON_WORKER:
 			case STARPU_GORDON_WORKER:
@@ -289,14 +341,14 @@ static void _starpu_launch_drivers(struct starpu_machine_config_s *config)
 
 
 }
 }
 
 
-void _starpu_set_local_worker_key(struct starpu_worker_s *worker)
+void _starpu_set_local_worker_key(struct _starpu_worker *worker)
 {
 {
 	pthread_setspecific(worker_key, worker);
 	pthread_setspecific(worker_key, worker);
 }
 }
 
 
-struct starpu_worker_s *_starpu_get_local_worker_key(void)
+struct _starpu_worker *_starpu_get_local_worker_key(void)
 {
 {
-	return (struct starpu_worker_s *) pthread_getspecific(worker_key);
+	return (struct _starpu_worker *) pthread_getspecific(worker_key);
 }
 }
 
 
 /* Initialize the starpu_conf with default values */
 /* Initialize the starpu_conf with default values */
@@ -329,19 +381,49 @@ int starpu_init(struct starpu_conf *user_conf)
 {
 {
 	int ret;
 	int ret;
 
 
-	PTHREAD_MUTEX_LOCK(&init_mutex);
+#ifdef __GNUC__
+#ifndef __OPTIMIZE__
+	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --enable-debug (-O0), and is thus not optimized\n");
+#endif
+#endif
+#if 0
+#ifndef STARPU_NO_ASSERT
+	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured without --enable-fast\n");
+#endif
+#endif
+#ifdef STARPU_MEMORY_STATUS
+	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --enable-memory-status, which slows down a bit\n");
+#endif
+#ifdef STARPU_VERBOSE
+	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --enable-verbose, which slows down a bit\n");
+#endif
+#ifdef STARPU_USE_FXT
+	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --with-fxt, which slows down a bit\n");
+#endif
+#ifdef STARPU_PERF_DEBUG
+	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --enable-perf-debug, which slows down a bit\n");
+#endif
+#ifdef STARPU_MODEL_DEBUG
+	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --enable-model-debug, which slows down a bit\n");
+#endif
+#ifdef STARPU_DATA_STATS
+	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --enable-stats, which slows down a bit\n");
+#endif
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&init_mutex);
 	while (initialized == CHANGING)
 	while (initialized == CHANGING)
 		/* Wait for the other one changing it */
 		/* Wait for the other one changing it */
-		PTHREAD_COND_WAIT(&init_cond, &init_mutex);
+		_STARPU_PTHREAD_COND_WAIT(&init_cond, &init_mutex);
 	init_count++;
 	init_count++;
-	if (initialized == INITIALIZED) {
-	  /* He initialized it, don't do it again, and let the others get the mutex */
-	  PTHREAD_MUTEX_UNLOCK(&init_mutex);
-	  return 0;
-	  }
+	if (initialized == INITIALIZED)
+	{
+		/* He initialized it, don't do it again, and let the others get the mutex */
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&init_mutex);
+		return 0;
+	}
 	/* initialized == UNINITIALIZED */
 	/* initialized == UNINITIALIZED */
 	initialized = CHANGING;
 	initialized = CHANGING;
-	PTHREAD_MUTEX_UNLOCK(&init_mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&init_mutex);
 
 
 #ifdef __MINGW32__
 #ifdef __MINGW32__
 	WSADATA wsadata;
 	WSADATA wsadata;
@@ -351,7 +433,7 @@ int starpu_init(struct starpu_conf *user_conf)
 	srand(2008);
 	srand(2008);
 
 
 #ifdef STARPU_USE_FXT
 #ifdef STARPU_USE_FXT
-		_starpu_start_fxt_profiling();
+	_starpu_start_fxt_profiling();
 #endif
 #endif
 
 
 	_starpu_open_debug_logfile();
 	_starpu_open_debug_logfile();
@@ -370,21 +452,21 @@ int starpu_init(struct starpu_conf *user_conf)
 
 
 	_starpu_init_all_sched_ctxs(&config);
 	_starpu_init_all_sched_ctxs(&config);
 	ret = _starpu_build_topology(&config);
 	ret = _starpu_build_topology(&config);
-	if (ret) {
-		PTHREAD_MUTEX_LOCK(&init_mutex);
+	if (ret)
+	{
+		_STARPU_PTHREAD_MUTEX_LOCK(&init_mutex);
 		init_count--;
 		init_count--;
 		initialized = UNINITIALIZED;
 		initialized = UNINITIALIZED;
 		/* Let somebody else try to do it */
 		/* Let somebody else try to do it */
-		PTHREAD_COND_SIGNAL(&init_cond);
-		PTHREAD_MUTEX_UNLOCK(&init_mutex);
+		_STARPU_PTHREAD_COND_SIGNAL(&init_cond);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&init_mutex);
 		return ret;
 		return ret;
 	}
 	}
 
 
 	/* We need to store the current task handled by the different
 	/* We need to store the current task handled by the different
 	 * threads */
 	 * threads */
-	_starpu_initialize_current_task_key();	
+	_starpu_initialize_current_task_key();
 
 
-	/* initialize the scheduling policy */
 
 
 	struct starpu_sched_ctx *sched_ctx;
 	struct starpu_sched_ctx *sched_ctx;
 	if(user_conf == NULL)
 	if(user_conf == NULL)
@@ -398,20 +480,21 @@ int starpu_init(struct starpu_conf *user_conf)
 	/* Launch "basic" workers (ie. non-combined workers) */
 	/* Launch "basic" workers (ie. non-combined workers) */
 	_starpu_launch_drivers(&config);
 	_starpu_launch_drivers(&config);
 
 
-	PTHREAD_MUTEX_LOCK(&init_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&init_mutex);
 	initialized = INITIALIZED;
 	initialized = INITIALIZED;
 	/* Tell everybody that we initialized */
 	/* Tell everybody that we initialized */
-	PTHREAD_COND_BROADCAST(&init_cond);
-	PTHREAD_MUTEX_UNLOCK(&init_mutex);
+	_STARPU_PTHREAD_COND_BROADCAST(&init_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&init_mutex);
 
 
+	_STARPU_DEBUG("Initialisation finished\n");
 	return 0;
 	return 0;
 }
 }
 
 
 /*
 /*
- * Handle runtime termination 
+ * Handle runtime termination
  */
  */
 
 
-static void _starpu_terminate_workers(struct starpu_machine_config_s *config)
+static void _starpu_terminate_workers(struct _starpu_machine_config *config)
 {
 {
 	int status STARPU_ATTRIBUTE_UNUSED;
 	int status STARPU_ATTRIBUTE_UNUSED;
 	unsigned workerid;
 	unsigned workerid;
@@ -419,21 +502,24 @@ static void _starpu_terminate_workers(struct starpu_machine_config_s *config)
 	for (workerid = 0; workerid < config->topology.nworkers; workerid++)
 	for (workerid = 0; workerid < config->topology.nworkers; workerid++)
 	{
 	{
 		starpu_wake_all_blocked_workers();
 		starpu_wake_all_blocked_workers();
-		
+
 		_STARPU_DEBUG("wait for worker %u\n", workerid);
 		_STARPU_DEBUG("wait for worker %u\n", workerid);
 
 
-		struct starpu_worker_set_s *set = config->workers[workerid].set;
-		struct starpu_worker_s *worker = &config->workers[workerid];
+		struct _starpu_worker_set *set = config->workers[workerid].set;
+		struct _starpu_worker *worker = &config->workers[workerid];
 
 
 		/* in case StarPU termination code is called from a callback,
 		/* in case StarPU termination code is called from a callback,
  		 * we have to check if pthread_self() is the worker itself */
  		 * we have to check if pthread_self() is the worker itself */
-		if (set){ 
-			if (!set->joined) {
+		if (set)
+		{
+			if (!set->joined)
+			{
 				if (!pthread_equal(pthread_self(), set->worker_thread))
 				if (!pthread_equal(pthread_self(), set->worker_thread))
 				{
 				{
 					status = pthread_join(set->worker_thread, NULL);
 					status = pthread_join(set->worker_thread, NULL);
 #ifdef STARPU_VERBOSE
 #ifdef STARPU_VERBOSE
-					if (status) {
+					if (status)
+					{
 						_STARPU_DEBUG("pthread_join -> %d\n", status);
 						_STARPU_DEBUG("pthread_join -> %d\n", status);
                                         }
                                         }
 #endif
 #endif
@@ -442,12 +528,14 @@ static void _starpu_terminate_workers(struct starpu_machine_config_s *config)
 				set->joined = 1;
 				set->joined = 1;
 			}
 			}
 		}
 		}
-		else {
+		else
+		{
 			if (!pthread_equal(pthread_self(), worker->worker_thread))
 			if (!pthread_equal(pthread_self(), worker->worker_thread))
 			{
 			{
 				status = pthread_join(worker->worker_thread, NULL);
 				status = pthread_join(worker->worker_thread, NULL);
 #ifdef STARPU_VERBOSE
 #ifdef STARPU_VERBOSE
-				if (status) {
+				if (status)
+				{
 					_STARPU_DEBUG("pthread_join -> %d\n", status);
 					_STARPU_DEBUG("pthread_join -> %d\n", status);
                                 }
                                 }
 #endif
 #endif
@@ -455,12 +543,14 @@ static void _starpu_terminate_workers(struct starpu_machine_config_s *config)
 		}
 		}
 
 
 		STARPU_ASSERT(starpu_task_list_empty(&worker->local_tasks));
 		STARPU_ASSERT(starpu_task_list_empty(&worker->local_tasks));
-		starpu_job_list_delete(worker->terminated_jobs);
+		_starpu_job_list_delete(worker->terminated_jobs);
 	}
 	}
 }
 }
 
 
 unsigned _starpu_machine_is_running(void)
 unsigned _starpu_machine_is_running(void)
 {
 {
+	/* running is just protected by a memory barrier */
+	STARPU_SYNCHRONIZE();
 	return config.running;
 	return config.running;
 }
 }
 
 
@@ -484,26 +574,32 @@ unsigned _starpu_worker_can_block(unsigned memnode STARPU_ATTRIBUTE_UNUSED)
 #endif
 #endif
 }
 }
 
 
-static void _starpu_kill_all_workers(struct starpu_machine_config_s *config)
+static void _starpu_kill_all_workers(struct _starpu_machine_config *config)
 {
 {
 	/* set the flag which will tell workers to stop */
 	/* set the flag which will tell workers to stop */
 	config->running = 0;
 	config->running = 0;
+	/* running is just protected by a memory barrier */
+	STARPU_SYNCHRONIZE();
 	starpu_wake_all_blocked_workers();
 	starpu_wake_all_blocked_workers();
 }
 }
 
 
 void starpu_shutdown(void)
 void starpu_shutdown(void)
 {
 {
 	const char *stats;
 	const char *stats;
-	PTHREAD_MUTEX_LOCK(&init_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&init_mutex);
 	init_count--;
 	init_count--;
-	if (init_count){
-		PTHREAD_MUTEX_UNLOCK(&init_mutex);
-		/* Still somebody needing StarPU, don't deinitialize */
+	if (init_count)
+	{
+		_STARPU_DEBUG("Still somebody needing StarPU, don't deinitialize\n");
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&init_mutex);
 		return;
 		return;
 	}
 	}
+
 	/* We're last */
 	/* We're last */
 	initialized = CHANGING;
 	initialized = CHANGING;
-	PTHREAD_MUTEX_UNLOCK(&init_mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&init_mutex);
+
+	starpu_task_wait_for_no_ready();
 
 
 	_starpu_display_msi_stats();
 	_starpu_display_msi_stats();
 	_starpu_display_alloc_cache_stats();
 	_starpu_display_alloc_cache_stats();
@@ -511,6 +607,11 @@ void starpu_shutdown(void)
 	/* tell all workers to shutdown */
 	/* tell all workers to shutdown */
 	_starpu_kill_all_workers(&config);
 	_starpu_kill_all_workers(&config);
 
 
+#ifdef STARPU_MEMORY_STATUS
+	if ((stats = getenv("STARPU_MEMORY_STATS")) && atoi(stats))
+		_starpu_display_data_stats();
+#endif
+
 #ifdef STARPU_DATA_STATS
 #ifdef STARPU_DATA_STATS
 	_starpu_display_comm_amounts();
 	_starpu_display_comm_amounts();
 #endif
 #endif
@@ -535,13 +636,18 @@ void starpu_shutdown(void)
 
 
 	_starpu_data_interface_shutdown();
 	_starpu_data_interface_shutdown();
 
 
+	/* Drop all remaining tags */
+	_starpu_tag_clear();
+
 	_starpu_close_debug_logfile();
 	_starpu_close_debug_logfile();
 
 
-	PTHREAD_MUTEX_LOCK(&init_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&init_mutex);
 	initialized = UNINITIALIZED;
 	initialized = UNINITIALIZED;
 	/* Let someone else that wants to initialize it again do it */
 	/* Let someone else that wants to initialize it again do it */
-	PTHREAD_COND_SIGNAL(&init_cond);
-	PTHREAD_MUTEX_UNLOCK(&init_mutex);
+	_STARPU_PTHREAD_COND_SIGNAL(&init_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&init_mutex);
+
+	_STARPU_DEBUG("Shutdown finished\n");
 }
 }
 
 
 unsigned starpu_worker_get_count(void)
 unsigned starpu_worker_get_count(void)
@@ -602,14 +708,15 @@ unsigned starpu_spu_worker_get_count(void)
  * that is not controlled by StarPU, starpu_worker_get_id returns -1. */
  * that is not controlled by StarPU, starpu_worker_get_id returns -1. */
 int starpu_worker_get_id(void)
 int starpu_worker_get_id(void)
 {
 {
-	struct starpu_worker_s * worker;
+	struct _starpu_worker * worker;
 
 
 	worker = _starpu_get_local_worker_key();
 	worker = _starpu_get_local_worker_key();
 	if (worker)
 	if (worker)
 	{
 	{
 		return worker->workerid;
 		return worker->workerid;
 	}
 	}
-	else {
+	else
+	{
 		/* there is no worker associated to that thread, perhaps it is
 		/* there is no worker associated to that thread, perhaps it is
 		 * a thread from the application or this is some SPU worker */
 		 * a thread from the application or this is some SPU worker */
 		return -1;
 		return -1;
@@ -618,14 +725,15 @@ int starpu_worker_get_id(void)
 
 
 int starpu_combined_worker_get_id(void)
 int starpu_combined_worker_get_id(void)
 {
 {
-	struct starpu_worker_s *worker;
+	struct _starpu_worker *worker;
 
 
 	worker = _starpu_get_local_worker_key();
 	worker = _starpu_get_local_worker_key();
 	if (worker)
 	if (worker)
 	{
 	{
 		return worker->combined_workerid;
 		return worker->combined_workerid;
 	}
 	}
-	else {
+	else
+	{
 		/* there is no worker associated to that thread, perhaps it is
 		/* there is no worker associated to that thread, perhaps it is
 		 * a thread from the application or this is some SPU worker */
 		 * a thread from the application or this is some SPU worker */
 		return -1;
 		return -1;
@@ -634,14 +742,15 @@ int starpu_combined_worker_get_id(void)
 
 
 int starpu_combined_worker_get_size(void)
 int starpu_combined_worker_get_size(void)
 {
 {
-	struct starpu_worker_s *worker;
+	struct _starpu_worker *worker;
 
 
 	worker = _starpu_get_local_worker_key();
 	worker = _starpu_get_local_worker_key();
 	if (worker)
 	if (worker)
 	{
 	{
 		return worker->worker_size;
 		return worker->worker_size;
 	}
 	}
-	else {
+	else
+	{
 		/* there is no worker associated to that thread, perhaps it is
 		/* there is no worker associated to that thread, perhaps it is
 		 * a thread from the application or this is some SPU worker */
 		 * a thread from the application or this is some SPU worker */
 		return -1;
 		return -1;
@@ -650,14 +759,15 @@ int starpu_combined_worker_get_size(void)
 
 
 int starpu_combined_worker_get_rank(void)
 int starpu_combined_worker_get_rank(void)
 {
 {
-	struct starpu_worker_s *worker;
+	struct _starpu_worker *worker;
 
 
 	worker = _starpu_get_local_worker_key();
 	worker = _starpu_get_local_worker_key();
 	if (worker)
 	if (worker)
 	{
 	{
 		return worker->current_rank;
 		return worker->current_rank;
 	}
 	}
-	else {
+	else
+	{
 		/* there is no worker associated to that thread, perhaps it is
 		/* there is no worker associated to that thread, perhaps it is
 		 * a thread from the application or this is some SPU worker */
 		 * a thread from the application or this is some SPU worker */
 		return -1;
 		return -1;
@@ -669,18 +779,12 @@ int starpu_worker_get_devid(int id)
 	return config.workers[id].devid;
 	return config.workers[id].devid;
 }
 }
 
 
-struct starpu_worker_s *_starpu_get_worker_struct(unsigned id)
+struct _starpu_worker *_starpu_get_worker_struct(unsigned id)
 {
 {
 	return &config.workers[id];
 	return &config.workers[id];
 }
 }
 
 
-struct starpu_sched_ctx *_starpu_get_sched_ctx_struct(unsigned id)
-{
-	STARPU_ASSERT(id >= 0 && id <= STARPU_NMAX_SCHED_CTXS);
-	return &config.sched_ctxs[id];
-}
-
-struct starpu_combined_worker_s *_starpu_get_combined_worker_struct(unsigned id)
+struct _starpu_combined_worker *_starpu_get_combined_worker_struct(unsigned id)
 {
 {
 	unsigned basic_worker_count = starpu_worker_get_count();
 	unsigned basic_worker_count = starpu_worker_get_count();
 
 
@@ -723,14 +827,14 @@ void starpu_worker_get_name(int id, char *dst, size_t maxlen)
 }
 }
 
 
 /* Retrieve the status which indicates what the worker is currently doing. */
 /* Retrieve the status which indicates what the worker is currently doing. */
-starpu_worker_status _starpu_worker_get_status(int workerid)
+enum _starpu_worker_status _starpu_worker_get_status(int workerid)
 {
 {
 	return config.workers[workerid].status;
 	return config.workers[workerid].status;
 }
 }
 
 
 /* Change the status of the worker which indicates what the worker is currently
 /* Change the status of the worker which indicates what the worker is currently
  * doing (eg. executing a callback). */
  * doing (eg. executing a callback). */
-void _starpu_worker_set_status(int workerid, starpu_worker_status status)
+void _starpu_worker_set_status(int workerid, enum _starpu_worker_status status)
 {
 {
 	config.workers[workerid].status = status;
 	config.workers[workerid].status = status;
 }
 }

+ 37 - 32
src/core/workers.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  INRIA
  * Copyright (C) 2011  INRIA
  *
  *
@@ -54,15 +54,16 @@
 
 
 #include <starpu_parameters.h>
 #include <starpu_parameters.h>
 
 
-struct starpu_worker_s {
-	struct starpu_machine_config_s *config;
+struct _starpu_worker
+{
+	struct _starpu_machine_config *config;
         pthread_mutex_t mutex;
         pthread_mutex_t mutex;
 	enum starpu_archtype arch; /* what is the type of worker ? */
 	enum starpu_archtype arch; /* what is the type of worker ? */
 	uint32_t worker_mask; /* what is the type of worker ? */
 	uint32_t worker_mask; /* what is the type of worker ? */
 	enum starpu_perf_archtype perf_arch; /* in case there are different models of the same arch */
 	enum starpu_perf_archtype perf_arch; /* in case there are different models of the same arch */
 	pthread_t worker_thread; /* the thread which runs the worker */
 	pthread_t worker_thread; /* the thread which runs the worker */
 	int devid; /* which cpu/gpu/etc is controlled by the worker ? */
 	int devid; /* which cpu/gpu/etc is controlled by the worker ? */
-	int bindid; /* which cpu is the driver bound to ? */
+	int bindid; /* which cpu is the driver bound to ? (logical index) */
 	int workerid; /* uniquely identify the worker among all processing units types */
 	int workerid; /* uniquely identify the worker among all processing units types */
 	int combined_workerid; /* combined worker currently using this worker */
 	int combined_workerid; /* combined worker currently using this worker */
 	int current_rank; /* current rank in case the worker is used in a parallel fashion */
 	int current_rank; /* current rank in case the worker is used in a parallel fashion */
@@ -72,11 +73,12 @@ struct starpu_worker_s {
 	pthread_cond_t sched_cond; /* condition variable used when the worker waits for tasks. */
 	pthread_cond_t sched_cond; /* condition variable used when the worker waits for tasks. */
 	pthread_mutex_t sched_mutex; /* mutex protecting sched_cond */
 	pthread_mutex_t sched_mutex; /* mutex protecting sched_cond */
 	struct starpu_task_list local_tasks; /* this queue contains tasks that have been explicitly submitted to that queue */
 	struct starpu_task_list local_tasks; /* this queue contains tasks that have been explicitly submitted to that queue */
-	struct starpu_worker_set_s *set; /* in case this worker belongs to a set */
-	struct starpu_job_list_s *terminated_jobs; /* list of pending jobs which were executed */
+	struct starpu_task *current_task; /* task currently executed by this worker */
+	struct _starpu_worker_set *set; /* in case this worker belongs to a set */
+	struct _starpu_job_list *terminated_jobs; /* list of pending jobs which were executed */
 	unsigned worker_is_running;
 	unsigned worker_is_running;
 	unsigned worker_is_initialized;
 	unsigned worker_is_initialized;
-	starpu_worker_status status; /* what is the worker doing now ? (eg. CALLBACK) */
+	enum _starpu_worker_status status; /* what is the worker doing now ? (eg. CALLBACK) */
 	char name[48];
 	char name[48];
 	char short_name[10];
 	char short_name[10];
 
 
@@ -96,7 +98,8 @@ struct starpu_worker_s {
 #endif
 #endif
 };
 };
 
 
-struct starpu_combined_worker_s {
+struct _starpu_combined_worker
+{
 	enum starpu_perf_archtype perf_arch; /* in case there are different models of the same arch */
 	enum starpu_perf_archtype perf_arch; /* in case there are different models of the same arch */
 	uint32_t worker_mask; /* what is the type of workers ? */
 	uint32_t worker_mask; /* what is the type of workers ? */
 	int worker_size;
 	int worker_size;
@@ -111,22 +114,23 @@ struct starpu_combined_worker_s {
 #endif
 #endif
 };
 };
 
 
-/* in case a single CPU worker may control multiple 
+/* in case a single CPU worker may control multiple
  * accelerators (eg. Gordon for n SPUs) */
  * accelerators (eg. Gordon for n SPUs) */
-struct starpu_worker_set_s {
+struct _starpu_worker_set
+{
         pthread_mutex_t mutex;
         pthread_mutex_t mutex;
 	pthread_t worker_thread; /* the thread which runs the worker */
 	pthread_t worker_thread; /* the thread which runs the worker */
 	unsigned nworkers;
 	unsigned nworkers;
 	unsigned joined; /* only one thread may call pthread_join*/
 	unsigned joined; /* only one thread may call pthread_join*/
 	void *retval;
 	void *retval;
-	struct starpu_worker_s *workers;
+	struct _starpu_worker *workers;
         pthread_cond_t ready_cond; /* indicate when the set is ready */
         pthread_cond_t ready_cond; /* indicate when the set is ready */
 	unsigned set_is_initialized;
 	unsigned set_is_initialized;
 };
 };
 
 
-struct starpu_machine_config_s {
-
-	struct starpu_machine_topology_s topology;
+struct _starpu_machine_config
+{
+	struct starpu_machine_topology topology;
 
 
 #ifdef STARPU_HAVE_HWLOC
 #ifdef STARPU_HAVE_HWLOC
 	int cpu_depth;
 	int cpu_depth;
@@ -134,20 +138,20 @@ struct starpu_machine_config_s {
 
 
 	/* Where to bind workers ? */
 	/* Where to bind workers ? */
 	int current_bindid;
 	int current_bindid;
-	
+
 	/* Which GPU(s) do we use for CUDA ? */
 	/* Which GPU(s) do we use for CUDA ? */
 	int current_cuda_gpuid;
 	int current_cuda_gpuid;
 
 
 	/* Which GPU(s) do we use for OpenCL ? */
 	/* Which GPU(s) do we use for OpenCL ? */
 	int current_opencl_gpuid;
 	int current_opencl_gpuid;
-	
+
 	/* Basic workers : each of these workers runs its own driver and
 	/* Basic workers : each of these workers runs its own driver and
 	 * can be combined with other basic workers. */
 	 * can be combined with other basic workers. */
-	struct starpu_worker_s workers[STARPU_NMAXWORKERS];
+	struct _starpu_worker workers[STARPU_NMAXWORKERS];
 
 
 	/* Combined workers: these workers are a combination of basic workers
 	/* Combined workers: these workers are a combination of basic workers
 	 * that can run parallel tasks together. */
 	 * that can run parallel tasks together. */
-	struct starpu_combined_worker_s combined_workers[STARPU_NMAX_COMBINEDWORKERS];
+	struct _starpu_combined_worker combined_workers[STARPU_NMAX_COMBINEDWORKERS];
 
 
 	/* This bitmask indicates which kinds of worker are available. For
 	/* This bitmask indicates which kinds of worker are available. For
 	 * instance it is possible to test if there is a CUDA worker with
 	 * instance it is possible to test if there is a CUDA worker with
@@ -169,16 +173,16 @@ struct starpu_machine_config_s {
 unsigned _starpu_machine_is_running(void);
 unsigned _starpu_machine_is_running(void);
 
 
 /* Check if there is a worker that may execute the task. */
 /* Check if there is a worker that may execute the task. */
-uint32_t _starpu_worker_exists(uint32_t task_mask);
+uint32_t _starpu_worker_exists(struct starpu_task *);
 
 
 /* Is there a worker that can execute CUDA code ? */
 /* Is there a worker that can execute CUDA code ? */
-uint32_t _starpu_may_submit_cuda_task(void);
+uint32_t _starpu_can_submit_cuda_task(void);
 
 
 /* Is there a worker that can execute CPU code ? */
 /* Is there a worker that can execute CPU code ? */
-uint32_t _starpu_may_submit_cpu_task(void);
+uint32_t _starpu_can_submit_cpu_task(void);
 
 
 /* Is there a worker that can execute OpenCL code ? */
 /* Is there a worker that can execute OpenCL code ? */
-uint32_t _starpu_may_submit_opencl_task(void);
+uint32_t _starpu_can_submit_opencl_task(void);
 
 
 /* Check whether there is anything that the worker should do instead of
 /* Check whether there is anything that the worker should do instead of
  * sleeping (waiting on something to happen). */
  * sleeping (waiting on something to happen). */
@@ -189,36 +193,37 @@ unsigned _starpu_worker_can_block(unsigned memnode);
  * */
  * */
 void _starpu_block_worker(int workerid, pthread_cond_t *cond, pthread_mutex_t *mutex);
 void _starpu_block_worker(int workerid, pthread_cond_t *cond, pthread_mutex_t *mutex);
 
 
-/* The starpu_worker_s structure describes all the state of a StarPU worker.
+/* The _starpu_worker structure describes all the state of a StarPU worker.
  * This function sets the pthread key which stores a pointer to this structure.
  * This function sets the pthread key which stores a pointer to this structure.
  * */
  * */
-void _starpu_set_local_worker_key(struct starpu_worker_s *worker);
+void _starpu_set_local_worker_key(struct _starpu_worker *worker);
 
 
-/* Returns the starpu_worker_s structure that describes the state of the
+/* Returns the _starpu_worker structure that describes the state of the
  * current worker. */
  * current worker. */
-struct starpu_worker_s *_starpu_get_local_worker_key(void);
+struct _starpu_worker *_starpu_get_local_worker_key(void);
 
 
-/* Returns the starpu_worker_s structure that describes the state of the
+/* Returns the _starpu_worker structure that describes the state of the
  * specified worker. */
  * specified worker. */
-struct starpu_worker_s *_starpu_get_worker_struct(unsigned id);
+struct _starpu_worker *_starpu_get_worker_struct(unsigned id);
 
 
 /* Returns the starpu_sched_ctx structure that describes the state of the
 /* Returns the starpu_sched_ctx structure that describes the state of the
  * specified ctx */
  * specified ctx */
 struct starpu_sched_ctx *_starpu_get_sched_ctx_struct(unsigned id);
 struct starpu_sched_ctx *_starpu_get_sched_ctx_struct(unsigned id);
 
 
+struct _starpu_combined_worker *_starpu_get_combined_worker_struct(unsigned id);
 
 
-struct starpu_combined_worker_s *_starpu_get_combined_worker_struct(unsigned id);
+int _starpu_is_initialized(void);
 
 
 /* Returns the structure that describes the overall machine configuration (eg.
 /* Returns the structure that describes the overall machine configuration (eg.
  * all workers and topology). */
  * all workers and topology). */
-struct starpu_machine_config_s *_starpu_get_machine_config(void);
+struct _starpu_machine_config *_starpu_get_machine_config(void);
 
 
 /* Retrieve the status which indicates what the worker is currently doing. */
 /* Retrieve the status which indicates what the worker is currently doing. */
-starpu_worker_status _starpu_worker_get_status(int workerid);
+enum _starpu_worker_status _starpu_worker_get_status(int workerid);
 
 
 /* Change the status of the worker which indicates what the worker is currently
 /* Change the status of the worker which indicates what the worker is currently
  * doing (eg. executing a callback). */
  * doing (eg. executing a callback). */
-void _starpu_worker_set_status(int workerid, starpu_worker_status status);
+void _starpu_worker_set_status(int workerid, enum _starpu_worker_status status);
 
 
 /* TODO move */
 /* TODO move */
 unsigned _starpu_execute_registered_progression_hooks(void);
 unsigned _starpu_execute_registered_progression_hooks(void);