13 年之前 · 131269917d
--- a/src/.gitignore
+++ b/src/.gitignore
@@ -0,0 +1 @@
 
				+/.deps
			
--- a/src/common/.gitignore
+++ b/src/common/.gitignore
@@ -0,0 +1,3 @@
 
				+/stamp-h1
			
 
				+/config.h
			
 
				+/config.h.in
			
--- a/src/common/barrier.c
+++ b/src/common/barrier.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010,2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -15,20 +15,46 @@
 
				  */
			
 
				 
			
 
				 #include <common/barrier.h>
			
 
				+#include <common/utils.h>
			
 
				 
			
 
				 int _starpu_barrier_init(_starpu_barrier_t *barrier, int count)
			
 
				 {
			
 
				 	barrier->count = count;
			
 
				-	barrier->reached = 0;
			
 
				-	pthread_mutex_init(&barrier->mutex,NULL);
			
 
				-	pthread_cond_init(&barrier->cond,NULL);
			
 
				+	barrier->reached_start = 0;
			
 
				+	barrier->reached_exit = 0;
			
 
				+	PTHREAD_MUTEX_INIT(&barrier->mutex, NULL);
			
 
				+	PTHREAD_MUTEX_INIT(&barrier->mutex_exit, NULL);
			
 
				+	PTHREAD_COND_INIT(&barrier->cond, NULL);
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+static
			
 
				+int _starpu_barrier_test(_starpu_barrier_t *barrier)
			
 
				+{
			
 
				+    /*
			
 
				+     * Check whether any threads are known to be waiting; report
			
 
				+     * "BUSY" if so.
			
 
				+     */
			
 
				+        PTHREAD_MUTEX_LOCK(&barrier->mutex_exit);
			
 
				+        if (barrier->reached_exit != barrier->count) {
			
 
				+                PTHREAD_MUTEX_UNLOCK(&barrier->mutex_exit);
			
 
				+                return EBUSY;
			
 
				+        }
			
 
				+        PTHREAD_MUTEX_UNLOCK(&barrier->mutex_exit);
			
 
				+        return 0;
			
 
				+}
			
 
				+
			
 
				 int _starpu_barrier_destroy(_starpu_barrier_t *barrier)
			
 
				 {
			
 
				-	pthread_mutex_destroy(&barrier->mutex);
			
 
				-	pthread_cond_destroy(&barrier->cond);
			
 
				+	int ret = _starpu_barrier_test(barrier);
			
 
				+	while (ret == EBUSY) {
			
 
				+		ret = _starpu_barrier_test(barrier);
			
 
				+	}
			
 
				+	_STARPU_DEBUG("reached_exit %d\n", barrier->reached_exit);
			
 
				+
			
 
				+	PTHREAD_MUTEX_DESTROY(&barrier->mutex);
			
 
				+	PTHREAD_MUTEX_DESTROY(&barrier->mutex_exit);
			
 
				+	PTHREAD_COND_DESTROY(&barrier->cond);
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
@@ -36,18 +62,26 @@ int _starpu_barrier_wait(_starpu_barrier_t *barrier)
 
				 {
			
 
				 	int ret=0;
			
 
				 
			
 
				-	pthread_mutex_lock(&barrier->mutex);
			
 
				-	barrier->reached++;
			
 
				-	if (barrier->reached == barrier->count)
			
 
				+        // Wait until all threads enter the barrier
			
 
				+	PTHREAD_MUTEX_LOCK(&barrier->mutex);
			
 
				+	barrier->reached_exit=0;
			
 
				+	barrier->reached_start++;
			
 
				+	if (barrier->reached_start == barrier->count)
			
 
				 	{
			
 
				-		barrier->reached = 0;
			
 
				-		pthread_cond_broadcast(&barrier->cond);
			
 
				+		barrier->reached_start = 0;
			
 
				+		PTHREAD_COND_BROADCAST(&barrier->cond);
			
 
				 		ret = PTHREAD_BARRIER_SERIAL_THREAD;
			
 
				 	}
			
 
				 	else
			
 
				 	{
			
 
				-		pthread_cond_wait(&barrier->cond,&barrier->mutex);
			
 
				+                PTHREAD_COND_WAIT(&barrier->cond,&barrier->mutex);
			
 
				 	}
			
 
				-	pthread_mutex_unlock(&barrier->mutex);
			
 
				+	PTHREAD_MUTEX_UNLOCK(&barrier->mutex);
			
 
				+
			
 
				+        // Count number of threads that exit the barrier
			
 
				+	PTHREAD_MUTEX_LOCK(&barrier->mutex_exit);
			
 
				+	barrier->reached_exit ++;
			
 
				+	PTHREAD_MUTEX_UNLOCK(&barrier->mutex_exit);
			
 
				+
			
 
				 	return ret;
			
 
				 }
			
--- a/src/common/barrier.h
+++ b/src/common/barrier.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -21,8 +21,10 @@
 
				 
			
 
				 typedef struct {
			
 
				 	int count;
			
 
				-	int reached;
			
 
				+	int reached_start;
			
 
				+	int reached_exit;
			
 
				 	pthread_mutex_t mutex;
			
 
				+	pthread_mutex_t mutex_exit;
			
 
				 	pthread_cond_t cond;
			
 
				 } _starpu_barrier_t;
			
 
				 
			
--- a/src/common/fxt.c
+++ b/src/common/fxt.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -17,8 +17,11 @@
 
				 
			
 
				 #include <starpu.h>
			
 
				 #include <common/config.h>
			
 
				-#ifdef STARPU_USE_FXT
			
 
				+#include <common/utils.h>
			
 
				+#include <starpu_util.h>
			
 
				+#include <starpu_profiling.h>
			
 
				 
			
 
				+#ifdef STARPU_USE_FXT
			
 
				 #include <common/fxt.h>
			
 
				 
			
 
				 #ifdef STARPU_HAVE_WINDOWS
			
@@ -32,39 +35,44 @@ static int fxt_started = 0;
 
				 
			
 
				 static int written = 0;
			
 
				 
			
 
				-static void profile_set_tracefile(char *fmt, ...)
			
 
				+static int id;
			
 
				+
			
 
				+static void _profile_set_tracefile(void *last, ...)
			
 
				 {
			
 
				 	va_list vl;
			
 
				 	char *user;
			
 
				-	
			
 
				-	va_start(vl, fmt);
			
 
				-	vsprintf(PROF_FILE_USER, fmt, vl);
			
 
				+
			
 
				+        char *fxt_prefix = getenv("STARPU_FXT_PREFIX");
			
 
				+        if (!fxt_prefix)
			
 
				+			fxt_prefix = "/tmp/";
			
 
				+
			
 
				+	va_start(vl, last);
			
 
				+	vsprintf(PROF_FILE_USER, fxt_prefix, vl);
			
 
				 	va_end(vl);
			
 
				 
			
 
				 	user = getenv("USER");
			
 
				 	if (!user)
			
 
				 		user = "";
			
 
				 
			
 
				-	int pid = getpid();
			
 
				-
			
 
				 	char suffix[128];
			
 
				-	snprintf(suffix, 128, "prof_file_%s_%d", user, pid);
			
 
				+	snprintf(suffix, 128, "prof_file_%s_%d", user, id);
			
 
				 
			
 
				 	strcat(PROF_FILE_USER, suffix);
			
 
				 }
			
 
				 
			
 
				+void starpu_set_profiling_id(int new_id) {
			
 
				+        _STARPU_DEBUG("Set id to <%d>\n", new_id);
			
 
				+	id = new_id;
			
 
				+        _profile_set_tracefile(NULL);
			
 
				+}
			
 
				+
			
 
				 void _starpu_start_fxt_profiling(void)
			
 
				 {
			
 
				 	unsigned threadid;
			
 
				 
			
 
				 	if (!fxt_started) {
			
 
				 		fxt_started = 1;
			
 
				-
			
 
				-		char *fxt_prefix = getenv("STARPU_FXT_PREFIX");
			
 
				-		if (!fxt_prefix)
			
 
				-			fxt_prefix = "/tmp/";
			
 
				-
			
 
				-		profile_set_tracefile(fxt_prefix);
			
 
				+		_profile_set_tracefile(NULL);
			
 
				 	}
			
 
				 
			
 
				 	threadid = syscall(SYS_gettid);
			
@@ -81,6 +89,23 @@ void _starpu_start_fxt_profiling(void)
 
				 	return;
			
 
				 }
			
 
				 
			
 
				+static void generate_paje_trace(char *input_fxt_filename, char *output_paje_filename)
			
 
				+{
			
 
				+	/* We take default options */
			
 
				+	struct starpu_fxt_options options;
			
 
				+	starpu_fxt_options_init(&options);
			
 
				+
			
 
				+	/* TODO parse some STARPU_GENERATE_TRACE_OPTIONS env variable */
			
 
				+
			
 
				+	options.ninputfiles = 1;
			
 
				+	options.filenames[0] = input_fxt_filename;
			
 
				+	options.out_paje_path = output_paje_filename;
			
 
				+	options.file_prefix = "";
			
 
				+	options.file_rank = -1;
			
 
				+
			
 
				+	starpu_fxt_generate_trace(&options);
			
 
				+}
			
 
				+
			
 
				 void _starpu_stop_fxt_profiling(void)
			
 
				 {
			
 
				 	if (!written)
			
@@ -92,6 +117,11 @@ void _starpu_stop_fxt_profiling(void)
 
				 #endif
			
 
				 		fut_endup(PROF_FILE_USER);
			
 
				 
			
 
				+		/* Should we generate a Paje trace directly ? */
			
 
				+		int generate_trace = starpu_get_env_number("STARPU_GENERATE_TRACE");
			
 
				+		if (generate_trace == 1)
			
 
				+			generate_paje_trace(PROF_FILE_USER, "paje.trace");
			
 
				+
			
 
				 		int ret = fut_done();
			
 
				 		if (ret < 0)
			
 
				 		{
			
@@ -109,9 +139,9 @@ void _starpu_fxt_register_thread(unsigned cpuid)
 
				 	FUT_DO_PROBE2(FUT_NEW_LWP_CODE, cpuid, syscall(SYS_gettid));
			
 
				 }
			
 
				 
			
 
				-#endif
			
 
				+#endif // STARPU_USE_FXT
			
 
				 
			
 
				-void starpu_trace_user_event(unsigned long code __attribute__((unused)))
			
 
				+void starpu_trace_user_event(unsigned long code STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 #ifdef STARPU_USE_FXT
			
 
				 	STARPU_TRACE_USER_EVENT(code);
			
--- a/src/common/fxt.h
+++ b/src/common/fxt.h
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -117,33 +117,58 @@ void _starpu_fxt_register_thread(unsigned);
 
				 /* Sometimes we need something a little more specific than the wrappers from
			
 
				  * FxT: these macro permit to put add an event with 3 (or 4) numbers followed
			
 
				  * by a string. */
			
 
				-#define STARPU_FUT_DO_PROBE3STR(CODE, P1, P2, P3, str)				\
			
 
				+#define STARPU_FUT_DO_PROBE3STR(CODE, P1, P2, P3, str)			\
			
 
				 do {									\
			
 
				+	/* No more than FXT_MAX_PARAMS args are allowed */		\
			
 
				 	/* we add a \0 just in case ... */				\
			
 
				-	size_t len = strlen((str)) + 1;					\
			
 
				-	unsigned nbargs = 3 + (len + sizeof(unsigned long) - 1)/(sizeof(unsigned long));\
			
 
				+	size_t len = STARPU_MIN(strlen(str)+1, (FXT_MAX_PARAMS - 3)*sizeof(unsigned long));\
			
 
				+	unsigned nbargs_str = (len + sizeof(unsigned long) - 1)/(sizeof(unsigned long));\
			
 
				+	unsigned nbargs = 3 + nbargs_str;				\
			
 
				 	size_t total_len = FUT_SIZE(nbargs);				\
			
 
				-	unsigned long *args =						\
			
 
				+	unsigned long *futargs =					\
			
 
				 		fut_getstampedbuffer(FUT_CODE(CODE, nbargs), total_len);\
			
 
				-	*(args++) = (unsigned long)(P1);				\
			
 
				-	*(args++) = (unsigned long)(P2);				\
			
 
				-	*(args++) = (unsigned long)(P3);				\
			
 
				-	sprintf((char *)args, "%s", str);				\
			
 
				+	*(futargs++) = (unsigned long)(P1);				\
			
 
				+	*(futargs++) = (unsigned long)(P2);				\
			
 
				+	*(futargs++) = (unsigned long)(P3);				\
			
 
				+	snprintf((char *)futargs, len, "%s", str);			\
			
 
				+	((char *)futargs)[len - 1] = '\0';				\
			
 
				 } while (0);
			
 
				 
			
 
				 #define STARPU_FUT_DO_PROBE4STR(CODE, P1, P2, P3, P4, str)		\
			
 
				 do {									\
			
 
				+	/* No more than FXT_MAX_PARAMS args are allowed */		\
			
 
				 	/* we add a \0 just in case ... */				\
			
 
				-	size_t len = strlen((str)) + 1;					\
			
 
				-	unsigned nbargs = 4 + (len + sizeof(unsigned long) - 1)/(sizeof(unsigned long));\
			
 
				+	size_t len = STARPU_MIN(strlen(str)+1, (FXT_MAX_PARAMS - 4)*sizeof(unsigned long));\
			
 
				+	unsigned nbargs_str = (len + sizeof(unsigned long) - 1)/(sizeof(unsigned long));\
			
 
				+	unsigned nbargs = 4 + nbargs_str;				\
			
 
				 	size_t total_len = FUT_SIZE(nbargs);				\
			
 
				-	unsigned long *args =						\
			
 
				+	unsigned long *futargs =						\
			
 
				 		fut_getstampedbuffer(FUT_CODE(CODE, nbargs), total_len);\
			
 
				-	*(args++) = (unsigned long)(P1);				\
			
 
				-	*(args++) = (unsigned long)(P2);				\
			
 
				-	*(args++) = (unsigned long)(P3);				\
			
 
				-	*(args++) = (unsigned long)(P4);				\
			
 
				-	sprintf((char *)args, "%s", str);				\
			
 
				+	*(futargs++) = (unsigned long)(P1);				\
			
 
				+	*(futargs++) = (unsigned long)(P2);				\
			
 
				+	*(futargs++) = (unsigned long)(P3);				\
			
 
				+	*(futargs++) = (unsigned long)(P4);				\
			
 
				+	snprintf((char *)futargs, len, "%s", str);			\
			
 
				+	((char *)futargs)[len - 1] = '\0';				\
			
 
				+} while (0);
			
 
				+
			
 
				+#define STARPU_FUT_DO_PROBE5STR(CODE, P1, P2, P3, P4, P5, str)		\
			
 
				+do {									\
			
 
				+	/* No more than FXT_MAX_PARAMS args are allowed */		\
			
 
				+	/* we add a \0 just in case ... */				\
			
 
				+	size_t len = STARPU_MIN(strlen(str)+1, (FXT_MAX_PARAMS - 5)*sizeof(unsigned long));\
			
 
				+	unsigned nbargs_str = (len + sizeof(unsigned long) - 1)/(sizeof(unsigned long));\
			
 
				+	unsigned nbargs = 5 + nbargs_str;				\
			
 
				+	size_t total_len = FUT_SIZE(nbargs);				\
			
 
				+	unsigned long *futargs =					\
			
 
				+		fut_getstampedbuffer(FUT_CODE(CODE, nbargs), total_len);\
			
 
				+	*(futargs++) = (unsigned long)(P1);				\
			
 
				+	*(futargs++) = (unsigned long)(P2);				\
			
 
				+	*(futargs++) = (unsigned long)(P3);				\
			
 
				+	*(futargs++) = (unsigned long)(P4);				\
			
 
				+	*(futargs++) = (unsigned long)(P5);				\
			
 
				+	snprintf((char *)futargs, len, "%s", str);			\
			
 
				+	((char *)futargs)[len - 1] = '\0';				\
			
 
				 } while (0);
			
 
				 
			
 
				 
			
@@ -160,7 +185,7 @@ do {									\
 
				 
			
 
				 #define STARPU_TRACE_START_CODELET_BODY(job)				\
			
 
				 do {									\
			
 
				-        const char *model_name = _starpu_get_model_name((job));               \
			
 
				+        const char *model_name = _starpu_get_model_name((job));         \
			
 
				 	if (model_name)                                                 \
			
 
				 	{								\
			
 
				 		/* we include the symbol name */			\
			
@@ -171,9 +196,12 @@ do {									\
 
				 	}								\
			
 
				 } while(0);
			
 
				 
			
 
				-
			
 
				-#define STARPU_TRACE_END_CODELET_BODY(job)	\
			
 
				-	FUT_DO_PROBE2(STARPU_FUT_END_CODELET_BODY, job, syscall(SYS_gettid));
			
 
				+#define STARPU_TRACE_END_CODELET_BODY(job, archtype)			\
			
 
				+do {									\
			
 
				+	const size_t job_size = _starpu_job_get_data_size((job));	\
			
 
				+	const uint32_t job_hash = _starpu_compute_buffers_footprint(job);\
			
 
				+	FUT_DO_PROBE5(STARPU_FUT_END_CODELET_BODY, job, (job_size), (job_hash), (archtype), syscall(SYS_gettid));	\
			
 
				+} while(0);
			
 
				 
			
 
				 #define STARPU_TRACE_START_CALLBACK(job)	\
			
 
				 	FUT_DO_PROBE2(STARPU_FUT_START_CALLBACK, job, syscall(SYS_gettid));
			
@@ -315,7 +343,7 @@ do {										\
 
				 #define STARPU_TRACE_WORKER_INIT_START(a,b,c)	do {} while(0);
			
 
				 #define STARPU_TRACE_WORKER_INIT_END		do {} while(0);
			
 
				 #define STARPU_TRACE_START_CODELET_BODY(job)	do {} while(0);
			
 
				-#define STARPU_TRACE_END_CODELET_BODY(job)	do {} while(0);
			
 
				+#define STARPU_TRACE_END_CODELET_BODY(job, a)	do {} while(0);
			
 
				 #define STARPU_TRACE_START_CALLBACK(job)	do {} while(0);
			
 
				 #define STARPU_TRACE_END_CALLBACK(job)		do {} while(0);
			
 
				 #define STARPU_TRACE_JOB_PUSH(task, prio)	do {} while(0);
			
--- a/src/common/starpu_spinlock.c
+++ b/src/common/starpu_spinlock.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -44,7 +44,7 @@ int _starpu_spin_init(starpu_spinlock_t *lock)
 
				 #endif
			
 
				 }
			
 
				 
			
 
				-int _starpu_spin_destroy(starpu_spinlock_t *lock)
			
 
				+int _starpu_spin_destroy(starpu_spinlock_t *lock STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 #ifdef STARPU_SPINLOCK_CHECK
			
 
				 	pthread_mutexattr_destroy(&lock->errcheck_attr);
			
@@ -101,7 +101,7 @@ int _starpu_spin_trylock(starpu_spinlock_t *lock)
 
				 #endif
			
 
				 }
			
 
				 
			
 
				-int _starpu_spin_unlock(starpu_spinlock_t *lock)
			
 
				+int _starpu_spin_unlock(starpu_spinlock_t *lock STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 #ifdef STARPU_SPINLOCK_CHECK
			
 
				 	int ret = pthread_mutex_unlock(&lock->errcheck_lock);
			
--- a/src/common/uthash.h
+++ b/src/common/uthash.h
@@ -0,0 +1,972 @@
 
				+/*
			
 
				+Copyright (c) 2003-2010, Troy D. Hanson     http://uthash.sourceforge.net
			
 
				+All rights reserved.
			
 
				+
			
 
				+Redistribution and use in source and binary forms, with or without
			
 
				+modification, are permitted provided that the following conditions are met:
			
 
				+
			
 
				+    * Redistributions of source code must retain the above copyright
			
 
				+      notice, this list of conditions and the following disclaimer.
			
 
				+
			
 
				+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
			
 
				+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
			
 
				+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
			
 
				+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
			
 
				+OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
			
 
				+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
			
 
				+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
			
 
				+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
			
 
				+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
			
 
				+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
			
 
				+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
			
 
				+*/
			
 
				+
			
 
				+#ifndef UTHASH_H
			
 
				+#define UTHASH_H 
			
 
				+
			
 
				+#include <string.h>   /* memcmp,strlen */
			
 
				+#include <stddef.h>   /* ptrdiff_t */
			
 
				+
			
 
				+/* These macros use decltype or the earlier __typeof GNU extension.
			
 
				+   As decltype is only available in newer compilers (VS2010 or gcc 4.3+
			
 
				+   when compiling c++ source) this code uses whatever method is needed
			
 
				+   or, for VS2008 where neither is available, uses casting workarounds. */
			
 
				+#ifdef _MSC_VER         /* MS compiler */
			
 
				+#if _MSC_VER >= 1600 && defined(__cplusplus)  /* VS2010 or newer in C++ mode */
			
 
				+#define DECLTYPE(x) (decltype(x))
			
 
				+#else                   /* VS2008 or older (or VS2010 in C mode) */
			
 
				+#define NO_DECLTYPE
			
 
				+#define DECLTYPE(x)
			
 
				+#endif
			
 
				+#else                   /* GNU, Sun and other compilers */
			
 
				+#define DECLTYPE(x) (__typeof(x))
			
 
				+#endif
			
 
				+
			
 
				+#ifdef NO_DECLTYPE
			
 
				+#define DECLTYPE_ASSIGN(dst,src)                                                 \
			
 
				+do {                                                                             \
			
 
				+  char **_da_dst = (char**)(&(dst));                                             \
			
 
				+  *_da_dst = (char*)(src);                                                       \
			
 
				+} while(0)
			
 
				+#else 
			
 
				+#define DECLTYPE_ASSIGN(dst,src)                                                 \
			
 
				+do {                                                                             \
			
 
				+  (dst) = DECLTYPE(dst)(src);                                                    \
			
 
				+} while(0)
			
 
				+#endif
			
 
				+
			
 
				+/* a number of the hash function use uint32_t which isn't defined on win32 */
			
 
				+#ifdef _MSC_VER
			
 
				+typedef unsigned int uint32_t;
			
 
				+#else
			
 
				+#include <inttypes.h>   /* uint32_t */
			
 
				+#endif
			
 
				+
			
 
				+#define UTHASH_VERSION 1.9.3
			
 
				+
			
 
				+#define uthash_fatal(msg) exit(-1)        /* fatal error (out of memory,etc) */
			
 
				+#define uthash_malloc(sz) malloc(sz)      /* malloc fcn                      */
			
 
				+#define uthash_free(ptr,sz) free(ptr)     /* free fcn                        */
			
 
				+
			
 
				+#define uthash_noexpand_fyi(tbl)          /* can be defined to log noexpand  */
			
 
				+#define uthash_expand_fyi(tbl)            /* can be defined to log expands   */
			
 
				+
			
 
				+/* initial number of buckets */
			
 
				+#define HASH_INITIAL_NUM_BUCKETS 32      /* initial number of buckets        */
			
 
				+#define HASH_INITIAL_NUM_BUCKETS_LOG2 5  /* lg2 of initial number of buckets */
			
 
				+#define HASH_BKT_CAPACITY_THRESH 10      /* expand when bucket count reaches */
			
 
				+
			
 
				+/* calculate the element whose hash handle address is hhe */
			
 
				+#define ELMT_FROM_HH(tbl,hhp) ((void*)(((char*)(hhp)) - ((tbl)->hho)))
			
 
				+
			
 
				+#define HASH_FIND(hh,head,keyptr,keylen,out)                                     \
			
 
				+do {                                                                             \
			
 
				+  unsigned _hf_bkt,_hf_hashv;                                                    \
			
 
				+  out=NULL;                                                                      \
			
 
				+  if (head) {                                                                    \
			
 
				+     HASH_FCN(keyptr,keylen, (head)->hh.tbl->num_buckets, _hf_hashv, _hf_bkt);   \
			
 
				+     if (HASH_BLOOM_TEST((head)->hh.tbl, _hf_hashv)) {                           \
			
 
				+       HASH_FIND_IN_BKT((head)->hh.tbl, hh, (head)->hh.tbl->buckets[ _hf_bkt ],  \
			
 
				+                        keyptr,keylen,out);                                      \
			
 
				+     }                                                                           \
			
 
				+  }                                                                              \
			
 
				+} while (0)
			
 
				+
			
 
				+#ifdef HASH_BLOOM
			
 
				+#define HASH_BLOOM_BITLEN (1ULL << HASH_BLOOM)
			
 
				+#define HASH_BLOOM_BYTELEN (HASH_BLOOM_BITLEN/8) + ((HASH_BLOOM_BITLEN%8) ? 1:0)
			
 
				+#define HASH_BLOOM_MAKE(tbl)                                                     \
			
 
				+do {                                                                             \
			
 
				+  (tbl)->bloom_nbits = HASH_BLOOM;                                               \
			
 
				+  (tbl)->bloom_bv = (uint8_t*)uthash_malloc(HASH_BLOOM_BYTELEN);                 \
			
 
				+  if (!((tbl)->bloom_bv))  { uthash_fatal( "out of memory"); }                   \
			
 
				+  memset((tbl)->bloom_bv, 0, HASH_BLOOM_BYTELEN);                                \
			
 
				+  (tbl)->bloom_sig = HASH_BLOOM_SIGNATURE;                                       \
			
 
				+} while (0);
			
 
				+
			
 
				+#define HASH_BLOOM_FREE(tbl)                                                     \
			
 
				+do {                                                                             \
			
 
				+  uthash_free((tbl)->bloom_bv, HASH_BLOOM_BYTELEN);                              \
			
 
				+} while (0);
			
 
				+
			
 
				+#define HASH_BLOOM_BITSET(bv,idx) (bv[(idx)/8] |= (1U << ((idx)%8)))
			
 
				+#define HASH_BLOOM_BITTEST(bv,idx) (bv[(idx)/8] & (1U << ((idx)%8)))
			
 
				+
			
 
				+#define HASH_BLOOM_ADD(tbl,hashv)                                                \
			
 
				+  HASH_BLOOM_BITSET((tbl)->bloom_bv, (hashv & (uint32_t)((1ULL << (tbl)->bloom_nbits) - 1)))
			
 
				+
			
 
				+#define HASH_BLOOM_TEST(tbl,hashv)                                               \
			
 
				+  HASH_BLOOM_BITTEST((tbl)->bloom_bv, (hashv & (uint32_t)((1ULL << (tbl)->bloom_nbits) - 1)))
			
 
				+
			
 
				+#else
			
 
				+#define HASH_BLOOM_MAKE(tbl) 
			
 
				+#define HASH_BLOOM_FREE(tbl) 
			
 
				+#define HASH_BLOOM_ADD(tbl,hashv) 
			
 
				+#define HASH_BLOOM_TEST(tbl,hashv) (1)
			
 
				+#endif
			
 
				+
			
 
				+#define HASH_MAKE_TABLE(hh,head)                                                 \
			
 
				+do {                                                                             \
			
 
				+  (head)->hh.tbl = (UT_hash_table*)uthash_malloc(                                \
			
 
				+                  sizeof(UT_hash_table));                                        \
			
 
				+  if (!((head)->hh.tbl))  { uthash_fatal( "out of memory"); }                    \
			
 
				+  memset((head)->hh.tbl, 0, sizeof(UT_hash_table));                              \
			
 
				+  (head)->hh.tbl->tail = &((head)->hh);                                          \
			
 
				+  (head)->hh.tbl->num_buckets = HASH_INITIAL_NUM_BUCKETS;                        \
			
 
				+  (head)->hh.tbl->log2_num_buckets = HASH_INITIAL_NUM_BUCKETS_LOG2;              \
			
 
				+  (head)->hh.tbl->hho = (char*)(&(head)->hh) - (char*)(head);                    \
			
 
				+  (head)->hh.tbl->buckets = (UT_hash_bucket*)uthash_malloc(                      \
			
 
				+          HASH_INITIAL_NUM_BUCKETS*sizeof(struct UT_hash_bucket));               \
			
 
				+  if (! (head)->hh.tbl->buckets) { uthash_fatal( "out of memory"); }             \
			
 
				+  memset((head)->hh.tbl->buckets, 0,                                             \
			
 
				+          HASH_INITIAL_NUM_BUCKETS*sizeof(struct UT_hash_bucket));               \
			
 
				+  HASH_BLOOM_MAKE((head)->hh.tbl);                                               \
			
 
				+  (head)->hh.tbl->signature = HASH_SIGNATURE;                                    \
			
 
				+} while(0)
			
 
				+
			
 
				+#define HASH_ADD(hh,head,fieldname,keylen_in,add)                                \
			
 
				+        HASH_ADD_KEYPTR(hh,head,&add->fieldname,keylen_in,add)
			
 
				+ 
			
 
				+#define HASH_ADD_KEYPTR(hh,head,keyptr,keylen_in,add)                            \
			
 
				+do {                                                                             \
			
 
				+ unsigned _ha_bkt;                                                               \
			
 
				+ (add)->hh.next = NULL;                                                          \
			
 
				+ (add)->hh.key = (char*)keyptr;                                                  \
			
 
				+ (add)->hh.keylen = keylen_in;                                                   \
			
 
				+ if (!(head)) {                                                                  \
			
 
				+    head = (add);                                                                \
			
 
				+    (head)->hh.prev = NULL;                                                      \
			
 
				+    HASH_MAKE_TABLE(hh,head);                                                    \
			
 
				+ } else {                                                                        \
			
 
				+    (head)->hh.tbl->tail->next = (add);                                          \
			
 
				+    (add)->hh.prev = ELMT_FROM_HH((head)->hh.tbl, (head)->hh.tbl->tail);         \
			
 
				+    (head)->hh.tbl->tail = &((add)->hh);                                         \
			
 
				+ }                                                                               \
			
 
				+ (head)->hh.tbl->num_items++;                                                    \
			
 
				+ (add)->hh.tbl = (head)->hh.tbl;                                                 \
			
 
				+ HASH_FCN(keyptr,keylen_in, (head)->hh.tbl->num_buckets,                         \
			
 
				+         (add)->hh.hashv, _ha_bkt);                                              \
			
 
				+ HASH_ADD_TO_BKT((head)->hh.tbl->buckets[_ha_bkt],&(add)->hh);                   \
			
 
				+ HASH_BLOOM_ADD((head)->hh.tbl,(add)->hh.hashv);                                 \
			
 
				+ HASH_EMIT_KEY(hh,head,keyptr,keylen_in);                                        \
			
 
				+ HASH_FSCK(hh,head);                                                             \
			
 
				+} while(0)
			
 
				+
			
 
				+#define HASH_TO_BKT( hashv, num_bkts, bkt )                                      \
			
 
				+do {                                                                             \
			
 
				+  bkt = ((hashv) & ((num_bkts) - 1));                                            \
			
 
				+} while(0)
			
 
				+
			
 
				+/* delete "delptr" from the hash table.
			
 
				+ * "the usual" patch-up process for the app-order doubly-linked-list.
			
 
				+ * The use of _hd_hh_del below deserves special explanation.
			
 
				+ * These used to be expressed using (delptr) but that led to a bug
			
 
				+ * if someone used the same symbol for the head and deletee, like
			
 
				+ *  HASH_DELETE(hh,users,users);
			
 
				+ * We want that to work, but by changing the head (users) below
			
 
				+ * we were forfeiting our ability to further refer to the deletee (users)
			
 
				+ * in the patch-up process. Solution: use scratch space to
			
 
				+ * copy the deletee pointer, then the latter references are via that
			
 
				+ * scratch pointer rather than through the repointed (users) symbol.
			
 
				+ */
			
 
				+#define HASH_DELETE(hh,head,delptr)                                              \
			
 
				+do {                                                                             \
			
 
				+    unsigned _hd_bkt;                                                            \
			
 
				+    struct UT_hash_handle *_hd_hh_del;                                           \
			
 
				+    /* no prev and no next: delptr is the only item, so free the whole table */  \
			
 
				+    if ( ((delptr)->hh.prev == NULL) && ((delptr)->hh.next == NULL) )  {         \
			
 
				+        uthash_free((head)->hh.tbl->buckets,                                     \
			
 
				+                    (head)->hh.tbl->num_buckets*sizeof(struct UT_hash_bucket) ); \
			
 
				+        HASH_BLOOM_FREE((head)->hh.tbl);                                         \
			
 
				+        uthash_free((head)->hh.tbl, sizeof(UT_hash_table));                      \
			
 
				+        head = NULL;                                                             \
			
 
				+    } else {                                                                     \
			
 
				+        /* keep the handle in scratch space; head may be reassigned below */     \
			
 
				+        _hd_hh_del = &((delptr)->hh);                                            \
			
 
				+        /* deleting the tail: tail becomes the handle of the previous item */    \
			
 
				+        if ((delptr) == ELMT_FROM_HH((head)->hh.tbl,(head)->hh.tbl->tail)) {     \
			
 
				+            (head)->hh.tbl->tail =                                               \
			
 
				+                (UT_hash_handle*)((char*)((delptr)->hh.prev) +                   \
			
 
				+                (head)->hh.tbl->hho);                                            \
			
 
				+        }                                                                        \
			
 
				+        /* unlink from the app-order list, or advance head if delptr is first */ \
			
 
				+        if ((delptr)->hh.prev) {                                                 \
			
 
				+            ((UT_hash_handle*)((char*)((delptr)->hh.prev) +                      \
			
 
				+                    (head)->hh.tbl->hho))->next = (delptr)->hh.next;             \
			
 
				+        } else {                                                                 \
			
 
				+            DECLTYPE_ASSIGN(head,(delptr)->hh.next);                             \
			
 
				+        }                                                                        \
			
 
				+        /* point the following item's prev at the deleted item's prev */         \
			
 
				+        if (_hd_hh_del->next) {                                                  \
			
 
				+            ((UT_hash_handle*)((char*)_hd_hh_del->next +                         \
			
 
				+                    (head)->hh.tbl->hho))->prev =                                \
			
 
				+                    _hd_hh_del->prev;                                            \
			
 
				+        }                                                                        \
			
 
				+        /* drop the handle from its bucket and decrement the item count */       \
			
 
				+        HASH_TO_BKT( _hd_hh_del->hashv, (head)->hh.tbl->num_buckets, _hd_bkt);   \
			
 
				+        HASH_DEL_IN_BKT(hh,(head)->hh.tbl->buckets[_hd_bkt], _hd_hh_del);        \
			
 
				+        (head)->hh.tbl->num_items--;                                             \
			
 
				+    }                                                                            \
			
 
				+    /* integrity check; expands to nothing unless HASH_DEBUG is defined */       \
			
 
				+    HASH_FSCK(hh,head);                                                          \
			
 
				+} while (0)
			
 
				+
			
 
				+
			
 
				+/* convenience forms of HASH_FIND/HASH_ADD/HASH_DEL */
			
 
				+/* HASH_FIND on the handle named hh, keyed by the string findstr; the key
			
 
				+ * length is strlen(findstr), so the terminating NUL is not part of the key. */
			
 
				+#define HASH_FIND_STR(head,findstr,out)                                          \
			
 
				+    HASH_FIND(hh,head,findstr,strlen(findstr),out)
			
 
				+/* HASH_ADD on the handle named hh, keyed by the string member strfield of
			
 
				+ * 'add'; the key length is strlen() of that member (terminating NUL excluded).
			
 
				+ * 'add' is parenthesized before '->' so that any pointer expression, not only
			
 
				+ * a plain identifier, can be passed. */
			
 
				+#define HASH_ADD_STR(head,strfield,add)                                          \
			
 
				+    HASH_ADD(hh,head,strfield,strlen((add)->strfield),add)
			
 
				+/* forwards findint as HASH_FIND's key argument with a key length of sizeof(int) */
			
 
				+#define HASH_FIND_INT(head,findint,out)                                          \
			
 
				+    HASH_FIND(hh,head,findint,sizeof(int),out)
			
 
				+/* HASH_ADD keyed by the member intfield, key length sizeof(int) */
			
 
				+#define HASH_ADD_INT(head,intfield,add)                                          \
			
 
				+    HASH_ADD(hh,head,intfield,sizeof(int),add)
			
 
				+/* forwards findptr as HASH_FIND's key argument with a key length of sizeof(void *) */
			
 
				+#define HASH_FIND_PTR(head,findptr,out)                                          \
			
 
				+    HASH_FIND(hh,head,findptr,sizeof(void *),out)
			
 
				+/* HASH_ADD keyed by the member ptrfield, key length sizeof(void *) */
			
 
				+#define HASH_ADD_PTR(head,ptrfield,add)                                          \
			
 
				+    HASH_ADD(hh,head,ptrfield,sizeof(void *),add)
			
 
				+/* HASH_DELETE on the handle named hh */
			
 
				+#define HASH_DEL(head,delptr)                                                    \
			
 
				+    HASH_DELETE(hh,head,delptr)
			
 
				+
			
 
				+/* HASH_FSCK checks hash integrity on every add/delete when HASH_DEBUG is defined.
			
 
				+ * This is for uthash developer only; it compiles away if HASH_DEBUG isn't defined.
			
 
				+ */
			
 
				+#ifdef HASH_DEBUG
			
 
				+/* Print a formatted message to stderr and terminate the process with exit(-1). */
			
 
				+#define HASH_OOPS(...) do { fprintf(stderr,__VA_ARGS__); exit(-1); } while (0)
			
 
				+/* Consistency check of the table reachable from head (does nothing when head
			
 
				+ * is NULL). Any mismatch is reported through HASH_OOPS.
			
 
				+ * Pass 1 walks every bucket chain, checking each hh_prev link and the stored
			
 
				+ * per-bucket count, then compares the summed count with num_items.
			
 
				+ * Pass 2 walks the items in app order, checking each prev link and the total.
			
 
				+ * Counts are printed with %u and pointers are cast to void* for %p so that
			
 
				+ * every argument matches its conversion specifier. */
			
 
				+#define HASH_FSCK(hh,head)                                                       \
			
 
				+do {                                                                             \
			
 
				+    unsigned _bkt_i;                                                             \
			
 
				+    unsigned _count, _bkt_count;                                                 \
			
 
				+    char *_prev;                                                                 \
			
 
				+    struct UT_hash_handle *_thh;                                                 \
			
 
				+    if (head) {                                                                  \
			
 
				+        _count = 0;                                                              \
			
 
				+        for( _bkt_i = 0; _bkt_i < (head)->hh.tbl->num_buckets; _bkt_i++) {       \
			
 
				+            _bkt_count = 0;                                                      \
			
 
				+            _thh = (head)->hh.tbl->buckets[_bkt_i].hh_head;                      \
			
 
				+            _prev = NULL;                                                        \
			
 
				+            while (_thh) {                                                       \
			
 
				+               if (_prev != (char*)(_thh->hh_prev)) {                            \
			
 
				+                   HASH_OOPS("invalid hh_prev %p, actual %p\n",                  \
			
 
				+                    (void*)_thh->hh_prev, (void*)_prev );                        \
			
 
				+               }                                                                 \
			
 
				+               _bkt_count++;                                                     \
			
 
				+               _prev = (char*)(_thh);                                            \
			
 
				+               _thh = _thh->hh_next;                                             \
			
 
				+            }                                                                    \
			
 
				+            _count += _bkt_count;                                                \
			
 
				+            if ((head)->hh.tbl->buckets[_bkt_i].count !=  _bkt_count) {          \
			
 
				+               HASH_OOPS("invalid bucket count %u, actual %u\n",                 \
			
 
				+                (unsigned)(head)->hh.tbl->buckets[_bkt_i].count, _bkt_count);    \
			
 
				+            }                                                                    \
			
 
				+        }                                                                        \
			
 
				+        if (_count != (head)->hh.tbl->num_items) {                               \
			
 
				+            HASH_OOPS("invalid hh item count %u, actual %u\n",                   \
			
 
				+                (unsigned)(head)->hh.tbl->num_items, _count );                   \
			
 
				+        }                                                                        \
			
 
				+        /* traverse hh in app order; check next/prev integrity, count */         \
			
 
				+        _count = 0;                                                              \
			
 
				+        _prev = NULL;                                                            \
			
 
				+        _thh =  &(head)->hh;                                                     \
			
 
				+        while (_thh) {                                                           \
			
 
				+           _count++;                                                             \
			
 
				+           if (_prev !=(char*)(_thh->prev)) {                                    \
			
 
				+              HASH_OOPS("invalid prev %p, actual %p\n",                          \
			
 
				+                    (void*)_thh->prev, (void*)_prev );                           \
			
 
				+           }                                                                     \
			
 
				+           _prev = (char*)ELMT_FROM_HH((head)->hh.tbl, _thh);                    \
			
 
				+           _thh = ( _thh->next ?  (UT_hash_handle*)((char*)(_thh->next) +        \
			
 
				+                                  (head)->hh.tbl->hho) : NULL );                 \
			
 
				+        }                                                                        \
			
 
				+        if (_count != (head)->hh.tbl->num_items) {                               \
			
 
				+            HASH_OOPS("invalid app item count %u, actual %u\n",                  \
			
 
				+                (unsigned)(head)->hh.tbl->num_items, _count );                   \
			
 
				+        }                                                                        \
			
 
				+    }                                                                            \
			
 
				+} while (0)
			
 
				+#else
			
 
				+/* HASH_DEBUG is not defined: the check expands to nothing */
			
 
				+#define HASH_FSCK(hh,head) 
			
 
				+#endif
			
 
				+
			
 
				+/* When compiled with -DHASH_EMIT_KEYS, length-prefixed keys are emitted to 
			
 
				+ * the descriptor to which this macro is defined for tuning the hash function.
			
 
				+ * The app can #include <unistd.h> to get the prototype for write(2). */
			
 
				+#ifdef HASH_EMIT_KEYS
			
 
				+/* Write the key length (as an unsigned) followed by fieldlen bytes read from
			
 
				+ * keyptr to the descriptor HASH_EMIT_KEYS. The return values of write() are
			
 
				+ * not checked. */
			
 
				+#define HASH_EMIT_KEY(hh,head,keyptr,fieldlen)                                   \
			
 
				+do {                                                                             \
			
 
				+    unsigned _klen = fieldlen;                                                   \
			
 
				+    write(HASH_EMIT_KEYS, &_klen, sizeof(_klen));                                \
			
 
				+    write(HASH_EMIT_KEYS, keyptr, fieldlen);                                     \
			
 
				+} while (0)
			
 
				+#else 
			
 
				+/* key emission disabled: expands to nothing */
			
 
				+#define HASH_EMIT_KEY(hh,head,keyptr,fieldlen)                    
			
 
				+#endif
			
 
				+
			
 
				+/* Default to the Jenkins hash (HASH_JEN) unless overridden at compile time,
			
 
				+ * e.g. with -DHASH_FUNCTION=HASH_SAX */
			
 
				+#ifdef HASH_FUNCTION 
			
 
				+#define HASH_FCN HASH_FUNCTION
			
 
				+#else
			
 
				+#define HASH_FCN HASH_JEN
			
 
				+#endif
			
 
				+
			
 
				+/* Bernstein's hash (the one Perl used before v5.6): starting from zero, each
			
 
				+ * key byte updates hashv as hashv*33 + byte; bkt is hashv masked by num_bkts-1. */
			
 
				+#define HASH_BER(key,keylen,num_bkts,hashv,bkt)                                  \
			
 
				+do {                                                                             \
			
 
				+  unsigned _hb_left=keylen;                                                      \
			
 
				+  char *_hb_cur=(char*)(key);                                                    \
			
 
				+  (hashv) = 0;                                                                   \
			
 
				+  for (; _hb_left != 0; _hb_left--) {                                            \
			
 
				+    (hashv) = ((hashv) * 33) + *_hb_cur;                                         \
			
 
				+    _hb_cur++;                                                                   \
			
 
				+  }                                                                              \
			
 
				+  bkt = (hashv) & (num_bkts-1);                                                  \
			
 
				+} while (0)
			
 
				+
			
 
				+
			
 
				+/* SAX/FNV/OAT/JEN hash functions are macro variants of those listed at 
			
 
				+ * http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx */
			
 
				+/* SAX hash: starting from zero, each key byte is folded in with
			
 
				+ * hashv ^= (hashv << 5) + (hashv >> 2) + byte; bkt is hashv masked by
			
 
				+ * num_bkts-1. */
			
 
				+#define HASH_SAX(key,keylen,num_bkts,hashv,bkt)                                  \
			
 
				+do {                                                                             \
			
 
				+  unsigned _sx_pos;                                                              \
			
 
				+  char *_sx_bytes=(char*)(key);                                                  \
			
 
				+  hashv = 0;                                                                     \
			
 
				+  _sx_pos = 0;                                                                   \
			
 
				+  while (_sx_pos < keylen) {                                                     \
			
 
				+      hashv ^= (hashv << 5) + (hashv >> 2) + _sx_bytes[_sx_pos];                 \
			
 
				+      _sx_pos++;                                                                 \
			
 
				+  }                                                                              \
			
 
				+  bkt = hashv & (num_bkts-1);                                                    \
			
 
				+} while (0)
			
 
				+
			
 
				+/* FNV hash: hashv starts at 2166136261 and, for each key byte, is multiplied
			
 
				+ * by 16777619 and then xored with the byte; bkt is hashv masked by num_bkts-1.
			
 
				+ * The definition deliberately ends without a semicolon so the macro behaves as
			
 
				+ * a single statement, including in an unbraced if/else. */
			
 
				+#define HASH_FNV(key,keylen,num_bkts,hashv,bkt)                                  \
			
 
				+do {                                                                             \
			
 
				+  unsigned _fn_i;                                                                \
			
 
				+  char *_hf_key=(char*)(key);                                                    \
			
 
				+  hashv = 2166136261UL;                                                          \
			
 
				+  for(_fn_i=0; _fn_i < keylen; _fn_i++)                                          \
			
 
				+      hashv = (hashv * 16777619) ^ _hf_key[_fn_i];                               \
			
 
				+  bkt = hashv & (num_bkts-1);                                                    \
			
 
				+} while(0)
			
 
				+ 
			
 
				+/* OAT hash: each key byte is added to hashv and followed by two shift-mix
			
 
				+ * steps; three more shift-mix steps finish the value once all bytes are
			
 
				+ * consumed. bkt is hashv masked by num_bkts-1. */
			
 
				+#define HASH_OAT(key,keylen,num_bkts,hashv,bkt)                                  \
			
 
				+do {                                                                             \
			
 
				+  unsigned _oat_pos;                                                             \
			
 
				+  char *_oat_bytes=(char*)(key);                                                 \
			
 
				+  hashv = 0;                                                                     \
			
 
				+  _oat_pos = 0;                                                                  \
			
 
				+  while (_oat_pos < keylen) {                                                    \
			
 
				+      hashv += _oat_bytes[_oat_pos];                                             \
			
 
				+      hashv += (hashv << 10);                                                    \
			
 
				+      hashv ^= (hashv >> 6);                                                     \
			
 
				+      _oat_pos++;                                                                \
			
 
				+  }                                                                              \
			
 
				+  /* final mixing */                                                             \
			
 
				+  hashv += (hashv << 3);                                                         \
			
 
				+  hashv ^= (hashv >> 11);                                                        \
			
 
				+  hashv += (hashv << 15);                                                        \
			
 
				+  bkt = hashv & (num_bkts-1);                                                    \
			
 
				+} while(0)
			
 
				+
			
 
				+/* Mix the three words a, b and c in place: nine steps, each subtracting the
			
 
				+ * other two words from one word and then xoring in a shifted copy of another.
			
 
				+ * The arguments are expanded unparenthesized and many times over, so pass
			
 
				+ * plain lvalues only. Used by HASH_JEN. */
			
 
				+#define HASH_JEN_MIX(a,b,c)                                                      \
			
 
				+do {                                                                             \
			
 
				+  a -= b; a -= c; a ^= ( c >> 13 );                                              \
			
 
				+  b -= c; b -= a; b ^= ( a << 8 );                                               \
			
 
				+  c -= a; c -= b; c ^= ( b >> 13 );                                              \
			
 
				+  a -= b; a -= c; a ^= ( c >> 12 );                                              \
			
 
				+  b -= c; b -= a; b ^= ( a << 16 );                                              \
			
 
				+  c -= a; c -= b; c ^= ( b >> 5 );                                               \
			
 
				+  a -= b; a -= c; a ^= ( c >> 3 );                                               \
			
 
				+  b -= c; b -= a; b ^= ( a << 10 );                                              \
			
 
				+  c -= a; c -= b; c ^= ( b >> 15 );                                              \
			
 
				+} while (0)
			
 
				+
			
 
				+/* Jenkins hash of the keylen bytes at key. Sets hashv to the hash value and
			
 
				+ * bkt to hashv & (num_bkts-1). Key bytes are read through a plain char
			
 
				+ * pointer. */
			
 
				+#define HASH_JEN(key,keylen,num_bkts,hashv,bkt)                                  \
			
 
				+do {                                                                             \
			
 
				+  unsigned _hj_i,_hj_j,_hj_k;                                                    \
			
 
				+  char *_hj_key=(char*)(key);                                                    \
			
 
				+  hashv = 0xfeedbeef;                                                            \
			
 
				+  _hj_i = _hj_j = 0x9e3779b9;                                                    \
			
 
				+  _hj_k = keylen;                                                                \
			
 
				+  /* consume 12 bytes per round: three words, first byte in the low bits */      \
			
 
				+  while (_hj_k >= 12) {                                                          \
			
 
				+    _hj_i +=    (_hj_key[0] + ( (unsigned)_hj_key[1] << 8 )                      \
			
 
				+        + ( (unsigned)_hj_key[2] << 16 )                                         \
			
 
				+        + ( (unsigned)_hj_key[3] << 24 ) );                                      \
			
 
				+    _hj_j +=    (_hj_key[4] + ( (unsigned)_hj_key[5] << 8 )                      \
			
 
				+        + ( (unsigned)_hj_key[6] << 16 )                                         \
			
 
				+        + ( (unsigned)_hj_key[7] << 24 ) );                                      \
			
 
				+    hashv += (_hj_key[8] + ( (unsigned)_hj_key[9] << 8 )                         \
			
 
				+        + ( (unsigned)_hj_key[10] << 16 )                                        \
			
 
				+        + ( (unsigned)_hj_key[11] << 24 ) );                                     \
			
 
				+                                                                                 \
			
 
				+     HASH_JEN_MIX(_hj_i, _hj_j, hashv);                                          \
			
 
				+                                                                                 \
			
 
				+     _hj_key += 12;                                                              \
			
 
				+     _hj_k -= 12;                                                                \
			
 
				+  }                                                                              \
			
 
				+  /* fold in the key length, then the 0..11 leftover bytes */                    \
			
 
				+  hashv += keylen;                                                               \
			
 
				+  /* no breaks: each case falls through to all the cases below it */             \
			
 
				+  switch ( _hj_k ) {                                                             \
			
 
				+     case 11: hashv += ( (unsigned)_hj_key[10] << 24 );                          \
			
 
				+     case 10: hashv += ( (unsigned)_hj_key[9] << 16 );                           \
			
 
				+     case 9:  hashv += ( (unsigned)_hj_key[8] << 8 );                            \
			
 
				+     case 8:  _hj_j += ( (unsigned)_hj_key[7] << 24 );                           \
			
 
				+     case 7:  _hj_j += ( (unsigned)_hj_key[6] << 16 );                           \
			
 
				+     case 6:  _hj_j += ( (unsigned)_hj_key[5] << 8 );                            \
			
 
				+     case 5:  _hj_j += _hj_key[4];                                               \
			
 
				+     case 4:  _hj_i += ( (unsigned)_hj_key[3] << 24 );                           \
			
 
				+     case 3:  _hj_i += ( (unsigned)_hj_key[2] << 16 );                           \
			
 
				+     case 2:  _hj_i += ( (unsigned)_hj_key[1] << 8 );                            \
			
 
				+     case 1:  _hj_i += _hj_key[0];                                               \
			
 
				+  }                                                                              \
			
 
				+  HASH_JEN_MIX(_hj_i, _hj_j, hashv);                                             \
			
 
				+  bkt = hashv & (num_bkts-1);                                                    \
			
 
				+} while(0)
			
 
				+
			
 
				+/* The Paul Hsieh hash function */
			
 
				+/* get16bits(d): 16-bit value read from the two bytes at d, used by HASH_SFH.
			
 
				+ * On the compilers/targets listed below it is a direct uint16_t load through
			
 
				+ * a pointer cast. */
			
 
				+#undef get16bits
			
 
				+#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__)             \
			
 
				+  || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__)
			
 
				+#define get16bits(d) (*((const uint16_t *) (d)))
			
 
				+#endif
			
 
				+
			
 
				+/* Fallback: assemble the value from individual bytes, byte 0 in the low
			
 
				+ * 8 bits and byte 1 in the next 8 bits. */
			
 
				+#if !defined (get16bits)
			
 
				+#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)             \
			
 
				+                       +(uint32_t)(((const uint8_t *)(d))[0]) )
			
 
				+#endif
			
 
				+/* Paul Hsieh's hash of the keylen bytes at key. Sets hashv to the hash value
			
 
				+ * and bkt to hashv & (num_bkts-1). The key is consumed four bytes per round
			
 
				+ * through get16bits(); the 1..3 leftover bytes are handled by the switch.
			
 
				+ * The definition deliberately ends without a semicolon so the macro behaves as
			
 
				+ * a single statement, including in an unbraced if/else. */
			
 
				+#define HASH_SFH(key,keylen,num_bkts,hashv,bkt)                                  \
			
 
				+do {                                                                             \
			
 
				+  char *_sfh_key=(char*)(key);                                                   \
			
 
				+  uint32_t _sfh_tmp, _sfh_len = keylen;                                          \
			
 
				+                                                                                 \
			
 
				+  int _sfh_rem = _sfh_len & 3;                                                   \
			
 
				+  _sfh_len >>= 2;                                                                \
			
 
				+  hashv = 0xcafebabe;                                                            \
			
 
				+                                                                                 \
			
 
				+  /* Main loop */                                                                \
			
 
				+  for (;_sfh_len > 0; _sfh_len--) {                                              \
			
 
				+    hashv    += get16bits (_sfh_key);                                            \
			
 
				+    _sfh_tmp       = (get16bits (_sfh_key+2) << 11) ^ hashv;                     \
			
 
				+    hashv     = (hashv << 16) ^ _sfh_tmp;                                        \
			
 
				+    _sfh_key += 2*sizeof (uint16_t);                                             \
			
 
				+    hashv    += hashv >> 11;                                                     \
			
 
				+  }                                                                              \
			
 
				+                                                                                 \
			
 
				+  /* Handle end cases */                                                         \
			
 
				+  switch (_sfh_rem) {                                                            \
			
 
				+    case 3: hashv += get16bits (_sfh_key);                                       \
			
 
				+            hashv ^= hashv << 16;                                                \
			
 
				+            /* convert to unsigned before shifting: the byte may be negative */  \
			
 
				+            hashv ^= (uint32_t)_sfh_key[sizeof (uint16_t)] << 18;                \
			
 
				+            hashv += hashv >> 11;                                                \
			
 
				+            break;                                                               \
			
 
				+    case 2: hashv += get16bits (_sfh_key);                                       \
			
 
				+            hashv ^= hashv << 11;                                                \
			
 
				+            hashv += hashv >> 17;                                                \
			
 
				+            break;                                                               \
			
 
				+    case 1: hashv += *_sfh_key;                                                  \
			
 
				+            hashv ^= hashv << 10;                                                \
			
 
				+            hashv += hashv >> 1;                                                 \
			
 
				+  }                                                                              \
			
 
				+                                                                                 \
			
 
				+    /* Force "avalanching" of final 127 bits */                                  \
			
 
				+    hashv ^= hashv << 3;                                                         \
			
 
				+    hashv += hashv >> 5;                                                         \
			
 
				+    hashv ^= hashv << 4;                                                         \
			
 
				+    hashv += hashv >> 17;                                                        \
			
 
				+    hashv ^= hashv << 25;                                                        \
			
 
				+    hashv += hashv >> 6;                                                         \
			
 
				+    bkt = hashv & (num_bkts-1);                                                  \
			
 
				+} while(0)
			
 
				+
			
 
				+#ifdef HASH_USING_NO_STRICT_ALIASING
			
 
				+/* The MurmurHash exploits some CPUs' (e.g. x86) tolerance for unaligned reads.
			
 
				+ * For other types of CPUs (e.g. Sparc) an unaligned read causes a bus error.
			
 
				+ * So MurmurHash comes in two versions, the faster unaligned one and the slower
			
 
				+ * aligned one. We only use the faster one on CPUs where we know it's safe.
			
 
				+ *
			
 
				+ * Note the preprocessor built-in defines can be emitted using:
			
 
				+ *
			
 
				+ *   gcc -m64 -dM -E - < /dev/null                  (on gcc)
			
 
				+ *   cc -## a.c (where a.c is a simple test file)   (Sun Studio)
			
 
				+ */
			
 
				+/* HASH_MUR resolves to the unaligned variant when __i386__ or __x86_64__ is
			
 
				+ * defined, and to the aligned variant on every other target. */
			
 
				+#if (defined(__i386__) || defined(__x86_64__)) 
			
 
				+#define HASH_MUR HASH_MUR_UNALIGNED
			
 
				+#else
			
 
				+#define HASH_MUR HASH_MUR_ALIGNED
			
 
				+#endif
			
 
				+
			
 
				+/* Appleby's MurmurHash fast version for unaligned-tolerant archs like i386 */
			
 
				+/* Sets hashv to the MurmurHash of the keylen bytes at key and bkt to
			
 
				+ * hashv & (num_bkts-1). The key is read four bytes at a time through a
			
 
				+ * uint32_t pointer cast, with no alignment handling. */
			
 
				+#define HASH_MUR_UNALIGNED(key,keylen,num_bkts,hashv,bkt)                        \
			
 
				+do {                                                                             \
			
 
				+  const unsigned int _mur_m = 0x5bd1e995;                                        \
			
 
				+  const int _mur_r = 24;                                                         \
			
 
				+  hashv = 0xcafebabe ^ keylen;                                                   \
			
 
				+  char *_mur_key = (char *)(key);                                                \
			
 
				+  uint32_t _mur_tmp, _mur_len = keylen;                                          \
			
 
				+                                                                                 \
			
 
				+  /* mix in whole 32-bit words */                                                \
			
 
				+  for (;_mur_len >= 4; _mur_len-=4) {                                            \
			
 
				+    _mur_tmp = *(uint32_t *)_mur_key;                                            \
			
 
				+    _mur_tmp *= _mur_m;                                                          \
			
 
				+    _mur_tmp ^= _mur_tmp >> _mur_r;                                              \
			
 
				+    _mur_tmp *= _mur_m;                                                          \
			
 
				+    hashv *= _mur_m;                                                             \
			
 
				+    hashv ^= _mur_tmp;                                                           \
			
 
				+    _mur_key += 4;                                                               \
			
 
				+  }                                                                              \
			
 
				+                                                                                 \
			
 
				+  /* mix in the 1..3 trailing bytes; each case falls through to the next */      \
			
 
				+  switch(_mur_len)                                                               \
			
 
				+  {                                                                              \
			
 
				+    case 3: hashv ^= _mur_key[2] << 16;                                          \
			
 
				+    case 2: hashv ^= _mur_key[1] << 8;                                           \
			
 
				+    case 1: hashv ^= _mur_key[0];                                                \
			
 
				+            hashv *= _mur_m;                                                     \
			
 
				+  };                                                                             \
			
 
				+                                                                                 \
			
 
				+  /* final mixing */                                                             \
			
 
				+  hashv ^= hashv >> 13;                                                          \
			
 
				+  hashv *= _mur_m;                                                               \
			
 
				+  hashv ^= hashv >> 15;                                                          \
			
 
				+                                                                                 \
			
 
				+  bkt = hashv & (num_bkts-1);                                                    \
			
 
				+} while(0)
			
 
				+
			
 
				+/* Appleby's MurmurHash version for alignment-sensitive archs like Sparc */
			
 
				+#define HASH_MUR_ALIGNED(key,keylen,num_bkts,hashv,bkt)                          \
			
 
				+do {                                                                             \
			
 
				+  const unsigned int _mur_m = 0x5bd1e995;                                        \
			
 
				+  const int _mur_r = 24;                                                         \
			
 
				+  hashv = 0xcafebabe ^ (keylen);                                                 \
			
 
				+  char *_mur_key = (char *)(key);                                                \
			
 
				+  uint32_t _mur_len = keylen;                                                    \
			
 
				+  int _mur_align = (int)_mur_key & 3;                                            \
			
 
				+                                                                                 \
			
 
				+  if (_mur_align && (_mur_len >= 4)) {                                           \
			
 
				+    unsigned _mur_t = 0, _mur_d = 0;                                             \
			
 
				+    switch(_mur_align) {                                                         \
			
 
				+      case 1: _mur_t |= _mur_key[2] << 16;                                       \
			
 
				+      case 2: _mur_t |= _mur_key[1] << 8;                                        \
			
 
				+      case 3: _mur_t |= _mur_key[0];                                             \
			
 
				+    }                                                                            \
			
 
				+    _mur_t <<= (8 * _mur_align);                                                 \
			
 
				+    _mur_key += 4-_mur_align;                                                    \
			
 
				+    _mur_len -= 4-_mur_align;                                                    \
			
 
				+    int _mur_sl = 8 * (4-_mur_align);                                            \
			
 
				+    int _mur_sr = 8 * _mur_align;                                                \
			
 
				+                                                                                 \
			
 
				+    for (;_mur_len >= 4; _mur_len-=4) {                                          \
			
 
				+      _mur_d = *(unsigned *)_mur_key;                                            \
			
 
				+      _mur_t = (_mur_t >> _mur_sr) | (_mur_d << _mur_sl);                        \
			
 
				+      unsigned _mur_k = _mur_t;                                                  \
			
 
				+      _mur_k *= _mur_m;                                                          \
			
 
				+      _mur_k ^= _mur_k >> _mur_r;                                                \
			
 
				+      _mur_k *= _mur_m;                                                          \
			
 
				+      hashv *= _mur_m;                                                           \
			
 
				+      hashv ^= _mur_k;                                                           \
			
 
				+      _mur_t = _mur_d;                                                           \
			
 
				+      _mur_key += 4;                                                             \
			
 
				+    }                                                                            \
			
 
				+    _mur_d = 0;                                                                  \
			
 
				+    if(_mur_len >= _mur_align) {                                                 \
			
 
				+      switch(_mur_align) {                                                       \
			
 
				+        case 3: _mur_d |= _mur_key[2] << 16;                                     \
			
 
				+        case 2: _mur_d |= _mur_key[1] << 8;                                      \
			
 
				+        case 1: _mur_d |= _mur_key[0];                                           \
			
 
				+      }                                                                          \
			
 
				+      unsigned _mur_k = (_mur_t >> _mur_sr) | (_mur_d << _mur_sl);               \
			
 
				+      _mur_k *= _mur_m;                                                          \
			
 
				+      _mur_k ^= _mur_k >> _mur_r;                                                \
			
 
				+      _mur_k *= _mur_m;                                                          \
			
 
				+      hashv *= _mur_m;                                                           \
			
 
				+      hashv ^= _mur_k;                                                           \
			
 
				+      _mur_k += _mur_align;                                                      \
			
 
				+      _mur_len -= _mur_align;                                                    \
			
 
				+                                                                                 \
			
 
				+      switch(_mur_len)                                                           \
			
 
				+      {                                                                          \
			
 
				+        case 3: hashv ^= _mur_key[2] << 16;                                      \
			
 
				+        case 2: hashv ^= _mur_key[1] << 8;                                       \
			
 
				+        case 1: hashv ^= _mur_key[0];                                            \
			
 
				+                hashv *= _mur_m;                                                 \
			
 
				+      }                                                                          \
			
 
				+    } else {                                                                     \
			
 
				+      switch(_mur_len)                                                           \
			
 
				+      {                                                                          \
			
 
				+        case 3: _mur_d ^= _mur_key[2] << 16;                                     \
			
 
				+        case 2: _mur_d ^= _mur_key[1] << 8;                                      \
			
 
				+        case 1: _mur_d ^= _mur_key[0];                                           \
			
 
				+        case 0: hashv ^= (_mur_t >> _mur_sr) | (_mur_d << _mur_sl);              \
			
 
				+        hashv *= _mur_m;                                                         \
			
 
				+      }                                                                          \
			
 
				+    }                                                                            \
			
 
				+                                                                                 \
			
 
				+    hashv ^= hashv >> 13;                                                        \
			
 
				+    hashv *= _mur_m;                                                             \
			
 
				+    hashv ^= hashv >> 15;                                                        \
			
 
				+  } else {                                                                       \
			
 
				+    for (;_mur_len >= 4; _mur_len-=4) {                                          \
			
 
				+      unsigned _mur_k = *(unsigned*)_mur_key;                                    \
			
 
				+      _mur_k *= _mur_m;                                                          \
			
 
				+      _mur_k ^= _mur_k >> _mur_r;                                                \
			
 
				+      _mur_k *= _mur_m;                                                          \
			
 
				+      hashv *= _mur_m;                                                           \
			
 
				+      hashv ^= _mur_k;                                                           \
			
 
				+      _mur_key += 4;                                                             \
			
 
				+    }                                                                            \
			
 
				+    switch(_mur_len)                                                             \
			
 
				+    {                                                                            \
			
 
				+      case 3: hashv ^= _mur_key[2] << 16;                                        \
			
 
				+      case 2: hashv ^= _mur_key[1] << 8;                                         \
			
 
				+      case 1: hashv ^= _mur_key[0];                                              \
			
 
				+      hashv *= _mur_m;                                                           \
			
 
				+    }                                                                            \
			
 
				+                                                                                 \
			
 
				+    hashv ^= hashv >> 13;                                                        \
			
 
				+    hashv *= _mur_m;                                                             \
			
 
				+    hashv ^= hashv >> 15;                                                        \
			
 
				+  }                                                                              \
			
 
				+  bkt = hashv & (num_bkts-1);                                                    \
			
 
				+} while(0)
			
 
				+#endif  /* HASH_USING_NO_STRICT_ALIASING */
			
 
				+
			
 
				+/* key comparison function; return 0 if keys equal */
			
 
				+#define HASH_KEYCMP(a,b,len) memcmp(a,b,len) 
			
 
				+
			
 
				+/* iterate over items in a known bucket to find desired item */
			
 
				+#define HASH_FIND_IN_BKT(tbl,hh,head,keyptr,keylen_in,out)                       \
			
 
				+do {                                                                             \
			
 
				+ if (head.hh_head) DECLTYPE_ASSIGN(out,ELMT_FROM_HH(tbl,head.hh_head));          \
			
 
				+ else out=NULL;                                                                  \
			
 
				+ while (out) {                                                                   \
			
 
				+    if (out->hh.keylen == keylen_in) {                                           \
			
 
				+        if ((HASH_KEYCMP(out->hh.key,keyptr,keylen_in)) == 0) break;             \
			
 
				+    }                                                                            \
			
 
				+    if (out->hh.hh_next) DECLTYPE_ASSIGN(out,ELMT_FROM_HH(tbl,out->hh.hh_next)); \
			
 
				+    else out = NULL;                                                             \
			
 
				+ }                                                                               \
			
 
				+} while(0)
			
 
				+
			
 
				+/* add an item to a bucket  */
			
 
				+#define HASH_ADD_TO_BKT(head,addhh)                                              \
			
 
				+do {                                                                             \
			
 
				+ head.count++;                                                                   \
			
 
				+ (addhh)->hh_next = head.hh_head;                                                \
			
 
				+ (addhh)->hh_prev = NULL;                                                        \
			
 
				+ if (head.hh_head) { (head).hh_head->hh_prev = (addhh); }                        \
			
 
				+ (head).hh_head=addhh;                                                           \
			
 
				+ if (head.count >= ((head.expand_mult+1) * HASH_BKT_CAPACITY_THRESH)             \
			
 
				+     && (addhh)->tbl->noexpand != 1) {                                           \
			
 
				+       HASH_EXPAND_BUCKETS((addhh)->tbl);                                        \
			
 
				+ }                                                                               \
			
 
				+} while(0)
			
 
				+
			
 
				+/* remove an item from a given bucket */
			
 
				+#define HASH_DEL_IN_BKT(hh,head,hh_del)                                          \
			
 
				+    (head).count--;                                                              \
			
 
				+    if ((head).hh_head == hh_del) {                                              \
			
 
				+      (head).hh_head = hh_del->hh_next;                                          \
			
 
				+    }                                                                            \
			
 
				+    if (hh_del->hh_prev) {                                                       \
			
 
				+        hh_del->hh_prev->hh_next = hh_del->hh_next;                              \
			
 
				+    }                                                                            \
			
 
				+    if (hh_del->hh_next) {                                                       \
			
 
				+        hh_del->hh_next->hh_prev = hh_del->hh_prev;                              \
			
 
				+    }                                                                
			
 
				+
			
 
				+/* Bucket expansion has the effect of doubling the number of buckets
			
 
				+ * and redistributing the items into the new buckets. Ideally the
			
 
				+ * items will distribute more or less evenly into the new buckets
			
 
				+ * (the extent to which this is true is a measure of the quality of
			
 
				+ * the hash function as it applies to the key domain). 
			
 
				+ * 
			
 
				+ * With the items distributed into more buckets, the chain length
			
 
				+ * (item count) in each bucket is reduced. Thus by expanding buckets
			
 
				+ * the hash keeps a bound on the chain length. This bounded chain 
			
 
				+ * length is the essence of how a hash provides constant time lookup.
			
 
				+ * 
			
 
				+ * The calculation of tbl->ideal_chain_maxlen below deserves some
			
 
				+ * explanation. First, keep in mind that we're calculating the ideal
			
 
				+ * maximum chain length based on the *new* (doubled) bucket count.
			
 
				+ * In fractions this is just n/b (n=number of items,b=new num buckets).
			
 
				+ * Since the ideal chain length is an integer, we want to calculate 
			
 
				+ * ceil(n/b). We don't depend on floating point arithmetic in this
			
 
				+ * hash, so to calculate ceil(n/b) with integers we could write
			
 
				+ * 
			
 
				+ *      ceil(n/b) = (n/b) + ((n%b)?1:0)
			
 
				+ * 
			
 
				+ * and in fact a previous version of this hash did just that.
			
 
				+ * But now we have improved things a bit by recognizing that b is
			
 
				+ * always a power of two. We keep its base 2 log handy (call it lb),
			
 
				+ * so now we can write this with a bit shift and logical AND:
			
 
				+ * 
			
 
				+ *      ceil(n/b) = (n>>lb) + ( (n & (b-1)) ? 1:0)
			
 
				+ * 
			
 
				+ */
			
 
				+#define HASH_EXPAND_BUCKETS(tbl)                                                 \
			
 
				+do {                                                                             \
			
 
				+    unsigned _he_bkt;                                                            \
			
 
				+    unsigned _he_bkt_i;                                                          \
			
 
				+    struct UT_hash_handle *_he_thh, *_he_hh_nxt;                                 \
			
 
				+    UT_hash_bucket *_he_new_buckets, *_he_newbkt;                                \
			
 
				+    _he_new_buckets = (UT_hash_bucket*)uthash_malloc(                            \
			
 
				+             2 * tbl->num_buckets * sizeof(struct UT_hash_bucket));              \
			
 
				+    if (!_he_new_buckets) { uthash_fatal( "out of memory"); }                    \
			
 
				+    memset(_he_new_buckets, 0,                                                   \
			
 
				+            2 * tbl->num_buckets * sizeof(struct UT_hash_bucket));               \
			
 
				+    tbl->ideal_chain_maxlen =                                                    \
			
 
				+       (tbl->num_items >> (tbl->log2_num_buckets+1)) +                           \
			
 
				+       ((tbl->num_items & ((tbl->num_buckets*2)-1)) ? 1 : 0);                    \
			
 
				+    tbl->nonideal_items = 0;                                                     \
			
 
				+    for(_he_bkt_i = 0; _he_bkt_i < tbl->num_buckets; _he_bkt_i++)                \
			
 
				+    {                                                                            \
			
 
				+        _he_thh = tbl->buckets[ _he_bkt_i ].hh_head;                             \
			
 
				+        while (_he_thh) {                                                        \
			
 
				+           _he_hh_nxt = _he_thh->hh_next;                                        \
			
 
				+           HASH_TO_BKT( _he_thh->hashv, tbl->num_buckets*2, _he_bkt);            \
			
 
				+           _he_newbkt = &(_he_new_buckets[ _he_bkt ]);                           \
			
 
				+           if (++(_he_newbkt->count) > tbl->ideal_chain_maxlen) {                \
			
 
				+             tbl->nonideal_items++;                                              \
			
 
				+             _he_newbkt->expand_mult = _he_newbkt->count /                       \
			
 
				+                                        tbl->ideal_chain_maxlen;                 \
			
 
				+           }                                                                     \
			
 
				+           _he_thh->hh_prev = NULL;                                              \
			
 
				+           _he_thh->hh_next = _he_newbkt->hh_head;                               \
			
 
				+           if (_he_newbkt->hh_head) _he_newbkt->hh_head->hh_prev =               \
			
 
				+                _he_thh;                                                         \
			
 
				+           _he_newbkt->hh_head = _he_thh;                                        \
			
 
				+           _he_thh = _he_hh_nxt;                                                 \
			
 
				+        }                                                                        \
			
 
				+    }                                                                            \
			
 
				+    uthash_free( tbl->buckets, tbl->num_buckets*sizeof(struct UT_hash_bucket) ); \
			
 
				+    tbl->num_buckets *= 2;                                                       \
			
 
				+    tbl->log2_num_buckets++;                                                     \
			
 
				+    tbl->buckets = _he_new_buckets;                                              \
			
 
				+    tbl->ineff_expands = (tbl->nonideal_items > (tbl->num_items >> 1)) ?         \
			
 
				+        (tbl->ineff_expands+1) : 0;                                              \
			
 
				+    if (tbl->ineff_expands > 1) {                                                \
			
 
				+        tbl->noexpand=1;                                                         \
			
 
				+        uthash_noexpand_fyi(tbl);                                                \
			
 
				+    }                                                                            \
			
 
				+    uthash_expand_fyi(tbl);                                                      \
			
 
				+} while(0)
			
 
				+
			
 
				+
			
 
				+/* This is an adaptation of Simon Tatham's O(n log(n)) mergesort */
			
 
				+/* Note that HASH_SORT assumes the hash handle name to be hh. 
			
 
				+ * HASH_SRT was added to allow the hash handle name to be passed in. */
			
 
				+#define HASH_SORT(head,cmpfcn) HASH_SRT(hh,head,cmpfcn)
			
 
				+#define HASH_SRT(hh,head,cmpfcn)                                                 \
			
 
				+do {                                                                             \
			
 
				+  unsigned _hs_i;                                                                \
			
 
				+  unsigned _hs_looping,_hs_nmerges,_hs_insize,_hs_psize,_hs_qsize;               \
			
 
				+  struct UT_hash_handle *_hs_p, *_hs_q, *_hs_e, *_hs_list, *_hs_tail;            \
			
 
				+  if (head) {                                                                    \
			
 
				+      _hs_insize = 1;                                                            \
			
 
				+      _hs_looping = 1;                                                           \
			
 
				+      _hs_list = &((head)->hh);                                                  \
			
 
				+      while (_hs_looping) {                                                      \
			
 
				+          _hs_p = _hs_list;                                                      \
			
 
				+          _hs_list = NULL;                                                       \
			
 
				+          _hs_tail = NULL;                                                       \
			
 
				+          _hs_nmerges = 0;                                                       \
			
 
				+          while (_hs_p) {                                                        \
			
 
				+              _hs_nmerges++;                                                     \
			
 
				+              _hs_q = _hs_p;                                                     \
			
 
				+              _hs_psize = 0;                                                     \
			
 
				+              for ( _hs_i = 0; _hs_i  < _hs_insize; _hs_i++ ) {                  \
			
 
				+                  _hs_psize++;                                                   \
			
 
				+                  _hs_q = (UT_hash_handle*)((_hs_q->next) ?                      \
			
 
				+                          ((void*)((char*)(_hs_q->next) +                        \
			
 
				+                          (head)->hh.tbl->hho)) : NULL);                         \
			
 
				+                  if (! (_hs_q) ) break;                                         \
			
 
				+              }                                                                  \
			
 
				+              _hs_qsize = _hs_insize;                                            \
			
 
				+              while ((_hs_psize > 0) || ((_hs_qsize > 0) && _hs_q )) {           \
			
 
				+                  if (_hs_psize == 0) {                                          \
			
 
				+                      _hs_e = _hs_q;                                             \
			
 
				+                      _hs_q = (UT_hash_handle*)((_hs_q->next) ?                  \
			
 
				+                              ((void*)((char*)(_hs_q->next) +                    \
			
 
				+                              (head)->hh.tbl->hho)) : NULL);                     \
			
 
				+                      _hs_qsize--;                                               \
			
 
				+                  } else if ( (_hs_qsize == 0) || !(_hs_q) ) {                   \
			
 
				+                      _hs_e = _hs_p;                                             \
			
 
				+                      _hs_p = (UT_hash_handle*)((_hs_p->next) ?                  \
			
 
				+                              ((void*)((char*)(_hs_p->next) +                    \
			
 
				+                              (head)->hh.tbl->hho)) : NULL);                     \
			
 
				+                      _hs_psize--;                                               \
			
 
				+                  } else if ((                                                   \
			
 
				+                      cmpfcn(DECLTYPE(head)(ELMT_FROM_HH((head)->hh.tbl,_hs_p)), \
			
 
				+                             DECLTYPE(head)(ELMT_FROM_HH((head)->hh.tbl,_hs_q))) \
			
 
				+                             ) <= 0) {                                           \
			
 
				+                      _hs_e = _hs_p;                                             \
			
 
				+                      _hs_p = (UT_hash_handle*)((_hs_p->next) ?                  \
			
 
				+                              ((void*)((char*)(_hs_p->next) +                    \
			
 
				+                              (head)->hh.tbl->hho)) : NULL);                     \
			
 
				+                      _hs_psize--;                                               \
			
 
				+                  } else {                                                       \
			
 
				+                      _hs_e = _hs_q;                                             \
			
 
				+                      _hs_q = (UT_hash_handle*)((_hs_q->next) ?                  \
			
 
				+                              ((void*)((char*)(_hs_q->next) +                    \
			
 
				+                              (head)->hh.tbl->hho)) : NULL);                     \
			
 
				+                      _hs_qsize--;                                               \
			
 
				+                  }                                                              \
			
 
				+                  if ( _hs_tail ) {                                              \
			
 
				+                      _hs_tail->next = ((_hs_e) ?                                \
			
 
				+                            ELMT_FROM_HH((head)->hh.tbl,_hs_e) : NULL);          \
			
 
				+                  } else {                                                       \
			
 
				+                      _hs_list = _hs_e;                                          \
			
 
				+                  }                                                              \
			
 
				+                  _hs_e->prev = ((_hs_tail) ?                                    \
			
 
				+                     ELMT_FROM_HH((head)->hh.tbl,_hs_tail) : NULL);              \
			
 
				+                  _hs_tail = _hs_e;                                              \
			
 
				+              }                                                                  \
			
 
				+              _hs_p = _hs_q;                                                     \
			
 
				+          }                                                                      \
			
 
				+          _hs_tail->next = NULL;                                                 \
			
 
				+          if ( _hs_nmerges <= 1 ) {                                              \
			
 
				+              _hs_looping=0;                                                     \
			
 
				+              (head)->hh.tbl->tail = _hs_tail;                                   \
			
 
				+              DECLTYPE_ASSIGN(head,ELMT_FROM_HH((head)->hh.tbl, _hs_list));      \
			
 
				+          }                                                                      \
			
 
				+          _hs_insize *= 2;                                                       \
			
 
				+      }                                                                          \
			
 
				+      HASH_FSCK(hh,head);                                                        \
			
 
				+ }                                                                               \
			
 
				+} while (0)
			
 
				+
			
 
				+/* This function selects items from one hash into another hash. 
			
 
				+ * The end result is that the selected items have dual presence 
			
 
				+ * in both hashes. There is no copy of the items made; rather 
			
 
				+ * they are added into the new hash through a secondary
			
 
				+ * hash handle that must be present in the structure. */
			
 
				+#define HASH_SELECT(hh_dst, dst, hh_src, src, cond)                              \
			
 
				+do {                                                                             \
			
 
				+  unsigned _src_bkt, _dst_bkt;                                                   \
			
 
				+  void *_last_elt=NULL, *_elt;                                                   \
			
 
				+  UT_hash_handle *_src_hh, *_dst_hh, *_last_elt_hh=NULL;                         \
			
 
				+  ptrdiff_t _dst_hho = ((char*)(&(dst)->hh_dst) - (char*)(dst));                 \
			
 
				+  if (src) {                                                                     \
			
 
				+    for(_src_bkt=0; _src_bkt < (src)->hh_src.tbl->num_buckets; _src_bkt++) {     \
			
 
				+      for(_src_hh = (src)->hh_src.tbl->buckets[_src_bkt].hh_head;                \
			
 
				+          _src_hh;                                                               \
			
 
				+          _src_hh = _src_hh->hh_next) {                                          \
			
 
				+          _elt = ELMT_FROM_HH((src)->hh_src.tbl, _src_hh);                       \
			
 
				+          if (cond(_elt)) {                                                      \
			
 
				+            _dst_hh = (UT_hash_handle*)(((char*)_elt) + _dst_hho);               \
			
 
				+            _dst_hh->key = _src_hh->key;                                         \
			
 
				+            _dst_hh->keylen = _src_hh->keylen;                                   \
			
 
				+            _dst_hh->hashv = _src_hh->hashv;                                     \
			
 
				+            _dst_hh->prev = _last_elt;                                           \
			
 
				+            _dst_hh->next = NULL;                                                \
			
 
				+            if (_last_elt_hh) { _last_elt_hh->next = _elt; }                     \
			
 
				+            if (!dst) {                                                          \
			
 
				+              DECLTYPE_ASSIGN(dst,_elt);                                         \
			
 
				+              HASH_MAKE_TABLE(hh_dst,dst);                                       \
			
 
				+            } else {                                                             \
			
 
				+              _dst_hh->tbl = (dst)->hh_dst.tbl;                                  \
			
 
				+            }                                                                    \
			
 
				+            HASH_TO_BKT(_dst_hh->hashv, _dst_hh->tbl->num_buckets, _dst_bkt);    \
			
 
				+            HASH_ADD_TO_BKT(_dst_hh->tbl->buckets[_dst_bkt],_dst_hh);            \
			
 
				+            (dst)->hh_dst.tbl->num_items++;                                      \
			
 
				+            _last_elt = _elt;                                                    \
			
 
				+            _last_elt_hh = _dst_hh;                                              \
			
 
				+          }                                                                      \
			
 
				+      }                                                                          \
			
 
				+    }                                                                            \
			
 
				+  }                                                                              \
			
 
				+  HASH_FSCK(hh_dst,dst);                                                         \
			
 
				+} while (0)
			
 
				+
			
 
				+#define HASH_CLEAR(hh,head)                                                      \
			
 
				+do {                                                                             \
			
 
				+  if (head) {                                                                    \
			
 
				+    uthash_free((head)->hh.tbl->buckets,                                         \
			
 
				+                (head)->hh.tbl->num_buckets*sizeof(struct UT_hash_bucket));      \
			
 
				+    uthash_free((head)->hh.tbl, sizeof(UT_hash_table));                          \
			
 
				+    (head)=NULL;                                                                 \
			
 
				+  }                                                                              \
			
 
				+} while(0)
			
 
				+
			
 
				+#ifdef NO_DECLTYPE
			
 
				+#define HASH_ITER(hh,head,el,tmp)                                                \
			
 
				+for((el)=(head), (*(char**)(&(tmp)))=(char*)((head)?(head)->hh.next:NULL);       \
			
 
				+  el; (el)=(tmp),(*(char**)(&(tmp)))=(char*)((tmp)?(tmp)->hh.next:NULL)) 
			
 
				+#else
			
 
				+#define HASH_ITER(hh,head,el,tmp)                                                \
			
 
				+for((el)=(head),(tmp)=DECLTYPE(el)((head)?(head)->hh.next:NULL);                 \
			
 
				+  el; (el)=(tmp),(tmp)=DECLTYPE(el)((tmp)?(tmp)->hh.next:NULL))
			
 
				+#endif
			
 
				+
			
 
				+/* obtain a count of items in the hash */
			
 
				+#define HASH_COUNT(head) HASH_CNT(hh,head) 
			
 
				+#define HASH_CNT(hh,head) ((head)?((head)->hh.tbl->num_items):0)
			
 
				+
			
 
				+typedef struct UT_hash_bucket {
			
 
				+   struct UT_hash_handle *hh_head;
			
 
				+   unsigned count;
			
 
				+
			
 
				+   /* expand_mult is normally set to 0. In this situation, the max chain length
			
 
				+    * threshold is enforced at its default value, HASH_BKT_CAPACITY_THRESH. (If
			
 
				+    * the bucket's chain reaches this length, bucket expansion is triggered).
			
 
				+    * However, setting expand_mult to a non-zero value delays bucket expansion
			
 
				+    * (that would be triggered by additions to this particular bucket)
			
 
				+    * until its chain length reaches a *multiple* of HASH_BKT_CAPACITY_THRESH.
			
 
				+    * (The multiplier is simply expand_mult+1). The whole idea of this
			
 
				+    * multiplier is to reduce bucket expansions, since they are expensive, in
			
 
				+    * situations where we know that a particular bucket tends to be overused.
			
 
				+    * It is better to let its chain length grow to a longer yet-still-bounded
			
 
				+    * value, than to do an O(n) bucket expansion too often. 
			
 
				+    */
			
 
				+   unsigned expand_mult;
			
 
				+
			
 
				+} UT_hash_bucket;
			
 
				+
			
 
				+/* random signature used only to find hash tables in external analysis */
			
 
				+#define HASH_SIGNATURE 0xa0111fe1
			
 
				+#define HASH_BLOOM_SIGNATURE 0xb12220f2
			
 
				+
			
 
				+typedef struct UT_hash_table {
			
 
				+   UT_hash_bucket *buckets;
			
 
				+   unsigned num_buckets, log2_num_buckets;
			
 
				+   unsigned num_items;
			
 
				+   struct UT_hash_handle *tail; /* tail hh in app order, for fast append    */
			
 
				+   ptrdiff_t hho; /* hash handle offset (byte pos of hash handle in element) */
			
 
				+
			
 
				+   /* in an ideal situation (all buckets used equally), no bucket would have
			
 
				+    * more than ceil(#items/#buckets) items. that's the ideal chain length. */
			
 
				+   unsigned ideal_chain_maxlen;
			
 
				+
			
 
				+   /* nonideal_items is the number of items in the hash whose chain position
			
 
				+    * exceeds the ideal chain maxlen. these items pay the penalty for an uneven
			
 
				+    * hash distribution; reaching them in a chain traversal takes >ideal steps */
			
 
				+   unsigned nonideal_items;
			
 
				+
			
 
				+   /* ineffective expands occur when a bucket doubling was performed, but 
			
 
				+    * afterward, more than half the items in the hash had nonideal chain
			
 
				+    * positions. If this happens on two consecutive expansions we inhibit any
			
 
				+    * further expansion, as it's not helping; this happens when the hash
			
 
				+    * function isn't a good fit for the key domain. When expansion is inhibited
			
 
				+    * the hash will still work, albeit no longer in constant time. */
			
 
				+   unsigned ineff_expands, noexpand;
			
 
				+
			
 
				+   uint32_t signature; /* used only to find hash tables in external analysis */
			
 
				+#ifdef HASH_BLOOM
			
 
				+   uint32_t bloom_sig; /* used only to test bloom exists in external analysis */
			
 
				+   uint8_t *bloom_bv;
			
 
				+   char bloom_nbits;
			
 
				+#endif
			
 
				+
			
 
				+} UT_hash_table;
			
 
				+
			
 
				+typedef struct UT_hash_handle {
			
 
				+   struct UT_hash_table *tbl;
			
 
				+   void *prev;                       /* prev element in app order      */
			
 
				+   void *next;                       /* next element in app order      */
			
 
				+   struct UT_hash_handle *hh_prev;   /* previous hh in bucket order    */
			
 
				+   struct UT_hash_handle *hh_next;   /* next hh in bucket order        */
			
 
				+   void *key;                        /* ptr to enclosing struct's key  */
			
 
				+   unsigned keylen;                  /* enclosing struct's key len     */
			
 
				+   unsigned hashv;                   /* result of hash-fcn(key)        */
			
 
				+} UT_hash_handle;
			
 
				+
			
 
				+#endif /* UTHASH_H */
			
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -27,15 +27,15 @@
 
				 #include <stdlib.h>
			
 
				 
			
 
				 #ifdef STARPU_VERBOSE
			
 
				-#  define _STARPU_DEBUG(fmt, args ...) do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, "[starpu][%s] " fmt ,__func__ ,##args); }} while(0)
			
 
				+#  define _STARPU_DEBUG(fmt, args ...) do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, "[starpu][%s] " fmt ,__func__ ,##args); fflush(stderr); }} while(0)
			
 
				 #else
			
 
				 #  define _STARPU_DEBUG(fmt, args ...)
			
 
				 #endif
			
 
				 
			
 
				 #ifdef STARPU_VERBOSE0
			
 
				-#  define _STARPU_LOG_IN()             fprintf(stderr, "[starpu][%ld][%s] -->\n", pthread_self(), __func__ );
			
 
				-#  define _STARPU_LOG_OUT()            fprintf(stderr, "[starpu][%ld][%s] <--\n", pthread_self(), __func__ );
			
 
				-#  define _STARPU_LOG_OUT_TAG(outtag)  fprintf(stderr, "[starpu][%ld][%s] <-- (%s)\n", pthread_self(), __func__, outtag);
			
 
				+#  define _STARPU_LOG_IN()             do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, "[starpu][%ld][%s] -->\n", pthread_self(), __func__ ); }} while(0)
			
 
				+#  define _STARPU_LOG_OUT()            do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, "[starpu][%ld][%s] <--\n", pthread_self(), __func__ ); }} while(0)
			
 
				+#  define _STARPU_LOG_OUT_TAG(outtag)  do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, "[starpu][%ld][%s] <-- (%s)\n", pthread_self(), __func__, outtag); }} while(0)
			
 
				 #else
			
 
				 #  define _STARPU_LOG_IN()
			
 
				 #  define _STARPU_LOG_OUT()
			
--- a/src/core/combined_workers.c
+++ b/src/core/combined_workers.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -123,6 +123,7 @@ int starpu_combined_worker_assign_workerid(int nworkers, int workerid_array[])
 
				 
			
 
				 	for (i = 0; i < nworkers; i++)
			
 
				 	{
			
 
				+#if defined(__GLIBC__) || defined(STARPU_HAVE_HWLOC)
			
 
				 		int id = workerid_array[i];
			
 
				 #ifdef __GLIBC__
			
 
				 #ifdef CPU_OR
			
@@ -143,6 +144,7 @@ int starpu_combined_worker_assign_workerid(int nworkers, int workerid_array[])
 
				 				combined_worker->hwloc_cpu_set,
			
 
				 				config->workers[id].initial_hwloc_cpu_set);
			
 
				 #endif
			
 
				+#endif
			
 
				 	}
			
 
				 
			
 
				 	return new_workerid;
			
--- a/src/core/debug.c
+++ b/src/core/debug.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2009-2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -25,6 +25,12 @@ static pthread_mutex_t logfile_mutex = PTHREAD_MUTEX_INITIALIZER;
 
				 static FILE *logfile;
			
 
				 #endif
			
 
				 
			
 
				+int _starpu_use_fxt
			
 
				+#ifdef STARPU_USE_FXT
			
 
				+	= 1
			
 
				+#endif
			
 
				+	;
			
 
				+
			
 
				 void _starpu_open_debug_logfile(void)
			
 
				 {
			
 
				 #ifdef STARPU_VERBOSE
			
@@ -49,7 +55,7 @@ void _starpu_close_debug_logfile(void)
 
				 #endif
			
 
				 }
			
 
				 
			
 
				-void _starpu_print_to_logfile(const char *format __attribute__((unused)), ...)
			
 
				+void _starpu_print_to_logfile(const char *format STARPU_ATTRIBUTE_UNUSED, ...)
			
 
				 {
			
 
				 #ifdef STARPU_VERBOSE
			
 
				 	va_list args;
			
--- a/src/core/debug.h
+++ b/src/core/debug.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009-2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -34,4 +34,7 @@ void _starpu_close_debug_logfile(void);
 
				 /* Write into StarPU's log file */
			
 
				 void _starpu_print_to_logfile(const char *format, ...);
			
 
				 
			
 
				+/* Tell gdb whether FXT is compiled in or not */
			
 
				+extern int _starpu_use_fxt;
			
 
				+
			
 
				 #endif // __DEBUG_H__
			
--- a/src/core/dependencies/cg.c
+++ b/src/core/dependencies/cg.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -111,7 +111,9 @@ void _starpu_notify_cg(starpu_cg_t *cg)
 
				 	
			
 
				 				tag_successors->ndeps_completed++;
			
 
				 
			
 
				+#ifdef STARPU_DEVEL
			
 
				 #warning FIXME: who locks this?
			
 
				+#endif
			
 
				 				if ((tag->state == STARPU_BLOCKED) &&
			
 
				 					(tag_successors->ndeps == tag_successors->ndeps_completed)) {
			
 
				 					/* reset the counter so that we can reuse the completion group */
			
@@ -158,7 +160,7 @@ void _starpu_notify_cg_list(struct starpu_cg_list_s *successors)
 
				 		struct starpu_cg_s *cg = successors->succ[succ];
			
 
				 		STARPU_ASSERT(cg);
			
 
				 
			
 
				-		struct starpu_tag_s *cgtag;
			
 
				+		struct starpu_tag_s *cgtag = NULL;
			
 
				 
			
 
				 		unsigned cg_type = cg->cg_type;
			
 
				 
			
--- a/src/core/dependencies/implicit_data_deps.c
+++ b/src/core/dependencies/implicit_data_deps.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -182,7 +182,7 @@ static void disable_last_writer_callback(void *cl_arg)
 
				  * */
			
 
				 /* NB : handle->sequential_consistency_mutex must be hold by the caller */
			
 
				 void _starpu_detect_implicit_data_deps_with_handle(struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task,
			
 
				-						   starpu_data_handle handle, starpu_access_mode mode)
			
 
				+						starpu_data_handle handle, starpu_access_mode mode)
			
 
				 {
			
 
				 	STARPU_ASSERT(!(mode & STARPU_SCRATCH));
			
 
				         _STARPU_LOG_IN();
			
@@ -233,7 +233,7 @@ void _starpu_detect_implicit_data_deps_with_handle(struct starpu_task *pre_sync_
 
				 	
			
 
				 		}
			
 
				 		else {
			
 
				-			_STARPU_DEP_DEBUG("R %p\n", handle);
			
 
				+			_STARPU_DEP_DEBUG("R %p %d -> %d\n", handle, previous_mode, mode);
			
 
				 			/* Add a reader, after a writer or a reader. */
			
 
				 			STARPU_ASSERT(pre_sync_task);
			
 
				 			STARPU_ASSERT(post_sync_task);
			
@@ -255,7 +255,10 @@ void _starpu_detect_implicit_data_deps_with_handle(struct starpu_task *pre_sync_
 
				 				new_sync_task->cl = NULL;
			
 
				 				new_sync_task->callback_func = disable_last_writer_callback;
			
 
				 				new_sync_task->callback_arg = handle;
			
 
				-				
			
 
				+#ifdef STARPU_USE_FXT
			
 
				+				_starpu_get_job_associated_to_task(new_sync_task)->model_name = "sync_task_redux";
			
 
				+#endif
			
 
				+
			
 
				 				_starpu_add_writer_after_readers(handle, new_sync_task, new_sync_task);
			
 
				 
			
 
				 				starpu_task_submit(new_sync_task);
			
@@ -461,6 +464,9 @@ int _starpu_data_wait_until_available(starpu_data_handle handle, starpu_access_m
 
				 		sync_task = starpu_task_create();
			
 
				 		sync_task->detach = 0;
			
 
				 		sync_task->destroy = 1;
			
 
				+#ifdef STARPU_USE_FXT
			
 
				+		_starpu_get_job_associated_to_task(sync_task)->model_name = "sync_task";
			
 
				+#endif
			
 
				 
			
 
				 		/* It is not really a RW access, but we want to make sure that
			
 
				 		 * all previous accesses are done */
			
--- a/src/core/dependencies/implicit_data_deps.h
+++ b/src/core/dependencies/implicit_data_deps.h
@@ -22,7 +22,7 @@
 
				 #include <common/config.h>
			
 
				 
			
 
				 void _starpu_detect_implicit_data_deps_with_handle(struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task,
			
 
				-						   starpu_data_handle handle, starpu_access_mode mode);
			
 
				+						starpu_data_handle handle, starpu_access_mode mode);
			
 
				 void _starpu_detect_implicit_data_deps(struct starpu_task *task);
			
 
				 void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *task, starpu_data_handle handle);
			
 
				 
			
--- a/src/core/dependencies/tags.c
+++ b/src/core/dependencies/tags.c
@@ -328,10 +328,8 @@ int starpu_tag_wait_array(unsigned ntags, starpu_tag_t *id)
 
				 
			
 
				 	PTHREAD_MUTEX_LOCK(&cg->succ.succ_apps.cg_mutex);
			
 
				 
			
 
				-	while (!cg->succ.succ_apps.completed){
			
 
				-	  //	  printf("cond wait\n");
			
 
				+	while (!cg->succ.succ_apps.completed)
			
 
				 		PTHREAD_COND_WAIT(&cg->succ.succ_apps.cg_cond, &cg->succ.succ_apps.cg_mutex);
			
 
				-	}
			
 
				 
			
 
				 	PTHREAD_MUTEX_UNLOCK(&cg->succ.succ_apps.cg_mutex);
			
 
				 
			
--- a/src/core/jobs.c
+++ b/src/core/jobs.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -24,6 +24,7 @@
 
				 #include <common/utils.h>
			
 
				 #include <profiling/profiling.h>
			
 
				 #include <profiling/bound.h>
			
 
				+#include <starpu_top.h>
			
 
				 
			
 
				 size_t _starpu_job_get_data_size(starpu_job_t j)
			
 
				 {
			
@@ -68,7 +69,7 @@ starpu_job_t __attribute__((malloc)) _starpu_job_create(struct starpu_task *task
 
				 	job->terminated = 0;
			
 
				 
			
 
				 #ifndef STARPU_USE_FXT
			
 
				-	if (_starpu_bound_recording)
			
 
				+	if (_starpu_bound_recording || starpu_top_status_get())
			
 
				 #endif
			
 
				 		job->job_id = STARPU_ATOMIC_ADD(&job_cnt, 1);
			
 
				 #ifdef STARPU_USE_FXT
			
@@ -227,7 +228,6 @@ void _starpu_handle_job_termination(starpu_job_t j, unsigned job_is_already_lock
 
				 		/* We reuse the same job structure */
			
 
				 		int ret = _starpu_submit_job(j, 1);
			
 
				 		STARPU_ASSERT(!ret);
			
 
				-		printf("did not decrement\n");
			
 
				 	}	
			
 
				 	else {
			
 
				 		_starpu_decrement_nsubmitted_tasks();
			
@@ -270,7 +270,9 @@ static unsigned _starpu_not_all_tag_deps_are_fulfilled(starpu_job_t j)
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				+#ifdef STARPU_DEVEL
			
 
				 #warning TODO remove the job_is_already_locked parameter
			
 
				+#endif
			
 
				 static unsigned _starpu_not_all_task_deps_are_fulfilled(starpu_job_t j, unsigned job_is_already_locked)
			
 
				 {
			
 
				 	unsigned ret;
			
@@ -304,7 +306,9 @@ static unsigned _starpu_not_all_task_deps_are_fulfilled(starpu_job_t j, unsigned
 
				  *	In order, we enforce tag, task and data dependencies. The task is
			
 
				  *	passed to the scheduler only once all these constraints are fulfilled.
			
 
				  */
			
 
				+#ifdef STARPU_DEVEL
			
 
				 #warning TODO remove the job_is_already_locked parameter
			
 
				+#endif
			
 
				 unsigned _starpu_enforce_deps_and_schedule(starpu_job_t j, unsigned job_is_already_locked)
			
 
				 {
			
 
				 	unsigned ret;
			
@@ -335,7 +339,9 @@ unsigned _starpu_enforce_deps_and_schedule(starpu_job_t j, unsigned job_is_alrea
 
				 }
			
 
				 
			
 
				 /* Tag deps are already fulfilled */
			
 
				+#ifdef STARPU_DEVEL
			
 
				 #warning TODO remove the job_is_already_locked parameter
			
 
				+#endif
			
 
				 unsigned _starpu_enforce_deps_starting_from_task(starpu_job_t j, unsigned job_is_already_locked)
			
 
				 {
			
 
				 	unsigned ret;
			
@@ -372,6 +378,7 @@ int _starpu_push_local_task(struct starpu_worker_s *worker, struct starpu_task *
 
				 		return -ENODEV;
			
 
				 
			
 
				 	PTHREAD_MUTEX_LOCK(worker->sched_mutex);
			
 
				+
			
 
				 	if (back)
			
 
				 		starpu_task_list_push_back(&worker->local_tasks, task);
			
 
				 	else
			
@@ -393,10 +400,11 @@ const char *_starpu_get_model_name(starpu_job_t j)
 
				             && task->cl->model
			
 
				             && task->cl->model->symbol)
			
 
				                 return task->cl->model->symbol;
			
 
				-#ifdef STARPU_USE_FXT
			
 
				         else {
			
 
				+#ifdef STARPU_USE_FXT
			
 
				                 return j->model_name;
			
 
				-        }
			
 
				+#else
			
 
				+                return NULL;
			
 
				 #endif
			
 
				-        return NULL;
			
 
				+        }
			
 
				 }
			
--- a/src/core/jobs.h
+++ b/src/core/jobs.h
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -35,6 +35,7 @@
 
				 #include <datawizard/datawizard.h>
			
 
				 #include <core/perfmodel/perfmodel.h>
			
 
				 #include <core/errorcheck.h>
			
 
				+#include <common/barrier.h>
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 #include <cuda.h>
			
--- a/src/core/perfmodel/perfmodel.c
+++ b/src/core/perfmodel/perfmodel.c
@@ -72,14 +72,6 @@ static double per_arch_task_expected_perf(struct starpu_perfmodel_t *model, enum
 
				 	double exp = -1.0;
			
 
				 	double (*per_arch_cost_model)(struct starpu_buffer_descr_t *);
			
 
				 	
			
 
				-	if (!model->is_loaded)
			
 
				-	{
			
 
				-		model->benchmarking = _starpu_get_calibrate_flag();
			
 
				-		
			
 
				-		_starpu_register_model(model);
			
 
				-		model->is_loaded = 1;
			
 
				-	}
			
 
				-
			
 
				 	per_arch_cost_model = model->per_arch[arch].cost_model;
			
 
				 
			
 
				 	if (per_arch_cost_model)
			
@@ -134,6 +126,33 @@ static double common_task_expected_perf(struct starpu_perfmodel_t *model, enum s
 
				 	return -1.0;
			
 
				 }
			
 
				 
			
 
				+void _starpu_load_perfmodel(struct starpu_perfmodel_t *model)
			
 
				+{
			
 
				+	if (!model || model->is_loaded)
			
 
				+		return;
			
 
				+
			
 
				+	switch (model->type) {
			
 
				+		case STARPU_PER_ARCH:
			
 
				+		case STARPU_COMMON:
			
 
				+			break;
			
 
				+
			
 
				+		case STARPU_HISTORY_BASED:
			
 
				+		case STARPU_NL_REGRESSION_BASED:
			
 
				+			_starpu_load_history_based_model(model, 1);
			
 
				+			break;
			
 
				+
			
 
				+		case STARPU_REGRESSION_BASED:
			
 
				+			_starpu_load_history_based_model(model, 0);
			
 
				+			break;
			
 
				+
			
 
				+		default:
			
 
				+			STARPU_ABORT();
			
 
				+	}
			
 
				+
			
 
				+	_starpu_register_model(model);
			
 
				+	model->is_loaded = 1;
			
 
				+}
			
 
				+
			
 
				 static double starpu_model_expected_perf(struct starpu_task *task, struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch)
			
 
				 {
			
 
				 	if (model) {
			
@@ -146,8 +165,8 @@ static double starpu_model_expected_perf(struct starpu_task *task, struct starpu
 
				 				return common_task_expected_perf(model, arch, task);
			
 
				 
			
 
				 			case STARPU_HISTORY_BASED:
			
 
				-			    return _starpu_history_based_job_expected_perf(model, arch, j);
			
 
				-			  
			
 
				+				return _starpu_history_based_job_expected_perf(model, arch, j);
			
 
				+
			
 
				 			case STARPU_REGRESSION_BASED:
			
 
				 				return _starpu_regression_based_job_expected_perf(model, arch, j);
			
 
				 
			
--- a/src/core/perfmodel/perfmodel.h
+++ b/src/core/perfmodel/perfmodel.h
@@ -93,6 +93,8 @@ void _starpu_get_perf_model_dir_debug(char *path, size_t maxlen);
 
				 
			
 
				 double _starpu_history_based_job_expected_perf(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, struct starpu_job_s *j);
			
 
				 void _starpu_register_model(struct starpu_perfmodel_t *model);
			
 
				+void _starpu_load_history_based_model(struct starpu_perfmodel_t *model, unsigned scan_history);
			
 
				+void _starpu_load_perfmodel(struct starpu_perfmodel_t *model);
			
 
				 void _starpu_initialize_registered_performance_models(void);
			
 
				 void _starpu_deinitialize_registered_performance_models(void);
			
 
				 
			
@@ -111,8 +113,6 @@ double _starpu_predict_transfer_time(unsigned src_node, unsigned dst_node, size_
 
				 void _starpu_set_calibrate_flag(unsigned val);
			
 
				 unsigned _starpu_get_calibrate_flag(void);
			
 
				 
			
 
				-enum starpu_perf_archtype starpu_worker_get_perf_archtype(int workerid);
			
 
				-
			
 
				 #if defined(STARPU_USE_CUDA)
			
 
				 int *_starpu_get_cuda_affinity_vector(unsigned gpuid);
			
 
				 #endif
			
--- a/src/core/perfmodel/perfmodel_bus.c
+++ b/src/core/perfmodel/perfmodel_bus.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -27,6 +27,7 @@
 
				 #include <math.h>
			
 
				 
			
 
				 #include <starpu.h>
			
 
				+#include <starpu_cuda.h>
			
 
				 #include <starpu_opencl.h>
			
 
				 #include <common/config.h>
			
 
				 #include <core/workers.h>
			
@@ -65,6 +66,7 @@ static int cuda_affinity_matrix[STARPU_MAXCUDADEVS][MAXCPUS];
 
				 static double cudadev_timing_htod[STARPU_MAXNODES] = {0.0};
			
 
				 static double cudadev_timing_dtoh[STARPU_MAXNODES] = {0.0};
			
 
				 static struct dev_timing cudadev_timing_per_cpu[STARPU_MAXNODES*MAXCPUS];
			
 
				+static size_t cuda_size = SIZE;
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 static int opencl_affinity_matrix[STARPU_MAXOPENCLDEVS][MAXCPUS];
			
@@ -98,10 +100,16 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 
				 	/* hack to avoid third party libs to rebind threads */
			
 
				 	_starpu_bind_thread_on_cpu(config, cpu);
			
 
				 
			
 
				+        /* Get the maximum size which can be allocated on the device */
			
 
				+	struct cudaDeviceProp prop;
			
 
				+	cudaError_t cures;
			
 
				+	cures = cudaGetDeviceProperties(&prop, dev);
			
 
				+	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+        if (cuda_size > prop.totalGlobalMem/4) cuda_size = prop.totalGlobalMem/4;
			
 
				 
			
 
				 	/* Allocate a buffer on the device */
			
 
				 	unsigned char *d_buffer;
			
 
				-	cudaMalloc((void **)&d_buffer, SIZE);
			
 
				+	cudaMalloc((void **)&d_buffer, cuda_size);
			
 
				 	assert(d_buffer);
			
 
				 
			
 
				 	/* hack to avoid third party libs to rebind threads */
			
@@ -110,7 +118,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 
				 
			
 
				 	/* Allocate a buffer on the host */
			
 
				 	unsigned char *h_buffer;
			
 
				-	cudaHostAlloc((void **)&h_buffer, SIZE, 0);
			
 
				+	cudaHostAlloc((void **)&h_buffer, cuda_size, 0);
			
 
				 	assert(h_buffer);
			
 
				 
			
 
				 	/* hack to avoid third party libs to rebind threads */
			
@@ -118,8 +126,8 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 
				 
			
 
				 
			
 
				 	/* Fill them */
			
 
				-	memset(h_buffer, 0, SIZE);
			
 
				-	cudaMemset(d_buffer, 0, SIZE);
			
 
				+	memset(h_buffer, 0, cuda_size);
			
 
				+	cudaMemset(d_buffer, 0, cuda_size);
			
 
				 
			
 
				 	/* hack to avoid third party libs to rebind threads */
			
 
				 	_starpu_bind_thread_on_cpu(config, cpu);
			
@@ -134,7 +142,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 
				 	gettimeofday(&start, NULL);
			
 
				 	for (iter = 0; iter < NITER; iter++)
			
 
				 	{
			
 
				-		cudaMemcpy(d_buffer, h_buffer, SIZE, cudaMemcpyHostToDevice);
			
 
				+		cudaMemcpy(d_buffer, h_buffer, cuda_size, cudaMemcpyHostToDevice);
			
 
				 		cudaThreadSynchronize();
			
 
				 	}
			
 
				 	gettimeofday(&end, NULL);
			
@@ -146,7 +154,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 
				 	gettimeofday(&start, NULL);
			
 
				 	for (iter = 0; iter < NITER; iter++)
			
 
				 	{
			
 
				-		cudaMemcpy(h_buffer, d_buffer, SIZE, cudaMemcpyDeviceToHost);
			
 
				+		cudaMemcpy(h_buffer, d_buffer, cuda_size, cudaMemcpyDeviceToHost);
			
 
				 		cudaThreadSynchronize();
			
 
				 	}
			
 
				 	gettimeofday(&end, NULL);
			
@@ -183,7 +191,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(int dev, i
 
				         starpu_opencl_get_device(dev, &device);
			
 
				 	err = clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(maxMemAllocSize), &maxMemAllocSize, NULL);
			
 
				         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-        if (opencl_size > (size_t)maxMemAllocSize) opencl_size = maxMemAllocSize;
			
 
				+        if (opencl_size > (size_t)maxMemAllocSize/4) opencl_size = maxMemAllocSize/4;
			
 
				 
			
 
				 	/* hack to avoid third party libs to rebind threads */
			
 
				 	_starpu_bind_thread_on_cpu(config, cpu);
			
@@ -378,12 +386,12 @@ static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_h
 
				 
			
 
				 		double bandwidth_sum2 = bandwidth_dtoh*bandwidth_dtoh + bandwidth_htod*bandwidth_htod;
			
 
				 
			
 
				-		_STARPU_DISP("BANDWIDTH GPU %d CPU %d - htod %lf - dtoh %lf - %lf\n", dev, current_cpu, bandwidth_htod, bandwidth_dtoh, sqrt(bandwidth_sum2));
			
 
				+		_STARPU_DISP("BANDWIDTH GPU %d CPU %u - htod %f - dtoh %f - %f\n", dev, current_cpu, bandwidth_htod, bandwidth_dtoh, sqrt(bandwidth_sum2));
			
 
				 	}
			
 
				 
			
 
				 	unsigned best_cpu = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].cpu_id;
			
 
				 
			
 
				-	_STARPU_DISP("BANDWIDTH GPU %d BEST CPU %d\n", dev, best_cpu);
			
 
				+	_STARPU_DISP("BANDWIDTH GPU %d BEST CPU %u\n", dev, best_cpu);
			
 
				 #endif
			
 
				 
			
 
				 	/* The results are sorted in a decreasing order, so that the best
			
@@ -396,7 +404,7 @@ static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_h
 
				 static void benchmark_all_gpu_devices(void)
			
 
				 {
			
 
				 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
			
 
				-	int i, ret;
			
 
				+	int i;
			
 
				 
			
 
				 	_STARPU_DEBUG("Benchmarking the speed of the bus\n");
			
 
				 
			
@@ -409,6 +417,7 @@ static void benchmark_all_gpu_devices(void)
 
				 #ifdef __linux__
			
 
				 	/* Save the current cpu binding */
			
 
				 	cpu_set_t former_process_affinity;
			
 
				+	int ret;
			
 
				 	ret = sched_getaffinity(0, sizeof(former_process_affinity), &former_process_affinity);
			
 
				 	if (ret)
			
 
				 	{
			
@@ -466,7 +475,11 @@ static void get_bus_path(const char *type, char *path, size_t maxlen)
 
				 	strncat(path, type, maxlen);
			
 
				 
			
 
				 	char hostname[32];
			
 
				-	gethostname(hostname, 32);
			
 
				+	char *forced_hostname = getenv("STARPU_HOSTNAME");
			
 
				+	if (forced_hostname && forced_hostname[0])
			
 
				+		snprintf(hostname, sizeof(hostname), "%s", forced_hostname); /* env value is data, never the format string */
			
 
				+	else
			
 
				+		gethostname(hostname, sizeof(hostname));
			
 
				 	strncat(path, ".", maxlen);
			
 
				 	strncat(path, hostname, maxlen);
			
 
				 }
			
@@ -574,7 +587,7 @@ static void write_bus_affinity_file_content(void)
 
				 
			
 
				         fprintf(f, "# GPU\t");
			
 
				 	for (cpu = 0; cpu < ncpus; cpu++)
			
 
				-		fprintf(f, "CPU%d\t", cpu);
			
 
				+		fprintf(f, "CPU%u\t", cpu);
			
 
				 	fprintf(f, "\n");
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
@@ -656,7 +669,7 @@ static void get_latency_path(char *path, size_t maxlen)
 
				 	get_bus_path("latency", path, maxlen);
			
 
				 }
			
 
				 
			
 
				-static void load_bus_latency_file_content(void)
			
 
				+static int load_bus_latency_file_content(void)
			
 
				 {
			
 
				 	int n;
			
 
				 	unsigned src, dst;
			
@@ -675,17 +688,29 @@ static void load_bus_latency_file_content(void)
 
				 		{
			
 
				 			double latency;
			
 
				 
			
 
				-			n = fscanf(f, "%lf\t", &latency);
			
 
				-			STARPU_ASSERT(n == 1);
			
 
				+			n = fscanf(f, "%lf", &latency);
			
 
				+			if (n != 1) {
			
 
				+				fclose(f);
			
 
				+				return 0;
			
 
				+			}
			
 
				+			n = getc(f);
			
 
				+			if (n != '\t') {
			
 
				+				fclose(f);
			
 
				+				return 0;
			
 
				+			}
			
 
				 
			
 
				 			latency_matrix[src][dst] = latency;
			
 
				 		}
			
 
				 
			
 
				-		n = fscanf(f, "\n");
			
 
				-		STARPU_ASSERT(n == 0);
			
 
				+		n = getc(f);
			
 
				+		if (n != '\n') {
			
 
				+			fclose(f);
			
 
				+			return 0;
			
 
				+		}
			
 
				 	}
			
 
				 
			
 
				 	fclose(f);
			
 
				+	return 1;
			
 
				 }
			
 
				 
			
 
				 static void write_bus_latency_file_content(void)
			
@@ -735,7 +760,7 @@ static void write_bus_latency_file_content(void)
 
				                                 latency = ((src && dst)?2000.0:500.0);
			
 
				 			}
			
 
				 
			
 
				-			fprintf(f, "%lf\t", latency);
			
 
				+			fprintf(f, "%f\t", latency);
			
 
				 		}
			
 
				 
			
 
				 		fprintf(f, "\n");
			
@@ -760,13 +785,12 @@ static void load_bus_latency_file(void)
 
				 	get_latency_path(path, 256);
			
 
				 
			
 
				 	res = access(path, F_OK);
			
 
				-	if (res)
			
 
				+	if (res || !load_bus_latency_file_content())
			
 
				 	{
			
 
				-		/* File does not exist yet */
			
 
				+		/* File does not exist yet or is bogus */
			
 
				 		generate_bus_latency_file();
			
 
				 	}
			
 
				 
			
 
				-	load_bus_latency_file_content();
			
 
				 }
			
 
				 
			
 
				 
			
@@ -778,7 +802,7 @@ static void get_bandwidth_path(char *path, size_t maxlen)
 
				 	get_bus_path("bandwidth", path, maxlen);
			
 
				 }
			
 
				 
			
 
				-static void load_bus_bandwidth_file_content(void)
			
 
				+static int load_bus_bandwidth_file_content(void)
			
 
				 {
			
 
				 	int n;
			
 
				 	unsigned src, dst;
			
@@ -803,17 +827,30 @@ static void load_bus_bandwidth_file_content(void)
 
				 		{
			
 
				 			double bandwidth;
			
 
				 
			
 
				-			n = fscanf(f, "%lf\t", &bandwidth);
			
 
				-			STARPU_ASSERT(n == 1);
			
 
				+			n = fscanf(f, "%lf", &bandwidth);
			
 
				+			if (n != 1) {
			
 
				+				fprintf(stderr,"didn't get a number\n");
			
 
				+				fclose(f);
			
 
				+				return 0;
			
 
				+			}
			
 
				+			n = getc(f);
			
 
				+			if (n != '\t') {
			
 
				+				fclose(f);
			
 
				+				return 0;
			
 
				+			}
			
 
				 
			
 
				 			bandwidth_matrix[src][dst] = bandwidth;
			
 
				 		}
			
 
				 
			
 
				-		n = fscanf(f, "\n");
			
 
				-		STARPU_ASSERT(n == 0);
			
 
				+		n = getc(f);
			
 
				+		if (n != '\n') {
			
 
				+			fclose(f);
			
 
				+			return 0;
			
 
				+		}
			
 
				 	}
			
 
				 
			
 
				 	fclose(f);
			
 
				+	return 1;
			
 
				 }
			
 
				 
			
 
				 static void write_bus_bandwidth_file_content(void)
			
@@ -858,7 +895,7 @@ static void write_bus_bandwidth_file_content(void)
 
				 				time_src_to_ram = (src==0)?0.0:cudadev_timing_dtoh[src];
			
 
				                                 time_ram_to_dst = (dst==0)?0.0:cudadev_timing_htod[dst];
			
 
				 				timing =time_src_to_ram + time_ram_to_dst;
			
 
				-				bandwidth = 1.0*SIZE/timing;
			
 
				+				bandwidth = 1.0*cuda_size/timing;
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				                                 if (src > ncuda)
			
@@ -875,7 +912,7 @@ static void write_bus_bandwidth_file_content(void)
 
				 			        bandwidth = 0.0;
			
 
				 			}
			
 
				 
			
 
				-			fprintf(f, "%lf\t", bandwidth);
			
 
				+			fprintf(f, "%f\t", bandwidth);
			
 
				 		}
			
 
				 
			
 
				 		fprintf(f, "\n");
			
@@ -900,13 +937,11 @@ static void load_bus_bandwidth_file(void)
 
				 	get_bandwidth_path(path, 256);
			
 
				 
			
 
				 	res = access(path, F_OK);
			
 
				-	if (res)
			
 
				+	if (res || !load_bus_bandwidth_file_content())
			
 
				 	{
			
 
				-		/* File does not exist yet */
			
 
				+		/* File does not exist yet or is bogus */
			
 
				 		generate_bus_bandwidth_file();
			
 
				 	}
			
 
				-
			
 
				-	load_bus_bandwidth_file_content();
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -961,17 +996,17 @@ static void check_bus_config_file()
 
				 
			
 
				                 // Checking if both configurations match
			
 
				                 if (read_cpus != ncpus) {
			
 
				-			fprintf(stderr, "Current configuration does not match the performance model (CPUS: (stored) %u != (current) %u), recalibrating...", read_cpus, ncpus);
			
 
				+			fprintf(stderr, "Current configuration does not match the bus performance model (CPUS: (stored) %u != (current) %u), recalibrating...", read_cpus, ncpus);
			
 
				                         starpu_force_bus_sampling();
			
 
				 			fprintf(stderr, "done\n");
			
 
				                 }
			
 
				                 else if (read_cuda != ncuda) {
			
 
				-                        fprintf(stderr, "Current configuration does not match the performance model (CUDA: (stored) %d != (current) %d), recalibrating...", read_cuda, ncuda);
			
 
				+                        fprintf(stderr, "Current configuration does not match the bus performance model (CUDA: (stored) %d != (current) %d), recalibrating...", read_cuda, ncuda);
			
 
				                         starpu_force_bus_sampling();
			
 
				 			fprintf(stderr, "done\n");
			
 
				                 }
			
 
				                 else if (read_opencl != nopencl) {
			
 
				-                        fprintf(stderr, "Current configuration does not match the performance model (OpenCL: (stored) %d != (current) %d), recalibrating...", read_opencl, nopencl);
			
 
				+                        fprintf(stderr, "Current configuration does not match the bus performance model (OpenCL: (stored) %d != (current) %d), recalibrating...", read_opencl, nopencl);
			
 
				                         starpu_force_bus_sampling();
			
 
				 			fprintf(stderr, "done\n");
			
 
				                 }
			
--- a/src/core/perfmodel/perfmodel_history.c
+++ b/src/core/perfmodel/perfmodel_history.c
@@ -282,7 +282,11 @@ static void get_model_debug_path(struct starpu_perfmodel_t *model, const char *a
 
				 	strncat(path, model->symbol, maxlen);
			
 
				 	
			
 
				 	char hostname[32];
			
 
				-	gethostname(hostname, 32);
			
 
				+	char *forced_hostname = getenv("STARPU_HOSTNAME");
			
 
				+	if (forced_hostname && forced_hostname[0])
			
 
				+		snprintf(hostname, sizeof(hostname), "%s", forced_hostname); /* env value is data, never the format string */
			
 
				+	else
			
 
				+		gethostname(hostname, sizeof(hostname));
			
 
				 	strncat(path, ".", maxlen);
			
 
				 	strncat(path, hostname, maxlen);
			
 
				 	strncat(path, ".", maxlen);
			
@@ -326,7 +330,11 @@ static void get_model_path(struct starpu_perfmodel_t *model, char *path, size_t
 
				 	strncat(path, model->symbol, maxlen);
			
 
				 	
			
 
				 	char hostname[32];
			
 
				-	gethostname(hostname, 32);
			
 
				+	char *forced_hostname = getenv("STARPU_HOSTNAME");
			
 
				+	if (forced_hostname && forced_hostname[0])
			
 
				+		snprintf(hostname, sizeof(hostname), "%s", forced_hostname); /* env value is data, never the format string */
			
 
				+	else
			
 
				+		gethostname(hostname, sizeof(hostname));
			
 
				 	strncat(path, ".", maxlen);
			
 
				 	strncat(path, hostname, maxlen);
			
 
				 }
			
@@ -392,7 +400,7 @@ void _starpu_deinitialize_registered_performance_models(void)
 
				  * was loaded or not (this is very likely to have been already loaded). If the
			
 
				  * model was not loaded yet, we take the lock in write mode, and if the model
			
 
				  * is still not loaded once we have the lock, we do load it.  */
			
 
				-static void load_history_based_model(struct starpu_perfmodel_t *model, unsigned scan_history)
			
 
				+void _starpu_load_history_based_model(struct starpu_perfmodel_t *model, unsigned scan_history)
			
 
				 {
			
 
				 
			
 
				 	STARPU_ASSERT(model);
			
@@ -496,15 +504,8 @@ int starpu_list_models(void)
 
				         dp = opendir(path);
			
 
				         if (dp != NULL) {
			
 
				                 while ((ep = readdir(dp))) {
			
 
				-#ifdef DT_REG
			
 
				-                        if (ep->d_type == DT_REG)
			
 
				-#else
			
 
				-			if (strcmp(ep->d_name, ".")
			
 
				-			 && strcmp(ep->d_name, ".."))
			
 
				-#endif
			
 
				-			{
			
 
				+                        if (strcmp(ep->d_name, ".") && strcmp(ep->d_name, ".."))
			
 
				                                 fprintf(stdout, "file: <%s>\n", ep->d_name);
			
 
				-                        }
			
 
				                 }
			
 
				                 closedir (dp);
			
 
				                 return 0;
			
@@ -608,12 +609,10 @@ double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel_t *mod
 
				 	size_t size = _starpu_job_get_data_size(j);
			
 
				 	struct starpu_regression_model_t *regmodel;
			
 
				 
			
 
				-	load_history_based_model(model, 0);
			
 
				-
			
 
				 	regmodel = &model->per_arch[arch].regression;
			
 
				 
			
 
				 	if (regmodel->valid)
			
 
				-		exp = regmodel->alpha*pow(size, regmodel->beta);
			
 
				+                exp = regmodel->alpha*pow((double)size, regmodel->beta);
			
 
				 
			
 
				 	return exp;
			
 
				 }
			
@@ -624,12 +623,10 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 
				 	size_t size = _starpu_job_get_data_size(j);
			
 
				 	struct starpu_regression_model_t *regmodel;
			
 
				 
			
 
				-	load_history_based_model(model, 0);
			
 
				-
			
 
				 	regmodel = &model->per_arch[arch].regression;
			
 
				 
			
 
				 	if (regmodel->nl_valid)
			
 
				-		exp = regmodel->a*pow(size, regmodel->b) + regmodel->c;
			
 
				+		exp = regmodel->a*pow((double)size, regmodel->b) + regmodel->c;
			
 
				 
			
 
				 	return exp;
			
 
				 }
			
@@ -641,19 +638,13 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel_t *model,
 
				 	struct starpu_history_entry_t *entry;
			
 
				 	struct starpu_htbl32_node_s *history;
			
 
				 
			
 
				-	load_history_based_model(model, 1);
			
 
				-
			
 
				-	if (STARPU_UNLIKELY(!j->footprint_is_computed))
			
 
				-		_starpu_compute_buffers_footprint(j);
			
 
				-		
			
 
				-	uint32_t key = j->footprint;
			
 
				+	uint32_t key = _starpu_compute_buffers_footprint(j);
			
 
				 
			
 
				 	per_arch_model = &model->per_arch[arch];
			
 
				 
			
 
				 	history = per_arch_model->history;
			
 
				 	if (!history)
			
 
				 		return -1.0;
			
 
				-    
			
 
				 
			
 
				 	PTHREAD_RWLOCK_RDLOCK(&model->model_rwlock);
			
 
				 	entry = _starpu_htbl_search_32(history, key);
			
@@ -662,17 +653,15 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel_t *model,
 
				 	exp = entry?entry->mean:-1.0;
			
 
				 
			
 
				 	if (entry && entry->nsample < STARPU_CALIBRATION_MINIMUM)
			
 
				-	  {
			
 
				 		/* TODO: report differently if we've scheduled really enough
			
 
				 		 * of that task and the scheduler should perhaps put it aside */
			
 
				 		/* Not calibrated enough */
			
 
				 		return -1.0;
			
 
				-	  }
			
 
				 
			
 
				 	return exp;
			
 
				 }
			
 
				 
			
 
				-void _starpu_update_perfmodel_history(starpu_job_t j, struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, unsigned cpuid __attribute__((unused)), double measured)
			
 
				+void _starpu_update_perfmodel_history(starpu_job_t j, struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, unsigned cpuid STARPU_ATTRIBUTE_UNUSED, double measured)
			
 
				 {
			
 
				 	if (model)
			
 
				 	{
			
@@ -682,7 +671,7 @@ void _starpu_update_perfmodel_history(starpu_job_t j, struct starpu_perfmodel_t
 
				 
			
 
				 		if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
			
 
				 		{
			
 
				-			uint32_t key = j->footprint;
			
 
				+			uint32_t key = _starpu_compute_buffers_footprint(j);
			
 
				 			struct starpu_history_entry_t *entry;
			
 
				 
			
 
				 			struct starpu_htbl32_node_s *history;
			
@@ -738,7 +727,7 @@ void _starpu_update_perfmodel_history(starpu_job_t j, struct starpu_perfmodel_t
 
				 			/* update the regression model */
			
 
				 			size_t job_size = _starpu_job_get_data_size(j);
			
 
				 			double logy, logx;
			
 
				-			logx = log(job_size);
			
 
				+			logx = log((double)job_size);
			
 
				 			logy = log(measured);
			
 
				 
			
 
				 			reg_model->sumlnx += logx;
			
@@ -767,7 +756,7 @@ void _starpu_update_perfmodel_history(starpu_job_t j, struct starpu_perfmodel_t
 
				 
			
 
				 		STARPU_ASSERT(j->footprint_is_computed);
			
 
				 
			
 
				-		fprintf(debug_file, "0x%x\t%lu\t%lf\t%lf\t%d\t\t", j->footprint, (unsigned long) _starpu_job_get_data_size(j), measured, task->predicted, cpuid);
			
 
				+		fprintf(debug_file, "0x%x\t%lu\t%f\t%f\t%d\t\t", j->footprint, (unsigned long) _starpu_job_get_data_size(j), measured, task->predicted, cpuid);
			
 
				 		unsigned i;
			
 
				 			
			
 
				 		for (i = 0; i < task->cl->nbuffers; i++)
			
--- a/src/core/sched_policy.c
+++ b/src/core/sched_policy.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -22,8 +22,7 @@
 
				 #include <common/utils.h>
			
 
				 #include <core/sched_policy.h>
			
 
				 #include <profiling/profiling.h>
			
 
				-
			
 
				-//static struct starpu_sched_policy_s policy;
			
 
				+#include <common/barrier.h>
			
 
				 
			
 
				 static int use_prefetch = 0;
			
 
				 
			
@@ -38,7 +37,6 @@ int starpu_get_prefetch_flag(void)
 
				 
			
 
				 extern struct starpu_sched_policy_s _starpu_sched_ws_policy;
			
 
				 extern struct starpu_sched_policy_s _starpu_sched_prio_policy;
			
 
				-extern struct starpu_sched_policy_s _starpu_sched_no_prio_policy;
			
 
				 extern struct starpu_sched_policy_s _starpu_sched_random_policy;
			
 
				 extern struct starpu_sched_policy_s _starpu_sched_dm_policy;
			
 
				 extern struct starpu_sched_policy_s _starpu_sched_dmda_policy;
			
@@ -49,12 +47,9 @@ extern struct starpu_sched_policy_s _starpu_sched_parallel_heft_policy;
 
				 extern struct starpu_sched_policy_s _starpu_sched_pgreedy_policy;
			
 
				 extern struct starpu_sched_policy_s heft_policy;
			
 
				 
			
 
				-#define NPREDEFINED_POLICIES	12
			
 
				-
			
 
				-static struct starpu_sched_policy_s *predefined_policies[NPREDEFINED_POLICIES] = {
			
 
				+static struct starpu_sched_policy_s *predefined_policies[] = {
			
 
				 	&_starpu_sched_ws_policy,
			
 
				 	&_starpu_sched_prio_policy,
			
 
				-	&_starpu_sched_no_prio_policy,
			
 
				 	&_starpu_sched_dm_policy,
			
 
				 	&_starpu_sched_dmda_policy,
			
 
				 	&heft_policy,
			
@@ -94,9 +89,8 @@ static void load_sched_policy(struct starpu_sched_policy_s *sched_policy, struct
 
				 	policy->init_sched = sched_policy->init_sched;
			
 
				 	policy->deinit_sched = sched_policy->deinit_sched;
			
 
				 	policy->push_task = sched_policy->push_task;
			
 
				-	policy->push_prio_task = sched_policy->push_prio_task;
			
 
				 	policy->pop_task = sched_policy->pop_task;
			
 
				-        policy->post_exec_hook = sched_policy->post_exec_hook;
			
 
				+    policy->post_exec_hook = sched_policy->post_exec_hook;
			
 
				 	policy->pop_every_task = sched_policy->pop_every_task;
			
 
				 	policy->push_task_notify = sched_policy->push_task_notify;
			
 
				 	policy->policy_name = sched_policy->policy_name;
			
@@ -110,7 +104,7 @@ static struct starpu_sched_policy_s *find_sched_policy_from_name(const char *pol
 
				 		return NULL;
			
 
				 
			
 
				 	unsigned i;
			
 
				-	for (i = 0; i < NPREDEFINED_POLICIES; i++)
			
 
				+	for (i = 0; i < sizeof(predefined_policies)/sizeof(predefined_policies[0]); i++)
			
 
				 	{
			
 
				 		struct starpu_sched_policy_s *p;
			
 
				 		p = predefined_policies[i];
			
@@ -122,6 +116,7 @@ static struct starpu_sched_policy_s *find_sched_policy_from_name(const char *pol
 
				 			}
			
 
				 		}
			
 
				 	}
			
 
				+	fprintf(stderr, "Warning: scheduling policy \"%s\" was not found, try \"help\" to get a list\n", policy_name);
			
 
				 
			
 
				 	/* nothing was found */
			
 
				 	return NULL;
			
@@ -135,7 +130,7 @@ static void display_sched_help_message(void)
 
				 
			
 
				 		/* display the description of all predefined policies */
			
 
				 		unsigned i;
			
 
				-		for (i = 0; i < NPREDEFINED_POLICIES; i++)
			
 
				+		for (i = 0; i < sizeof(predefined_policies)/sizeof(predefined_policies[0]); i++)
			
 
				 		{
			
 
				 			struct starpu_sched_policy_s *p;
			
 
				 			p = predefined_policies[i];
			
@@ -187,7 +182,6 @@ void _starpu_init_sched_policy(struct starpu_machine_config_s *config, struct st
 
				 	use_prefetch = starpu_get_env_number("STARPU_PREFETCH");
			
 
				 	if (use_prefetch == -1)
			
 
				 		use_prefetch = 1;
			
 
				-  
			
 
				 
			
 
				 	/* By default, we don't calibrate */
			
 
				 	unsigned do_calibrate = 0;
			
@@ -228,8 +222,8 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 
				 	int is_basic_worker = (workerid < nbasic_workers);
			
 
				 
			
 
				 	unsigned memory_node; 
			
 
				-	struct starpu_worker_s *worker;
			
 
				-	struct starpu_combined_worker_s *combined_worker;
			
 
				+	struct starpu_worker_s *worker = NULL;
			
 
				+	struct starpu_combined_worker_s *combined_worker = NULL;
			
 
				 
			
 
				 	if (is_basic_worker)
			
 
				 	{
			
@@ -447,3 +441,5 @@ int starpu_push_local_task(int workerid, struct starpu_task *task, int back)
 
				 
			
 
				 	return _starpu_push_local_task(worker, task, back);
			
 
				 }
			
 
				+
			
 
				+
			
--- a/src/core/task.c
+++ b/src/core/task.c
@@ -246,6 +246,7 @@ int starpu_task_submit_to_ctx(struct starpu_task *task, unsigned sched_ctx)
 
				                         _STARPU_LOG_OUT_TAG("ENODEV");
			
 
				 			return -ENODEV;
			
 
				                 }
			
 
				+		assert(task->cl->nbuffers <= STARPU_NMAXBUFS);
			
 
				 
			
 
				 		/* In case we require that a task should be explicitely
			
 
				 		 * executed on a specific worker, we make sure that the worker
			
@@ -256,6 +257,12 @@ int starpu_task_submit_to_ctx(struct starpu_task *task, unsigned sched_ctx)
 
				                 }
			
 
				 
			
 
				 		_starpu_detect_implicit_data_deps(task);
			
 
				+
			
 
				+		if (task->cl->model)
			
 
				+			_starpu_load_perfmodel(task->cl->model);
			
 
				+
			
 
				+		if (task->cl->power_model)
			
 
				+			_starpu_load_perfmodel(task->cl->power_model);
			
 
				 	}
			
 
				 
			
 
				 	/* If profiling is activated, we allocate a structure to store the
			
@@ -268,7 +275,8 @@ int starpu_task_submit_to_ctx(struct starpu_task *task, unsigned sched_ctx)
 
				 	/* The task is considered as block until we are sure there remains not
			
 
				 	 * dependency. */
			
 
				 	task->status = STARPU_TASK_BLOCKED;
			
 
				-	
			
 
				+
			
 
				+
			
 
				 	if (profiling)
			
 
				 		starpu_clock_gettime(&info->submit_time);
			
 
				 
			
--- a/src/core/topology.c
+++ b/src/core/topology.c
@@ -269,7 +269,7 @@ unsigned _starpu_topology_get_nhwcpu(struct starpu_machine_config_s *config)
 
				 static int _starpu_init_machine_config(struct starpu_machine_config_s *config,
			
 
				 				struct starpu_conf *user_conf)
			
 
				 {
			
 
				-	int explicitval __attribute__((unused));
			
 
				+	int explicitval STARPU_ATTRIBUTE_UNUSED;
			
 
				 	unsigned use_accelerator = 0;
			
 
				 
			
 
				 	int i;
			
@@ -440,23 +440,26 @@ static int _starpu_init_machine_config(struct starpu_machine_config_s *config,
 
				 /* we put the CPU section after the accelerator : in case there was an
			
 
				  * accelerator found, we devote one cpu */
			
 
				 #ifdef STARPU_USE_CPU
			
 
				-	explicitval = -1;
			
 
				 	if (user_conf && (user_conf->ncpus != -1)) {
			
 
				 		explicitval = user_conf->ncpus;
			
 
				 	}
			
 
				 	else {
			
 
				 		explicitval = starpu_get_env_number("STARPU_NCPUS");
			
 
				 	}
			
 
				+
			
 
				 	if (explicitval < 0) {
			
 
				-		unsigned already_busy_cpus = (topology->ngordon_spus?1:0) + topology->ncudagpus;
			
 
				+		unsigned already_busy_cpus = (topology->ngordon_spus?1:0) + topology->ncudagpus + topology->nopenclgpus;
			
 
				 		long avail_cpus = topology->nhwcpus - (use_accelerator?already_busy_cpus:0);
			
 
				-		topology->ncpus = STARPU_MIN(avail_cpus, STARPU_NMAXCPUS);
			
 
				+		if (avail_cpus < 0)
			
 
				+			avail_cpus = 0;
			
 
				+		topology->ncpus = STARPU_MIN(avail_cpus, STARPU_MAXCPUS);
			
 
				 	} else {
			
 
				 		/* use the specified value */
			
 
				 		topology->ncpus = (unsigned)explicitval;
			
 
				-		STARPU_ASSERT(topology->ncpus <= STARPU_NMAXCPUS);
			
 
				+		STARPU_ASSERT(topology->ncpus <= STARPU_MAXCPUS);
			
 
				 	}
			
 
				 	STARPU_ASSERT(topology->ncpus + topology->nworkers <= STARPU_NMAXWORKERS);
			
 
				+
			
 
				 	unsigned cpu;
			
 
				 	for (cpu = 0; cpu < topology->ncpus; cpu++)
			
 
				 	{
			
@@ -596,7 +599,7 @@ static inline int _starpu_get_next_bindid(struct starpu_machine_config_s *config
 
				 	return (int)topology->workers_bindid[i];
			
 
				 }
			
 
				 
			
 
				-void _starpu_bind_thread_on_cpu(struct starpu_machine_config_s *config __attribute__((unused)), unsigned cpuid)
			
 
				+void _starpu_bind_thread_on_cpu(struct starpu_machine_config_s *config STARPU_ATTRIBUTE_UNUSED, unsigned cpuid)
			
 
				 {
			
 
				 #ifdef STARPU_HAVE_HWLOC
			
 
				 	int ret;
			
@@ -649,7 +652,7 @@ static void _starpu_init_workers_binding(struct starpu_machine_config_s *config)
 
				 
			
 
				 	/* note that even if the CPU cpu are not used, we always have a RAM node */
			
 
				 	/* TODO : support NUMA  ;) */
			
 
				-	ram_memory_node = _starpu_register_memory_node(STARPU_CPU_RAM);
			
 
				+	ram_memory_node = _starpu_register_memory_node(STARPU_CPU_RAM, -1);
			
 
				 
			
 
				 	/* We will store all the busid of the different (src, dst) combinations
			
 
				 	 * in a matrix which we initialize here. */
			
@@ -688,7 +691,7 @@ static void _starpu_init_workers_binding(struct starpu_machine_config_s *config)
 
				 					npreferred = config->topology.nhwcpus;
			
 
				 				}
			
 
				 				is_a_set_of_accelerators = 0;
			
 
				-				memory_node = _starpu_register_memory_node(STARPU_CUDA_RAM);
			
 
				+				memory_node = _starpu_register_memory_node(STARPU_CUDA_RAM, workerarg->devid);
			
 
				 
			
 
				 				_starpu_register_bus(0, memory_node);
			
 
				 				_starpu_register_bus(memory_node, 0);
			
@@ -704,7 +707,7 @@ static void _starpu_init_workers_binding(struct starpu_machine_config_s *config)
 
				 					npreferred = config->topology.nhwcpus;
			
 
				 				}
			
 
				 				is_a_set_of_accelerators = 0;
			
 
				-				memory_node = _starpu_register_memory_node(STARPU_OPENCL_RAM);
			
 
				+				memory_node = _starpu_register_memory_node(STARPU_OPENCL_RAM, workerarg->devid);
			
 
				 				_starpu_register_bus(0, memory_node);
			
 
				 				_starpu_register_bus(memory_node, 0);
			
 
				 				break;
			
@@ -774,6 +777,16 @@ void _starpu_destroy_topology(struct starpu_machine_config_s *config __attribute
 
				 	/* cleanup StarPU internal data structures */
			
 
				 	_starpu_deinit_memory_nodes();
			
 
				 
			
 
				+	unsigned worker;
			
 
				+	for (worker = 0; worker < config->topology.nworkers; worker++)
			
 
				+	{
			
 
				+#ifdef STARPU_HAVE_HWLOC
			
 
				+		struct starpu_worker_s *workerarg = &config->workers[worker];
			
 
				+		hwloc_bitmap_free(workerarg->initial_hwloc_cpu_set);
			
 
				+		hwloc_bitmap_free(workerarg->current_hwloc_cpu_set);
			
 
				+#endif
			
 
				+	}
			
 
				+
			
 
				 #ifdef STARPU_HAVE_HWLOC
			
 
				 	hwloc_topology_destroy(config->topology.hwtopology);
			
 
				 #endif
			
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -1,8 +1,8 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				- * Copyright (C) 2010  Institut National de Recherche en Informatique et Automatique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Institut National de Recherche en Informatique et Automatique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -155,7 +155,7 @@ static void _starpu_launch_drivers(struct starpu_machine_config_s *config)
 
				 
			
 
				 		workerarg->worker_size = 1;
			
 
				 		workerarg->combined_workerid = workerarg->workerid;
			
 
				-		workerarg->current_rank = 1;
			
 
				+		workerarg->current_rank = 0;
			
 
				 
			
 
				 		/* mutex + cond only for the local list */
			
 
				 		/* we have a single local list */
			
@@ -175,7 +175,7 @@ static void _starpu_launch_drivers(struct starpu_machine_config_s *config)
 
				 	
			
 
				 		workerarg->status = STATUS_INITIALIZING;
			
 
				 
			
 
				-		_STARPU_DEBUG("initialising worker %d\n", worker);
			
 
				+		_STARPU_DEBUG("initialising worker %u\n", worker);
			
 
				 
			
 
				 		_starpu_init_worker_queue(workerarg);
			
 
				 
			
@@ -297,8 +297,10 @@ int starpu_conf_init(struct starpu_conf *conf)
 
				 	conf->use_explicit_workers_cuda_gpuid = 0; /* TODO */
			
 
				 	conf->use_explicit_workers_opencl_gpuid = 0; /* TODO */
			
 
				 
			
 
				+	conf->single_combined_worker = starpu_get_env_number("STARPU_SINGLE_COMBINED_WORKER");
			
 
				+
			
 
				 	return 0;
			
 
				-};
			
 
				+}
			
 
				 
			
 
				 int starpu_init(struct starpu_conf *user_conf)
			
 
				 {
			
@@ -331,6 +333,8 @@ int starpu_init(struct starpu_conf *user_conf)
 
				 	
			
 
				 	_starpu_open_debug_logfile();
			
 
				 
			
 
				+	_starpu_data_interface_init();
			
 
				+
			
 
				 	_starpu_timing_init();
			
 
				 
			
 
				 	_starpu_profiling_init();
			
@@ -364,8 +368,6 @@ int starpu_init(struct starpu_conf *user_conf)
 
				 	else
			
 
				 	  _starpu_create_sched_ctx(user_conf->sched_policy_name, NULL, -1, 1, "init");
			
 
				 
			
 
				-	//_starpu_init_sched_policy(&config, &sched_ctx);
			
 
				-
			
 
				 	_starpu_initialize_registered_performance_models();
			
 
				 
			
 
				 	/* Launch "basic" workers (ie. non-combined workers) */
			
@@ -386,7 +388,7 @@ int starpu_init(struct starpu_conf *user_conf)
 
				 
			
 
				 static void _starpu_terminate_workers(struct starpu_machine_config_s *config)
			
 
				 {
			
 
				-	int status __attribute__((unused));
			
 
				+	int status STARPU_ATTRIBUTE_UNUSED;
			
 
				 	unsigned workerid;
			
 
				 
			
 
				 	for (workerid = 0; workerid < config->topology.nworkers; workerid++)
			
@@ -426,7 +428,7 @@ static void _starpu_terminate_workers(struct starpu_machine_config_s *config)
 
				 #endif
			
 
				 			}
			
 
				 		}
			
 
				-		//		worker->status = STATUS_JOINED;
			
 
				+
			
 
				 		STARPU_ASSERT(starpu_task_list_empty(&worker->local_tasks));
			
 
				 		starpu_job_list_delete(worker->terminated_jobs);
			
 
				 	}
			
@@ -437,7 +439,7 @@ unsigned _starpu_machine_is_running(void)
 
				 	return config.running;
			
 
				 }
			
 
				 
			
 
				-unsigned _starpu_worker_can_block(unsigned memnode __attribute__((unused)))
			
 
				+unsigned _starpu_worker_can_block(unsigned memnode STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 #ifdef STARPU_NON_BLOCKING_DRIVERS
			
 
				 	return 0;
			
@@ -507,6 +509,8 @@ void starpu_shutdown(void)
 
				 	_starpu_stop_fxt_profiling();
			
 
				 #endif
			
 
				 
			
 
				+	_starpu_data_interface_shutdown();
			
 
				+
			
 
				 	_starpu_close_debug_logfile();
			
 
				 
			
 
				 	PTHREAD_MUTEX_LOCK(&init_mutex);
			
@@ -521,6 +525,27 @@ unsigned starpu_worker_get_count(void)
 
				 	return config.topology.nworkers;
			
 
				 }
			
 
				 
			
 
				+/* Return the number of workers of the given type as stored in the global
			
 
				+ * configuration's topology, or -EINVAL when the type is none of the four
			
 
				+ * kinds handled below. */
			
 
				+int starpu_worker_get_count_by_type(enum starpu_archtype type)
			
 
				+{
			
 
				+	int count;
			
 
				+
			
 
				+	switch (type)
			
 
				+	{
			
 
				+		case STARPU_CPU_WORKER:
			
 
				+			count = config.topology.ncpus;
			
 
				+			break;
			
 
				+
			
 
				+		case STARPU_CUDA_WORKER:
			
 
				+			count = config.topology.ncudagpus;
			
 
				+			break;
			
 
				+
			
 
				+		case STARPU_OPENCL_WORKER:
			
 
				+			count = config.topology.nopenclgpus;
			
 
				+			break;
			
 
				+
			
 
				+		case STARPU_GORDON_WORKER:
			
 
				+			count = config.topology.ngordon_spus;
			
 
				+			break;
			
 
				+
			
 
				+		default:
			
 
				+			/* Unknown worker type */
			
 
				+			count = -EINVAL;
			
 
				+			break;
			
 
				+	}
			
 
				+
			
 
				+	return count;
			
 
				+}
			
 
				+
			
 
				 unsigned starpu_combined_worker_get_count(void)
			
 
				 {
			
 
				 	return config.topology.ncombinedworkers;
			
@@ -644,6 +669,28 @@ enum starpu_archtype starpu_worker_get_type(int id)
 
				 	return config.workers[id].arch;
			
 
				 }
			
 
				 
			
 
				+/* Write into workerids the identifiers of the workers whose type equals
			
 
				+ * "type" and return how many were written. The array holds maxsize entries:
			
 
				+ * if one more matching worker is found once it is full, -ERANGE is returned
			
 
				+ * instead of a count. */
			
 
				+int starpu_worker_get_ids_by_type(enum starpu_archtype type, int *workerids, int maxsize)
			
 
				+{
			
 
				+	const unsigned total = starpu_worker_get_count();
			
 
				+	int nfound = 0;
			
 
				+	unsigned w;
			
 
				+
			
 
				+	for (w = 0; w < total; w++)
			
 
				+	{
			
 
				+		if (starpu_worker_get_type(w) != type)
			
 
				+			continue;
			
 
				+
			
 
				+		/* No room left for this match: the caller's array is too small */
			
 
				+		if (nfound >= maxsize)
			
 
				+			return -ERANGE;
			
 
				+
			
 
				+		workerids[nfound] = w;
			
 
				+		nfound++;
			
 
				+	}
			
 
				+
			
 
				+	return nfound;
			
 
				+}
			
 
				+
			
 
				 void starpu_worker_get_name(int id, char *dst, size_t maxlen)
			
 
				 {
			
 
				 	char *name = config.workers[id].name;
			
--- a/src/core/workers.h
+++ b/src/core/workers.h
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -76,7 +76,7 @@ struct starpu_worker_s {
 
				 	unsigned worker_is_running;
			
 
				 	unsigned worker_is_initialized;
			
 
				 	starpu_worker_status status; /* what is the worker doing now ? (eg. CALLBACK) */
			
 
				-	char name[32];
			
 
				+	char name[48];
			
 
				 
			
 
				 	struct starpu_sched_ctx **sched_ctx;
			
 
				 	unsigned nctxs; /* the no of contexts a worker belongs to*/
			
--- a/src/datawizard/coherency.c
+++ b/src/datawizard/coherency.c
@@ -22,29 +22,6 @@
 
				 #include <core/dependencies/data_concurrency.h>
			
 
				 #include <profiling/profiling.h>
			
 
				 
			
 
				-uint32_t _starpu_select_node_to_handle_request(uint32_t src_node, uint32_t dst_node) 
			
 
				-{
			
 
				-	/* in case one of the node is a GPU, it needs to perform the transfer,
			
 
				-	 * if both of them are GPU, it's a bit more complicated */
			
 
				-
			
 
				-	unsigned src_is_a_gpu = (_starpu_get_node_kind(src_node) == STARPU_CUDA_RAM || _starpu_get_node_kind(src_node) == STARPU_OPENCL_RAM);
			
 
				-	unsigned dst_is_a_gpu = (_starpu_get_node_kind(dst_node) == STARPU_CUDA_RAM || _starpu_get_node_kind(dst_node) == STARPU_OPENCL_RAM);
			
 
				-
			
 
				-	/* we do not handle GPU->GPU transfers yet ! */
			
 
				-	STARPU_ASSERT( !(src_is_a_gpu && dst_is_a_gpu) );
			
 
				-
			
 
				-	if (src_is_a_gpu)
			
 
				-		return src_node;
			
 
				-
			
 
				-	if (dst_is_a_gpu)
			
 
				-		return dst_node;
			
 
				-
			
 
				-	/* otherwise perform it locally, since we should be on a "sane" arch
			
 
				-	 * where anyone can do the transfer. NB: in StarPU this should actually never
			
 
				-	 * happen */
			
 
				-	return _starpu_get_local_memory_node();
			
 
				-}
			
 
				-
			
 
				 uint32_t _starpu_select_src_node(starpu_data_handle handle)
			
 
				 {
			
 
				 	unsigned src_node = 0;
			
@@ -77,11 +54,10 @@ uint32_t _starpu_select_src_node(starpu_data_handle handle)
 
				 
			
 
				 			/* however GPU are expensive sources, really !
			
 
				 			 * 	other should be ok */
			
 
				-			if (_starpu_get_node_kind(i) != STARPU_CUDA_RAM)
			
 
				-				break;
			
 
				-			if (_starpu_get_node_kind(i) != STARPU_OPENCL_RAM)
			
 
				-				break;
			
 
				-
			
 
				+		 
			
 
				+			if (_starpu_get_node_kind(i) != STARPU_CUDA_RAM && _starpu_get_node_kind(i) != STARPU_OPENCL_RAM)	
			
 
				+				break ;
			
 
				+		 
			
 
				 			/* XXX do a better algorithm to distribute the memory copies */
			
 
				 			/* TODO : use the "requesting_node" as an argument to do so */
			
 
				 		}
			
@@ -103,7 +79,8 @@ void _starpu_update_data_state(starpu_data_handle handle,
 
				 	unsigned nnodes = _starpu_get_memory_nodes_count();
			
 
				 
			
 
				 	/* the data is present now */
			
 
				-	requesting_replicate->requested = 0;
			
 
				+	unsigned requesting_node = requesting_replicate->memory_node;
			
 
				+	requesting_replicate->requested[requesting_node] = 0;
			
 
				 
			
 
				 	if (mode & STARPU_W) {
			
 
				 		/* the requesting node now has the only valid copy */
			
@@ -129,6 +106,149 @@ void _starpu_update_data_state(starpu_data_handle handle,
 
				 	}
			
 
				 }
			
 
				 
			
 
				+/* Tell whether "handling_node" may directly access the memory of "node".
			
 
				+ * Returns 1 if so, 0 otherwise. */
			
 
				+static int worker_supports_direct_access(unsigned node, unsigned handling_node)
			
 
				+{
			
 
				+	/* Same node on both sides: nothing remote is involved */
			
 
				+	if (node == handling_node)
			
 
				+		return 1;
			
 
				+
			
 
				+	int kind = _starpu_get_node_kind(node);
			
 
				+
			
 
				+	/* OpenCL memory is never accessed directly by another node */
			
 
				+	if (kind == STARPU_OPENCL_RAM)
			
 
				+		return 0;
			
 
				+
			
 
				+	if (kind == STARPU_CUDA_RAM)
			
 
				+	{
			
 
				+#ifdef HAVE_CUDA_MEMCPY_PEER
			
 
				+		/* With peer copies available, any handler but an OpenCL node
			
 
				+		 * may access CUDA memory directly. */
			
 
				+		return (_starpu_get_node_kind(handling_node) != STARPU_OPENCL_RAM);
			
 
				+#else
			
 
				+		/* Without peer copies, CUDA memory is not directly reachable */
			
 
				+		return 0;
			
 
				+#endif
			
 
				+	}
			
 
				+
			
 
				+	/* Every other kind of node is directly accessible */
			
 
				+	return 1;
			
 
				+}
			
 
				+
			
 
				+/* Tell whether data of "handle" can go from src_node to dst_node in a single
			
 
				+ * transfer. On success, returns 1 and stores in *handling_node the node that
			
 
				+ * performs the transfer (the destination is tried first, then the source).
			
 
				+ * Returns 0, leaving *handling_node untouched, when no direct transfer is
			
 
				+ * possible. */
			
 
				+static int link_supports_direct_transfers(starpu_data_handle handle, unsigned src_node, unsigned dst_node, unsigned *handling_node)
			
 
				+{
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	/* XXX Hack until cudaMemcpy3DPeerAsync works: a transfer between two
			
 
				+	 * distinct CUDA nodes is refused when the data interface provides no
			
 
				+	 * cuda_to_cuda_async copy method. */
			
 
				+	int cuda_peer_link = (src_node != dst_node)
			
 
				+		&& (_starpu_get_node_kind(src_node) == STARPU_CUDA_RAM)
			
 
				+		&& (_starpu_get_node_kind(dst_node) == STARPU_CUDA_RAM);
			
 
				+
			
 
				+	if (cuda_peer_link && !handle->ops->copy_methods->cuda_to_cuda_async)
			
 
				+		return 0;
			
 
				+#endif
			
 
				+
			
 
				+	if (worker_supports_direct_access(src_node, dst_node))
			
 
				+		*handling_node = dst_node;
			
 
				+	else if (worker_supports_direct_access(dst_node, src_node))
			
 
				+		*handling_node = src_node;
			
 
				+	else
			
 
				+		return 0;
			
 
				+
			
 
				+	return 1;
			
 
				+}
			
 
				+
			
 
				+/* Compute the sequence of hops needed to bring the data of "handle" from
			
 
				+ * src_node to dst_node. Hop i is described by src_nodes[i], dst_nodes[i] and
			
 
				+ * handling_nodes[i] (the node performing that hop). max_len is the capacity
			
 
				+ * of these three arrays; the number of hops written is returned:
			
 
				+ *  - 1 when the access does not read (only dst_node is involved),
			
 
				+ *  - 1 when a direct transfer between the two nodes is possible,
			
 
				+ *  - 2 when the data has to be staged through node 0. */
			
 
				+static int determine_request_path(starpu_data_handle handle,
			
 
				+				unsigned src_node, unsigned dst_node,
			
 
				+				starpu_access_mode mode, int max_len,
			
 
				+				unsigned *src_nodes, unsigned *dst_nodes,
			
 
				+				unsigned *handling_nodes)
			
 
				+{
			
 
				+	unsigned direct_handler;
			
 
				+
			
 
				+	if (!(mode & STARPU_R))
			
 
				+	{
			
 
				+		/* Nothing is read: the destination only has to allocate the
			
 
				+		 * data, so no transfer is required. */
			
 
				+		STARPU_ASSERT(max_len >= 1);
			
 
				+		src_nodes[0] = 0; /* ignored */
			
 
				+		dst_nodes[0] = dst_node;
			
 
				+		handling_nodes[0] = dst_node;
			
 
				+		return 1;
			
 
				+	}
			
 
				+
			
 
				+	if (link_supports_direct_transfers(handle, src_node, dst_node, &direct_handler))
			
 
				+	{
			
 
				+		/* A single hop is enough */
			
 
				+		STARPU_ASSERT(max_len >= 1);
			
 
				+
			
 
				+		src_nodes[0] = src_node;
			
 
				+		dst_nodes[0] = dst_node;
			
 
				+		handling_nodes[0] = direct_handler;
			
 
				+
			
 
				+#ifndef HAVE_CUDA_MEMCPY_PEER
			
 
				+		STARPU_ASSERT(!(mode & STARPU_R) || _starpu_get_node_kind(src_node) != STARPU_CUDA_RAM || _starpu_get_node_kind(dst_node) != STARPU_CUDA_RAM);
			
 
				+#endif
			
 
				+
			
 
				+		return 1;
			
 
				+	}
			
 
				+
			
 
				+	/* No direct link: stage the data through main memory with an
			
 
				+	 * intermediate hop. */
			
 
				+	STARPU_ASSERT(max_len >= 2);
			
 
				+
			
 
				+	/* XXX 0 is hardcoded as the RAM node ... */
			
 
				+
			
 
				+	/* First hop: source -> RAM, handled by the source */
			
 
				+	src_nodes[0] = src_node;
			
 
				+	dst_nodes[0] = 0;
			
 
				+	handling_nodes[0] = src_node;
			
 
				+
			
 
				+	/* Second hop: RAM -> destination, handled by the destination */
			
 
				+	src_nodes[1] = 0;
			
 
				+	dst_nodes[1] = dst_node;
			
 
				+	handling_nodes[1] = dst_node;
			
 
				+
			
 
				+	return 2;
			
 
				+}
			
 
				+
			
 
				+/* Look up the request already recorded in replicate->request[node], where
			
 
				+ * "node" is either the source of the request or, for a write-only request,
			
 
				+ * its destination. handle->lock should be taken by the caller.
			
 
				+ *
			
 
				+ * When a request is found it is returned with its lock held, after its mode
			
 
				+ * has been extended with the read/write bits of "mode". Otherwise NULL is
			
 
				+ * returned. */
			
 
				+static starpu_data_request_t _starpu_search_existing_data_request(struct starpu_data_replicate_s *replicate, unsigned node, starpu_access_mode mode)
			
 
				+{
			
 
				+	starpu_data_request_t pending = replicate->request[node];
			
 
				+
			
 
				+	/* No request recorded for that node */
			
 
				+	if (!pending)
			
 
				+		return pending;
			
 
				+
			
 
				+	_starpu_spin_lock(&pending->lock);
			
 
				+
			
 
				+	/* The existing request may need to be "upgraded" */
			
 
				+	if (mode & STARPU_R)
			
 
				+	{
			
 
				+		/* If the existing request did not imply a memory transfer
			
 
				+		 * yet, take a reference now so that the source remains
			
 
				+		 * valid. */
			
 
				+		if (!(pending->mode & STARPU_R))
			
 
				+			replicate->refcnt++;
			
 
				+
			
 
				+		pending->mode |= STARPU_R;
			
 
				+	}
			
 
				+
			
 
				+	if (mode & STARPU_W)
			
 
				+		pending->mode |= STARPU_W;
			
 
				+
			
 
				+	return pending;
			
 
				+}
			
 
				+
			
 
				+
			
 
				 
			
 
				 /*
			
 
				  * This function is called when the data is needed on the local node, this
			
@@ -151,14 +271,33 @@ void _starpu_update_data_state(starpu_data_handle handle,
 
				  */
			
 
				 
			
 
				 /* This function is called with handle's header lock taken */
			
 
				-static starpu_data_request_t create_new_request_to_fetch_data(starpu_data_handle handle,
			
 
				+starpu_data_request_t create_request_to_fetch_data(starpu_data_handle handle,
			
 
				 				struct starpu_data_replicate_s *dst_replicate,
			
 
				                                 starpu_access_mode mode, unsigned is_prefetch,
			
 
				                                 void (*callback_func)(void *), void *callback_arg)
			
 
				 {
			
 
				-	starpu_data_request_t r;
			
 
				 	unsigned requesting_node = dst_replicate->memory_node;
			
 
				 
			
 
				+	if (dst_replicate->state != STARPU_INVALID)
			
 
				+	{
			
 
				+		/* the data is already available so we can stop */
			
 
				+		_starpu_update_data_state(handle, dst_replicate, mode);
			
 
				+		_starpu_msi_cache_hit(requesting_node);
			
 
				+
			
 
				+		_starpu_spin_unlock(&handle->header_lock);
			
 
				+
			
 
				+		if (callback_func)
			
 
				+			callback_func(callback_arg);
			
 
				+
			
 
				+                _STARPU_LOG_OUT_TAG("data available");
			
 
				+		return NULL;
			
 
				+	}
			
 
				+
			
 
				+	_starpu_msi_cache_miss(requesting_node);
			
 
				+
			
 
				+	/* the only remaining situation is that the local copy was invalid */
			
 
				+	STARPU_ASSERT(dst_replicate->state == STARPU_INVALID);
			
 
				+
			
 
				 	/* find someone who already has the data */
			
 
				 	uint32_t src_node = 0;
			
 
				 
			
@@ -169,74 +308,82 @@ static starpu_data_request_t create_new_request_to_fetch_data(starpu_data_handle
 
				 		STARPU_ASSERT(src_node != requesting_node);
			
 
				 	}
			
 
				 
			
 
				-	unsigned src_is_a_gpu = (_starpu_get_node_kind(src_node) == STARPU_CUDA_RAM || _starpu_get_node_kind(src_node) == STARPU_OPENCL_RAM);
			
 
				-	unsigned dst_is_a_gpu = (_starpu_get_node_kind(requesting_node) == STARPU_CUDA_RAM || _starpu_get_node_kind(requesting_node) == STARPU_OPENCL_RAM);
			
 
				-
			
 
				-	struct starpu_data_replicate_s *src_replicate = &handle->per_node[src_node];
			
 
				+	/* We can safely assume that there won't be more than 2 hops in the
			
 
				+	 * current implementation */
			
 
				+	unsigned src_nodes[4], dst_nodes[4], handling_nodes[4];
			
 
				+	int nhops = determine_request_path(handle, src_node, requesting_node, mode, 4,
			
 
				+					src_nodes, dst_nodes, handling_nodes);
			
 
				+	STARPU_ASSERT(nhops <= 4);
			
 
				 
			
 
				-	/* we have to perform 2 successive requests for GPU->GPU transfers */
			
 
				-	if ((mode & STARPU_R) && (src_is_a_gpu && dst_is_a_gpu)) {
			
 
				-		unsigned reuse_r_src_to_ram;
			
 
				-		starpu_data_request_t r_src_to_ram;
			
 
				-		starpu_data_request_t r_ram_to_dst;
			
 
				+	starpu_data_request_t requests[nhops];
			
 
				 
			
 
				-		struct starpu_data_replicate_s *ram_replicate = &handle->per_node[0];
			
 
				+	/* Did we reuse a request for that hop ? */
			
 
				+	int reused_requests[nhops];
			
 
				 
			
 
				-		/* XXX we hardcore 0 as the RAM node ... */
			
 
				-		/* We put a 1 in the number of dependencies because this
			
 
				-		 * depends on the r_src_to_ram request. */
			
 
				-		r_ram_to_dst = _starpu_create_data_request(handle, ram_replicate,
			
 
				-					dst_replicate, requesting_node, mode, 1, is_prefetch);
			
 
				+	/* Construct an array with a list of requests, possibly reusing existing requests */
			
 
				+	int hop;
			
 
				+	for (hop = 0; hop < nhops; hop++)
			
 
				+	{
			
 
				+		starpu_data_request_t r;
			
 
				 
			
 
				-		if (!is_prefetch)
			
 
				-			r_ram_to_dst->refcnt++;
			
 
				+		unsigned hop_src_node = src_nodes[hop];
			
 
				+		unsigned hop_dst_node = dst_nodes[hop];
			
 
				+		unsigned hop_handling_node = handling_nodes[hop];
			
 
				 
			
 
				-		r_src_to_ram = _starpu_search_existing_data_request(ram_replicate, mode);
			
 
				+		struct starpu_data_replicate_s *hop_src_replicate;
			
 
				+		struct starpu_data_replicate_s *hop_dst_replicate;
			
 
				 
			
 
				-		reuse_r_src_to_ram = r_src_to_ram?1:0;
			
 
				+		/* Only the first request is independant */
			
 
				+		unsigned ndeps = (hop == 0)?0:1;
			
 
				 
			
 
				-		if (!r_src_to_ram)
			
 
				-		{
			
 
				-			r_src_to_ram = _starpu_create_data_request(handle, src_replicate,
			
 
				-						ram_replicate, src_node, mode, 0, is_prefetch);
			
 
				-		}
			
 
				+		hop_src_replicate = &handle->per_node[hop_src_node];
			
 
				+		hop_dst_replicate = (hop != nhops - 1)?&handle->per_node[hop_dst_node]:dst_replicate;
			
 
				 
			
 
				-		/* we chain both requests */
			
 
				-		r_src_to_ram->next_req[r_src_to_ram->next_req_count++]= r_ram_to_dst;
			
 
				+		/* Try to reuse a request if possible */
			
 
				+		r = _starpu_search_existing_data_request(hop_dst_replicate,
			
 
				+				(mode & STARPU_R)?hop_src_node:hop_dst_node, mode);
			
 
				 
			
 
				-		_starpu_data_request_append_callback(r_ram_to_dst, callback_func, callback_arg);
			
 
				+		reused_requests[hop] = !!r;
			
 
				 
			
 
				-		if (reuse_r_src_to_ram)
			
 
				-			_starpu_spin_unlock(&r_src_to_ram->lock);
			
 
				+		if (!r) {
			
 
				+			/* Create a new request if there was no request to reuse */
			
 
				+			r = _starpu_create_data_request(handle, hop_src_replicate,
			
 
				+					hop_dst_replicate, hop_handling_node,
			
 
				+					mode, ndeps);
			
 
				+		}
			
 
				 
			
 
				-		_starpu_spin_unlock(&handle->header_lock);
			
 
				+		requests[hop] = r; 
			
 
				+	}
			
 
				 
			
 
				-		/* we only submit the first request, the remaining will be automatically submitted afterward */
			
 
				-		if (!reuse_r_src_to_ram)
			
 
				-			_starpu_post_data_request(r_src_to_ram, src_node);
			
 
				+	/* Chain these requests */
			
 
				+	for (hop = 0; hop < nhops; hop++)
			
 
				+	{
			
 
				+		starpu_data_request_t r;
			
 
				+		r = requests[hop];
			
 
				 
			
 
				-		/* the application only waits for the termination of the last request */
			
 
				-		r = r_ram_to_dst;
			
 
				-	}
			
 
				-	else {
			
 
				-		/* who will perform that request ? */
			
 
				-		uint32_t handling_node =
			
 
				-			_starpu_select_node_to_handle_request(src_node, requesting_node);
			
 
				+		if (hop != nhops - 1)
			
 
				+		{
			
 
				+			if (!reused_requests[hop + 1])
			
 
				+				r->next_req[r->next_req_count++] = requests[hop + 1];
			
 
				+		}
			
 
				+		else
			
 
				+			_starpu_data_request_append_callback(r, callback_func, callback_arg);
			
 
				 
			
 
				-		r = _starpu_create_data_request(handle, src_replicate,
			
 
				-				dst_replicate, handling_node, mode, 0, is_prefetch);
			
 
				 
			
 
				-		_starpu_data_request_append_callback(r, callback_func, callback_arg);
			
 
				+		if (reused_requests[hop])
			
 
				+			_starpu_spin_unlock(&r->lock);
			
 
				+	}
			
 
				 
			
 
				-		if (!is_prefetch)
			
 
				-			r->refcnt++;
			
 
				+	if (!is_prefetch)
			
 
				+		requests[nhops - 1]->refcnt++;
			
 
				 
			
 
				-		_starpu_spin_unlock(&handle->header_lock);
			
 
				 
			
 
				-		_starpu_post_data_request(r, handling_node);
			
 
				-	}
			
 
				+	/* we only submit the first request, the remaining will be
			
 
				+	 * automatically submitted afterward */
			
 
				+	if (!reused_requests[0])
			
 
				+		_starpu_post_data_request(requests[0], handling_nodes[0]);
			
 
				 
			
 
				-	return r;
			
 
				+	return requests[nhops - 1];
			
 
				 }
			
 
				 
			
 
				 int _starpu_fetch_data_on_node(starpu_data_handle handle, struct starpu_data_replicate_s *dst_replicate,
			
@@ -246,71 +393,23 @@ int _starpu_fetch_data_on_node(starpu_data_handle handle, struct starpu_data_rep
 
				 	uint32_t local_node = _starpu_get_local_memory_node();
			
 
				         _STARPU_LOG_IN();
			
 
				 
			
 
				-	unsigned requesting_node = dst_replicate->memory_node;
			
 
				-
			
 
				 	while (_starpu_spin_trylock(&handle->header_lock))
			
 
				 		_starpu_datawizard_progress(local_node, 1);
			
 
				 
			
 
				 	if (!is_prefetch)
			
 
				 		dst_replicate->refcnt++;
			
 
				 
			
 
				-	if (dst_replicate->state != STARPU_INVALID)
			
 
				-	{
			
 
				-		/* the data is already available so we can stop */
			
 
				-		_starpu_update_data_state(handle, dst_replicate, mode);
			
 
				-		_starpu_msi_cache_hit(requesting_node);
			
 
				-		_starpu_spin_unlock(&handle->header_lock);
			
 
				-
			
 
				-		if (callback_func)
			
 
				-			callback_func(callback_arg);
			
 
				-
			
 
				-                _STARPU_LOG_OUT_TAG("data available");
			
 
				-		return 0;
			
 
				-	}
			
 
				-
			
 
				-	/* the only remaining situation is that the local copy was invalid */
			
 
				-	STARPU_ASSERT(dst_replicate->state == STARPU_INVALID);
			
 
				-
			
 
				-	_starpu_msi_cache_miss(requesting_node);
			
 
				-
			
 
				 	starpu_data_request_t r;
			
 
				+	r = create_request_to_fetch_data(handle, dst_replicate, mode,
			
 
				+					is_prefetch, callback_func, callback_arg);
			
 
				 
			
 
				-	/* is there already a pending request ? */
			
 
				-	r = _starpu_search_existing_data_request(dst_replicate, mode);
			
 
				-	/* at the exit of _starpu_search_existing_data_request the lock is taken if the request existed ! */
			
 
				-
			
 
				-	if (!r) {
			
 
				-		r = create_new_request_to_fetch_data(handle, dst_replicate, mode, is_prefetch, callback_func, callback_arg);
			
 
				-	}
			
 
				-	else {
			
 
				-		/* the lock was taken by _starpu_search_existing_data_request */
			
 
				-		_starpu_data_request_append_callback(r, callback_func, callback_arg);
			
 
				-
			
 
				-		/* there is already a similar request */
			
 
				-		if (is_prefetch)
			
 
				-		{
			
 
				-			_starpu_spin_unlock(&r->lock);
			
 
				-			_starpu_spin_unlock(&handle->header_lock);
			
 
				-
			
 
				-                        _STARPU_LOG_OUT_TAG("similar request");
			
 
				-                        return 0;
			
 
				-		}
			
 
				-
			
 
				-		r->refcnt++;
			
 
				-
			
 
				-		//_starpu_spin_lock(&r->lock);
			
 
				-		if (r->is_a_prefetch_request)
			
 
				-		{
			
 
				-			/* transform that prefetch request into a "normal" request */
			
 
				-			r->is_a_prefetch_request = 0;
			
 
				-
			
 
				-			/* transform that request into the proper access mode (prefetch could be read only) */
			
 
				-			r->mode |= mode;
			
 
				-		}
			
 
				-
			
 
				-		_starpu_spin_unlock(&r->lock);
			
 
				-		_starpu_spin_unlock(&handle->header_lock);
			
 
				-	}
			
 
				+	/* If no request was created, the handle was already up-to-date on the
			
 
				+	 * node. In this case, create_request_to_fetch_data has already
			
 
				+	 * unlocked the header. */
			
 
				+	if (!r)
			
 
				+		return 0;
			
 
				+	
			
 
				+	_starpu_spin_unlock(&handle->header_lock);
			
 
				 
			
 
				 	int ret = is_prefetch?0:_starpu_wait_data_request_completion(r, 1);
			
 
				         _STARPU_LOG_OUT();
			
@@ -384,7 +483,10 @@ static void _starpu_set_data_requested_flag_if_needed(struct starpu_data_replica
 
				 //	pthread_spin_lock(&handle->header_lock);
			
 
				 
			
 
				 	if (replicate->state == STARPU_INVALID) 
			
 
				-		replicate->requested = 1;
			
 
				+	{
			
 
				+		unsigned dst_node = replicate->memory_node;
			
 
				+		replicate->requested[dst_node] = 1;
			
 
				+	}
			
 
				 
			
 
				 //	pthread_spin_unlock(&handle->header_lock);
			
 
				 }
			
@@ -449,7 +551,7 @@ int _starpu_fetch_task_input(struct starpu_task *task, uint32_t mask)
 
				 		if (STARPU_UNLIKELY(ret))
			
 
				 			goto enomem;
			
 
				 
			
 
				-		task->interface[index] = local_replicate->data_interface;
			
 
				+		task->interfaces[index] = local_replicate->data_interface;
			
 
				 
			
 
				 		if (mode & STARPU_REDUX)
			
 
				 		{
			
@@ -531,9 +633,21 @@ unsigned _starpu_is_data_present_or_requested(starpu_data_handle handle, uint32_
 
				 // XXX : this is just a hint, so we don't take the lock ...
			
 
				 //	pthread_spin_lock(&handle->header_lock);
			
 
				 
			
 
				-	if (handle->per_node[node].state != STARPU_INVALID 
			
 
				-		|| handle->per_node[node].requested || handle->per_node[node].request)
			
 
				-		ret = 1;
			
 
				+	if (handle->per_node[node].state != STARPU_INVALID)
			
 
				+	{
			
 
				+		ret  = 1;
			
 
				+	}
			
 
				+	else {
			
 
				+		unsigned i;
			
 
				+		unsigned nnodes = _starpu_get_memory_nodes_count();
			
 
				+
			
 
				+		for (i = 0; i < nnodes; i++)
			
 
				+		{
			
 
				+			if (handle->per_node[node].requested[i] || handle->per_node[node].request[i])
			
 
				+				ret = 1;
			
 
				+		}
			
 
				+
			
 
				+	}
			
 
				 
			
 
				 //	pthread_spin_unlock(&handle->header_lock);
			
 
				 
			
--- a/src/datawizard/coherency.h
+++ b/src/datawizard/coherency.h
@@ -74,8 +74,8 @@ LIST_TYPE(starpu_data_replicate,
 
				 	   flag when it assigns a task to a queue, policies which do not
			
 
				 	   use this hint can simply ignore it.
			
 
				 	 */
			
 
				-	uint8_t requested;
			
 
				-	struct starpu_data_request_s *request;
			
 
				+	uint8_t requested[STARPU_MAXNODES];
			
 
				+	struct starpu_data_request_s *request[STARPU_MAXNODES];
			
 
				 );
			
 
				 
			
 
				 struct starpu_data_requester_list_s;
			
@@ -189,6 +189,7 @@ struct starpu_data_state_t {
 
				 
			
 
				         /* Used for MPI */
			
 
				         int rank;
			
 
				+	int tag;
			
 
				 };
			
 
				 
			
 
				 void _starpu_display_msi_stats(void);
			
@@ -218,9 +219,13 @@ unsigned _starpu_is_data_present_or_requested(struct starpu_data_state_t *state,
 
				 unsigned starpu_data_test_if_allocated_on_node(starpu_data_handle handle, uint32_t memory_node);
			
 
				 
			
 
				 
			
 
				-uint32_t _starpu_select_node_to_handle_request(uint32_t src_node, uint32_t dst_node);
			
 
				 uint32_t _starpu_select_src_node(struct starpu_data_state_t *state);
			
 
				 
			
 
				+starpu_data_request_t create_request_to_fetch_data(starpu_data_handle handle,
			
 
				+				struct starpu_data_replicate_s *dst_replicate,
			
 
				+                                starpu_access_mode mode, unsigned is_prefetch,
			
 
				+                                void (*callback_func)(void *), void *callback_arg);
			
 
				+
			
 
				 void _starpu_redux_init_data_replicate(starpu_data_handle handle, struct starpu_data_replicate_s *replicate, int workerid);
			
 
				 void starpu_data_start_reduction_mode(starpu_data_handle handle);
			
 
				 void starpu_data_end_reduction_mode(starpu_data_handle handle);
			
--- a/src/datawizard/copy_driver.c
+++ b/src/datawizard/copy_driver.c
@@ -82,7 +82,7 @@ void starpu_wake_all_blocked_workers(void)
 
				 static unsigned communication_cnt = 0;
			
 
				 #endif
			
 
				 
			
 
				-static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_data_replicate_s *src_replicate, struct starpu_data_replicate_s *dst_replicate, struct starpu_data_request_s *req __attribute__((unused)))
			
 
				+static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_data_replicate_s *src_replicate, struct starpu_data_replicate_s *dst_replicate, struct starpu_data_request_s *req STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	int ret = 0;
			
 
				 
			
@@ -108,6 +108,15 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 
				 	void *src_interface = src_replicate->data_interface;
			
 
				 	void *dst_interface = dst_replicate->data_interface;
			
 
				 
			
 
				+#if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER)
			
 
				+	if ((src_kind == STARPU_CUDA_RAM) || (dst_kind == STARPU_CUDA_RAM))
			
 
				+	{
			
 
				+		int node = (dst_kind == STARPU_CUDA_RAM)?dst_node:src_node;
			
 
				+		cures = cudaSetDevice(starpu_memory_node_to_devid(node));
			
 
				+		STARPU_ASSERT(cures == cudaSuccess);
			
 
				+	}
			
 
				+#endif
			
 
				+
			
 
				 	switch (_STARPU_MEMORY_NODE_TUPLE(src_kind,dst_kind)) {
			
 
				 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CPU_RAM):
			
 
				 		/* STARPU_CPU_RAM -> STARPU_CPU_RAM */
			
@@ -116,29 +125,22 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 
				 		break;
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CPU_RAM):
			
 
				-		/* CUBLAS_RAM -> STARPU_CPU_RAM */
			
 
				-		/* only the proper CUBLAS thread can initiate this ! */
			
 
				-		if (_starpu_get_local_memory_node() == src_node) {
			
 
				-			/* only the proper CUBLAS thread can initiate this directly ! */
			
 
				-			STARPU_ASSERT(copy_methods->cuda_to_ram);
			
 
				-			if (!req || !copy_methods->cuda_to_ram_async) {
			
 
				-				/* this is not associated to a request so it's synchronous */
			
 
				-				copy_methods->cuda_to_ram(src_interface, src_node, dst_interface, dst_node);
			
 
				-			}
			
 
				-			else {
			
 
				-				cures = cudaEventCreate(&req->async_channel.cuda_event);
			
 
				-				if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
			
 
				-
			
 
				-				stream = starpu_cuda_get_local_transfer_stream();
			
 
				-				ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
			
 
				-
			
 
				-				cures = cudaEventRecord(req->async_channel.cuda_event, stream);
			
 
				-				if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
			
 
				-			}
			
 
				+		/* only the proper CUBLAS thread can initiate this directly ! */
			
 
				+		STARPU_ASSERT(copy_methods->cuda_to_ram);
			
 
				+		if (!req || !copy_methods->cuda_to_ram_async) {
			
 
				+			/* this is not associated to a request so it's synchronous */
			
 
				+			copy_methods->cuda_to_ram(src_interface, src_node, dst_interface, dst_node);
			
 
				 		}
			
 
				 		else {
			
 
				-			/* we should not have a blocking call ! */
			
 
				-			STARPU_ABORT();
			
 
				+			req->async_channel.type = STARPU_CUDA_RAM;
			
 
				+			cures = cudaEventCreate(&req->async_channel.event.cuda_event);
			
 
				+			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+
			
 
				+			stream = starpu_cuda_get_local_transfer_stream();
			
 
				+			ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
			
 
				+
			
 
				+			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
			
 
				+			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 		}
			
 
				 		break;
			
 
				 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CUDA_RAM):
			
@@ -151,13 +153,35 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 
				 			copy_methods->ram_to_cuda(src_interface, src_node, dst_interface, dst_node);
			
 
				 		}
			
 
				 		else {
			
 
				-			cures = cudaEventCreate(&req->async_channel.cuda_event);
			
 
				-			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+			req->async_channel.type = STARPU_CUDA_RAM;
			
 
				+			cures = cudaEventCreate(&req->async_channel.event.cuda_event);
			
 
				+			if (STARPU_UNLIKELY(cures != cudaSuccess))
			
 
				+				STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 
			
 
				 			stream = starpu_cuda_get_local_stream();
			
 
				 			ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
			
 
				 
			
 
				-			cures = cudaEventRecord(req->async_channel.cuda_event, stream);
			
 
				+			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
			
 
				+			if (STARPU_UNLIKELY(cures != cudaSuccess))
			
 
				+				STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+		}
			
 
				+		break;
			
 
				+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):
			
 
				+		/* CUDA - CUDA transfer */
			
 
				+		STARPU_ASSERT(copy_methods->cuda_to_cuda || copy_methods->cuda_to_cuda_async);
			
 
				+		if (!req || !copy_methods->cuda_to_cuda_async) {
			
 
				+			/* this is not associated to a request so it's synchronous */
			
 
				+			copy_methods->cuda_to_cuda(src_interface, src_node, dst_interface, dst_node);
			
 
				+		}
			
 
				+		else {
			
 
				+			req->async_channel.type = STARPU_CUDA_RAM;
			
 
				+			cures = cudaEventCreate(&req->async_channel.event.cuda_event);
			
 
				+			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+
			
 
				+			stream = starpu_cuda_get_local_stream();
			
 
				+			ret = copy_methods->cuda_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
			
 
				+
			
 
				+			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
			
 
				 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 		}
			
 
				 		break;
			
@@ -172,7 +196,8 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 
				 				copy_methods->opencl_to_ram(src_interface, src_node, dst_interface, dst_node);
			
 
				 			}
			
 
				 			else {
			
 
				-				ret = copy_methods->opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.opencl_event));
			
 
				+				req->async_channel.type = STARPU_OPENCL_RAM;
			
 
				+				ret = copy_methods->opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
			
 
				 			}
			
 
				 		}
			
 
				 		else {
			
@@ -189,7 +214,8 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 
				 			copy_methods->ram_to_opencl(src_interface, src_node, dst_interface, dst_node);
			
 
				 		}
			
 
				 		else {
			
 
				-			ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.opencl_event));
			
 
				+			req->async_channel.type = STARPU_OPENCL_RAM;
			
 
				+			ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
			
 
				 		}
			
 
				 		break;
			
 
				 #endif
			
@@ -215,7 +241,7 @@ int __attribute__((warn_unused_result)) _starpu_driver_copy_data_1_to_1(starpu_d
 
				 	}
			
 
				 
			
 
				 	int ret_alloc, ret_copy;
			
 
				-	unsigned __attribute__((unused)) com_id = 0;
			
 
				+	unsigned STARPU_ATTRIBUTE_UNUSED com_id = 0;
			
 
				 
			
 
				 	unsigned src_node = src_replicate->memory_node;
			
 
				 	unsigned dst_node = dst_replicate->memory_node;
			
@@ -263,10 +289,9 @@ int __attribute__((warn_unused_result)) _starpu_driver_copy_data_1_to_1(starpu_d
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-void _starpu_driver_wait_request_completion(starpu_async_channel *async_channel __attribute__ ((unused)),
			
 
				-					unsigned handling_node)
			
 
				+void _starpu_driver_wait_request_completion(struct starpu_async_channel *async_channel)
			
 
				 {
			
 
				-	starpu_node_kind kind = _starpu_get_node_kind(handling_node);
			
 
				+	starpu_node_kind kind = async_channel->type;
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	cudaEvent_t event;
			
 
				 	cudaError_t cures;
			
@@ -275,7 +300,7 @@ void _starpu_driver_wait_request_completion(starpu_async_channel *async_channel
 
				 	switch (kind) {
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 		case STARPU_CUDA_RAM:
			
 
				-			event = (*async_channel).cuda_event;
			
 
				+			event = (*async_channel).event.cuda_event;
			
 
				 
			
 
				 			cures = cudaEventSynchronize(event);
			
 
				 			if (STARPU_UNLIKELY(cures))
			
@@ -290,10 +315,10 @@ void _starpu_driver_wait_request_completion(starpu_async_channel *async_channel
 
				 #ifdef STARPU_USE_OPENCL
			
 
				       case STARPU_OPENCL_RAM:
			
 
				          {
			
 
				-                 if ((*async_channel).opencl_event == NULL) STARPU_ABORT();
			
 
				-                 cl_int err = clWaitForEvents(1, &((*async_channel).opencl_event));
			
 
				+                 if ((*async_channel).event.opencl_event == NULL) STARPU_ABORT();
			
 
				+                 cl_int err = clWaitForEvents(1, &((*async_channel).event.opencl_event));
			
 
				                  if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-                 clReleaseEvent((*async_channel).opencl_event);
			
 
				+                 clReleaseEvent((*async_channel).event.opencl_event);
			
 
				          }
			
 
				          break;
			
 
				 #endif
			
@@ -303,10 +328,9 @@ void _starpu_driver_wait_request_completion(starpu_async_channel *async_channel
 
				 	}
			
 
				 }
			
 
				 
			
 
				-unsigned _starpu_driver_test_request_completion(starpu_async_channel *async_channel __attribute__ ((unused)),
			
 
				-					unsigned handling_node)
			
 
				+unsigned _starpu_driver_test_request_completion(struct starpu_async_channel *async_channel)
			
 
				 {
			
 
				-	starpu_node_kind kind = _starpu_get_node_kind(handling_node);
			
 
				+	starpu_node_kind kind = async_channel->type;
			
 
				 	unsigned success;
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	cudaEvent_t event;
			
@@ -315,11 +339,14 @@ unsigned _starpu_driver_test_request_completion(starpu_async_channel *async_chan
 
				 	switch (kind) {
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 		case STARPU_CUDA_RAM:
			
 
				-			event = (*async_channel).cuda_event;
			
 
				+			event = (*async_channel).event.cuda_event;
			
 
				+			CUresult cures = cudaEventQuery(event);
			
 
				 
			
 
				-			success = (cudaEventQuery(event) == cudaSuccess);
			
 
				+			success = (cures == cudaSuccess);
			
 
				 			if (success)
			
 
				 				cudaEventDestroy(event);
			
 
				+			else if (cures != cudaErrorNotReady)
			
 
				+				STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 
			
 
				 			break;
			
 
				 #endif
			
@@ -327,7 +354,7 @@ unsigned _starpu_driver_test_request_completion(starpu_async_channel *async_chan
 
				       case STARPU_OPENCL_RAM:
			
 
				          {
			
 
				             cl_int event_status;
			
 
				-            cl_event opencl_event = (*async_channel).opencl_event;
			
 
				+            cl_event opencl_event = (*async_channel).event.opencl_event;
			
 
				             if (opencl_event == NULL) STARPU_ABORT();
			
 
				             cl_int err = clGetEventInfo(opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
			
 
				             if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
--- a/src/datawizard/copy_driver.h
+++ b/src/datawizard/copy_driver.h
@@ -46,7 +46,12 @@ typedef union {
 
				 #ifdef STARPU_USE_OPENCL
			
 
				         cl_event opencl_event;
			
 
				 #endif
			
 
				-} starpu_async_channel;
			
 
				+} starpu_async_channel_event;
			
 
				+
			
 
				+struct starpu_async_channel {
			
 
				+	starpu_async_channel_event event;
			
 
				+	starpu_node_kind type;
			
 
				+};
			
 
				 
			
 
				 void _starpu_wake_all_blocked_workers_on_node(unsigned nodeid);
			
 
				 
			
@@ -57,6 +62,6 @@ int _starpu_driver_copy_data_1_to_1(starpu_data_handle handle,
 
				 					struct starpu_data_request_s *req,
			
 
				 					unsigned may_alloc);
			
 
				 
			
 
				-unsigned _starpu_driver_test_request_completion(starpu_async_channel *async_channel, unsigned handling_node);
			
 
				-void _starpu_driver_wait_request_completion(starpu_async_channel *async_channel, unsigned handling_node);
			
 
				+unsigned _starpu_driver_test_request_completion(struct starpu_async_channel *async_channel);
			
 
				+void _starpu_driver_wait_request_completion(struct starpu_async_channel *async_channel);
			
 
				 #endif // __COPY_DRIVER_H__
			
--- a/src/datawizard/data_request.c
+++ b/src/datawizard/data_request.c
@@ -62,8 +62,22 @@ void _starpu_deinit_data_request_lists(void)
 
				 /* this should be called with the lock r->handle->header_lock taken */
			
 
				 static void starpu_data_request_destroy(starpu_data_request_t r)
			
 
				 {
			
 
				-	STARPU_ASSERT(r->dst_replicate->request == r);
			
 
				-	r->dst_replicate->request = NULL;
			
 
				+	unsigned node;
			
 
				+
			
 
				+	/* If this is a write only request, then there is no source and we use
			
 
				+	 * the destination node to cache the request. Otherwise we store the
			
 
				+	 * pending requests between src and dst. */
			
 
				+	if (r->mode & STARPU_R)
			
 
				+	{
			
 
				+		node = r->src_replicate->memory_node;
			
 
				+	}
			
 
				+	else {
			
 
				+		node = r->dst_replicate->memory_node;
			
 
				+	}
			
 
				+
			
 
				+	STARPU_ASSERT(r->dst_replicate->request[node] == r);
			
 
				+	r->dst_replicate->request[node] = NULL;
			
 
				+	//fprintf(stderr, "DESTROY REQ %p (%d) refcnt %d\n", r, node, r->refcnt);
			
 
				 	starpu_data_request_delete(r);
			
 
				 }
			
 
				 
			
@@ -73,8 +87,7 @@ starpu_data_request_t _starpu_create_data_request(starpu_data_handle handle,
 
				 				struct starpu_data_replicate_s *dst_replicate,
			
 
				 				uint32_t handling_node,
			
 
				 				starpu_access_mode mode,
			
 
				-				unsigned ndeps,
			
 
				-				unsigned is_prefetch)
			
 
				+				unsigned ndeps)
			
 
				 {
			
 
				 	starpu_data_request_t r = starpu_data_request_new();
			
 
				 
			
@@ -90,15 +103,21 @@ starpu_data_request_t _starpu_create_data_request(starpu_data_handle handle,
 
				 	r->ndeps = ndeps;
			
 
				 	r->next_req_count = 0;
			
 
				 	r->callbacks = NULL;
			
 
				-	r->is_a_prefetch_request = is_prefetch;
			
 
				 
			
 
				 	_starpu_spin_lock(&r->lock);
			
 
				 
			
 
				-	dst_replicate->request = r;
			
 
				 	dst_replicate->refcnt++;
			
 
				 
			
 
				 	if (mode & STARPU_R)
			
 
				+	{
			
 
				+		unsigned src_node = src_replicate->memory_node;
			
 
				+		dst_replicate->request[src_node] = r;
			
 
				 		src_replicate->refcnt++;
			
 
				+	}
			
 
				+	else {
			
 
				+		unsigned dst_node = dst_replicate->memory_node;
			
 
				+		dst_replicate->request[dst_node] = r;
			
 
				+	}
			
 
				 
			
 
				 	r->refcnt = 1;
			
 
				 
			
@@ -107,34 +126,6 @@ starpu_data_request_t _starpu_create_data_request(starpu_data_handle handle,
 
				 	return r;
			
 
				 }
			
 
				 
			
 
				-/* handle->lock should be taken. r is returned locked */
			
 
				-starpu_data_request_t _starpu_search_existing_data_request(struct starpu_data_replicate_s *replicate, starpu_access_mode mode)
			
 
				-{
			
 
				-	starpu_data_request_t r = replicate->request;
			
 
				-
			
 
				-	if (r)
			
 
				-	{
			
 
				-		_starpu_spin_lock(&r->lock);
			
 
				-
			
 
				-		/* perhaps we need to "upgrade" the request */
			
 
				-		if (mode & STARPU_R)
			
 
				-		{
			
 
				-			/* in case the exisiting request did not imply a memory
			
 
				-			 * transfer yet, we have to increment the refcnt now
			
 
				-			 * (so that the source remains valid) */
			
 
				-			if (!(r->mode & STARPU_R))
			
 
				-				replicate->refcnt++;
			
 
				-
			
 
				-			r->mode |= STARPU_R;
			
 
				-		}
			
 
				-
			
 
				-		if (mode & STARPU_W)
			
 
				-			r->mode |= STARPU_W;
			
 
				-	}
			
 
				-
			
 
				-	return r;
			
 
				-}
			
 
				-
			
 
				 int _starpu_wait_data_request_completion(starpu_data_request_t r, unsigned may_alloc)
			
 
				 {
			
 
				 	int retval;
			
@@ -421,11 +412,11 @@ static void _handle_pending_node_data_requests(uint32_t src_node, unsigned force
 
				 		/* wait until the transfer is terminated */
			
 
				 		if (force)
			
 
				 		{
			
 
				-			_starpu_driver_wait_request_completion(&r->async_channel, src_node);
			
 
				+			_starpu_driver_wait_request_completion(&r->async_channel);
			
 
				 			starpu_handle_data_request_completion(r);
			
 
				 		}
			
 
				 		else {
			
 
				-			if (_starpu_driver_test_request_completion(&r->async_channel, src_node))
			
 
				+			if (_starpu_driver_test_request_completion(&r->async_channel))
			
 
				 			{
			
 
				 				/* The request was completed */
			
 
				 				starpu_handle_data_request_completion(r);
			
--- a/src/datawizard/data_request.h
+++ b/src/datawizard/data_request.h
@@ -44,7 +44,7 @@ LIST_TYPE(starpu_data_request,
 
				 
			
 
				 	starpu_access_mode mode;
			
 
				 
			
 
				-	starpu_async_channel async_channel;
			
 
				+	struct starpu_async_channel async_channel;
			
 
				 
			
 
				 	unsigned completed;
			
 
				 	int retval;
			
@@ -60,8 +60,6 @@ LIST_TYPE(starpu_data_request,
 
				 
			
 
				 	struct callback_list *callbacks;
			
 
				 
			
 
				-	unsigned is_a_prefetch_request;
			
 
				-
			
 
				 #ifdef STARPU_USE_FXT
			
 
				 	unsigned com_id;
			
 
				 #endif
			
@@ -102,10 +100,8 @@ starpu_data_request_t _starpu_create_data_request(starpu_data_handle handle,
 
				 				struct starpu_data_replicate_s *dst_replicate,
			
 
				 				uint32_t handling_node,
			
 
				 				starpu_access_mode mode,
			
 
				-				unsigned ndeps,
			
 
				-				unsigned is_prefetch);
			
 
				+				unsigned ndeps);
			
 
				 
			
 
				-starpu_data_request_t _starpu_search_existing_data_request(struct starpu_data_replicate_s *replicate, starpu_access_mode mode);
			
 
				 int _starpu_wait_data_request_completion(starpu_data_request_t r, unsigned may_alloc);
			
 
				 
			
 
				 void _starpu_data_request_append_callback(starpu_data_request_t r,
			
--- a/src/datawizard/filters.c
+++ b/src/datawizard/filters.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
@@ -41,11 +41,9 @@ static void map_filter(starpu_data_handle root_handle, struct starpu_data_filter
 
				 		}
			
 
				 	}
			
 
				 }
			
 
				-void starpu_data_map_filters(starpu_data_handle root_handle, unsigned nfilters, ...)
			
 
				+void starpu_data_vmap_filters(starpu_data_handle root_handle, unsigned nfilters, va_list pa)
			
 
				 {
			
 
				 	unsigned i;
			
 
				-	va_list pa;
			
 
				-	va_start(pa, nfilters);
			
 
				 	for (i = 0; i < nfilters; i++)
			
 
				 	{
			
 
				 		struct starpu_data_filter *next_filter;
			
@@ -55,6 +53,13 @@ void starpu_data_map_filters(starpu_data_handle root_handle, unsigned nfilters,
 
				 
			
 
				 		map_filter(root_handle, next_filter);
			
 
				 	}
			
 
				+}
			
 
				+
			
 
				+void starpu_data_map_filters(starpu_data_handle root_handle, unsigned nfilters, ...)
			
 
				+{
			
 
				+	va_list pa;
			
 
				+	va_start(pa, nfilters);
			
 
				+	starpu_data_vmap_filters(root_handle, nfilters, pa);
			
 
				 	va_end(pa);
			
 
				 }
			
 
				 
			
@@ -75,22 +80,30 @@ starpu_data_handle starpu_data_get_child(starpu_data_handle handle, unsigned i)
 
				  */
			
 
				 starpu_data_handle starpu_data_get_sub_data(starpu_data_handle root_handle, unsigned depth, ... )
			
 
				 {
			
 
				+	va_list pa;
			
 
				+	va_start(pa, depth);
			
 
				+	starpu_data_handle handle = starpu_data_vget_sub_data(root_handle, depth, pa);
			
 
				+	va_end(pa);
			
 
				+
			
 
				+	return handle;
			
 
				+}
			
 
				+
			
 
				+starpu_data_handle starpu_data_vget_sub_data(starpu_data_handle root_handle, unsigned depth, va_list pa )
			
 
				+{
			
 
				 	STARPU_ASSERT(root_handle);
			
 
				 	starpu_data_handle current_handle = root_handle;
			
 
				 
			
 
				 	/* the number of variadic arguments must match the depth in the tree */
			
 
				 	unsigned i; 
			
 
				-	va_list pa;
			
 
				-	va_start(pa, depth);
			
 
				 	for (i = 0; i < depth; i++)
			
 
				 	{
			
 
				 		unsigned next_child;
			
 
				 		next_child = va_arg(pa, unsigned);
			
 
				+
			
 
				 		STARPU_ASSERT(next_child < current_handle->nchildren);
			
 
				 
			
 
				 		current_handle = &current_handle->children[next_child];
			
 
				 	}
			
 
				-	va_end(pa);
			
 
				 
			
 
				 	return current_handle;
			
 
				 }
			
@@ -142,6 +155,7 @@ void starpu_data_partition(starpu_data_handle initial_handle, struct starpu_data
 
				 		child->req_list = starpu_data_requester_list_new();
			
 
				 		child->reduction_req_list = starpu_data_requester_list_new();
			
 
				 		child->refcnt = 0;
			
 
				+		child->reduction_refcnt = 0;
			
 
				 		_starpu_spin_init(&child->header_lock);
			
 
				 
			
 
				 		child->sequential_consistency = initial_handle->sequential_consistency;
			
@@ -158,9 +172,6 @@ void starpu_data_partition(starpu_data_handle initial_handle, struct starpu_data
 
				 		child->redux_cl = initial_handle->redux_cl;
			
 
				 		child->init_cl = initial_handle->init_cl;
			
 
				 
			
 
				-		child->reduction_refcnt = 0;
			
 
				-		child->reduction_req_list = starpu_data_requester_list_new();
			
 
				-
			
 
				 #ifdef STARPU_USE_FXT
			
 
				 		child->last_submitted_ghost_writer_id_is_valid = 0;
			
 
				 		child->last_submitted_ghost_writer_id = 0;
			
@@ -201,8 +212,13 @@ void starpu_data_partition(starpu_data_handle initial_handle, struct starpu_data
 
				 			child_replicate->automatically_allocated = 0;
			
 
				 			child_replicate->refcnt = 0;
			
 
				 			child_replicate->memory_node = starpu_worker_get_memory_node(worker);
			
 
				-			child_replicate->requested = 0;
			
 
				-			child_replicate->request = NULL;
			
 
				+
			
 
				+			for (node = 0; node < STARPU_MAXNODES; node++)
			
 
				+			{
			
 
				+				child_replicate->requested[node] = 0;
			
 
				+				child_replicate->request[node] = NULL;
			
 
				+			}
			
 
				+
			
 
				 			child_replicate->relaxed_coherency = 1;
			
 
				 			child_replicate->initialized = 0;
			
 
				 
			
@@ -214,6 +230,13 @@ void starpu_data_partition(starpu_data_handle initial_handle, struct starpu_data
 
				 		 * store it in the handle */
			
 
				 		child->data_size = child->ops->get_size(child);
			
 
				 		child->footprint = _starpu_compute_data_footprint(child);
			
 
				+
			
 
				+		void *ptr;
			
 
				+		ptr = starpu_handle_to_pointer(child, 0);
			
 
				+		if (ptr != NULL)
			
 
				+		{
			
 
				+			_starpu_data_register_ram_pointer(child, ptr);
			
 
				+		}
			
 
				 	}
			
 
				 	/* now let the header */
			
 
				 	_starpu_spin_unlock(&initial_handle->header_lock);
			
@@ -244,6 +267,8 @@ void starpu_data_unpartition(starpu_data_handle root_handle, uint32_t gathering_
 
				 		STARPU_ASSERT(ret == 0); 
			
 
				 
			
 
				 		_starpu_data_free_interfaces(&root_handle->children[child]);
			
 
				+		starpu_data_requester_list_delete(child_handle->req_list);
			
 
				+		starpu_data_requester_list_delete(child_handle->reduction_req_list);
			
 
				 	}
			
 
				 
			
 
				 	/* the gathering_node should now have a valid copy of all the children.
			
@@ -279,7 +304,9 @@ void starpu_data_unpartition(starpu_data_handle root_handle, uint32_t gathering_
 
				 				_starpu_request_mem_chunk_removal(root_handle, node);
			
 
				 				isvalid = 0; 
			
 
				 			}
			
 
				+#ifdef STARPU_DEVEL
			
 
				 #warning free the data replicate if needed
			
 
				+#endif
			
 
				 
			
 
				 		}
			
 
				 
			
@@ -300,6 +327,7 @@ void starpu_data_unpartition(starpu_data_handle root_handle, uint32_t gathering_
 
				 	}
			
 
				 
			
 
				 	/* there is no child anymore */
			
 
				+	//free(root_handle->children);
			
 
				 	root_handle->nchildren = 0;
			
 
				 
			
 
				 	/* now the parent may be used again so we release the lock */
			
--- a/src/datawizard/footprint.c
+++ b/src/datawizard/footprint.c
@@ -18,8 +18,11 @@
 
				 #include <datawizard/footprint.h>
			
 
				 #include <common/hash.h>
			
 
				 
			
 
				-void _starpu_compute_buffers_footprint(starpu_job_t j)
			
 
				+uint32_t _starpu_compute_buffers_footprint(starpu_job_t j)
			
 
				 {
			
 
				+	if (j->footprint_is_computed)
			
 
				+		return j->footprint;
			
 
				+
			
 
				 	uint32_t footprint = 0;
			
 
				 	unsigned buffer;
			
 
				 
			
@@ -36,6 +39,8 @@ void _starpu_compute_buffers_footprint(starpu_job_t j)
 
				 
			
 
				 	j->footprint = footprint;
			
 
				 	j->footprint_is_computed = 1;
			
 
				+
			
 
				+	return footprint;
			
 
				 }
			
 
				 
			
 
				 inline uint32_t _starpu_compute_data_footprint(starpu_data_handle handle)
			
--- a/src/datawizard/footprint.h
+++ b/src/datawizard/footprint.h
@@ -24,7 +24,7 @@
 
				 
			
 
				 /* Compute the footprint that characterizes the job and cache it into the job
			
 
				  * structure. */
			
 
				-void _starpu_compute_buffers_footprint(struct starpu_job_s *j);
			
 
				+uint32_t _starpu_compute_buffers_footprint(struct starpu_job_s *j);
			
 
				 
			
 
				 /* Compute the footprint that characterizes the layout of the data handle. */
			
 
				 uint32_t _starpu_compute_data_footprint(starpu_data_handle handle);
			
--- a/src/datawizard/interfaces/bcsr_filters.c
+++ b/src/datawizard/interfaces/bcsr_filters.c
@@ -2,7 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -20,7 +20,7 @@
 
				 #include <common/config.h>
			
 
				 #include <datawizard/filters.h>
			
 
				 
			
 
				-void starpu_canonical_block_filter_bcsr(void *father_interface, void *child_interface, __attribute__((unused)) struct starpu_data_filter *f, unsigned id, __attribute__((unused)) unsigned nparts)
			
 
				+void starpu_canonical_block_filter_bcsr(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, unsigned id, STARPU_ATTRIBUTE_UNUSED unsigned nparts)
			
 
				 {
			
 
				 	struct starpu_bcsr_interface_s *bcsr_father = father_interface;
			
 
				 	/* each chunk becomes a small dense matrix */
			
--- a/src/datawizard/interfaces/bcsr_interface.c
+++ b/src/datawizard/interfaces/bcsr_interface.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -31,15 +31,15 @@
 
				  * BCSR : blocked CSR, we use blocks of size (r x c)
			
 
				  */
			
 
				 
			
 
				-static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				+static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				-static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				+static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				+static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				-static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				+static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				+static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				 #endif
			
 
				 
			
 
				 static const struct starpu_data_copy_methods bcsr_copy_data_methods_s = {
			
@@ -60,11 +60,11 @@ static const struct starpu_data_copy_methods bcsr_copy_data_methods_s = {
 
				 	.spu_to_spu = NULL
			
 
				 };
			
 
				 
			
 
				-static void register_bcsr_handle(starpu_data_handle handle, uint32_t home_node, void *interface);
			
 
				-static ssize_t allocate_bcsr_buffer_on_node(void *interface, uint32_t dst_node);
			
 
				-static void free_bcsr_buffer_on_node(void *interface, uint32_t node);
			
 
				+static void register_bcsr_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface);
			
 
				+static ssize_t allocate_bcsr_buffer_on_node(void *data_interface, uint32_t dst_node);
			
 
				+static void free_bcsr_buffer_on_node(void *data_interface, uint32_t node);
			
 
				 static size_t bcsr_interface_get_size(starpu_data_handle handle);
			
 
				-static int bcsr_compare(void *interface_a, void *interface_b);
			
 
				+static int bcsr_compare(void *data_interface_a, void *data_interface_b);
			
 
				 static uint32_t footprint_bcsr_interface_crc32(starpu_data_handle handle);
			
 
				 
			
 
				 
			
@@ -80,9 +80,9 @@ static struct starpu_data_interface_ops_t interface_bcsr_ops = {
 
				 	.compare = bcsr_compare
			
 
				 };
			
 
				 
			
 
				-static void register_bcsr_handle(starpu_data_handle handle, uint32_t home_node, void *interface)
			
 
				+static void register_bcsr_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface)
			
 
				 {
			
 
				-	starpu_bcsr_interface_t *bcsr_interface = interface;
			
 
				+	starpu_bcsr_interface_t *bcsr_interface = data_interface;
			
 
				 
			
 
				 	unsigned node;
			
 
				 	for (node = 0; node < STARPU_MAXNODES; node++)
			
@@ -115,7 +115,7 @@ void starpu_bcsr_data_register(starpu_data_handle *handleptr, uint32_t home_node
 
				 		uint32_t *rowptr, uint32_t firstentry,
			
 
				 		uint32_t r, uint32_t c, size_t elemsize)
			
 
				 {
			
 
				-	starpu_bcsr_interface_t interface = {
			
 
				+	starpu_bcsr_interface_t bcsr_interface = {
			
 
				 		.nzval = nzval,
			
 
				 		.colind = colind,
			
 
				 		.rowptr = rowptr,
			
@@ -127,7 +127,7 @@ void starpu_bcsr_data_register(starpu_data_handle *handleptr, uint32_t home_node
 
				 		.elemsize = elemsize
			
 
				 	};
			
 
				 
			
 
				-	starpu_data_register(handleptr, home_node, &interface, &interface_bcsr_ops);
			
 
				+	starpu_data_register(handleptr, home_node, &bcsr_interface, &interface_bcsr_ops);
			
 
				 }
			
 
				 
			
 
				 static uint32_t footprint_bcsr_interface_crc32(starpu_data_handle handle)
			
@@ -141,10 +141,10 @@ static uint32_t footprint_bcsr_interface_crc32(starpu_data_handle handle)
 
				 	return hash;
			
 
				 }
			
 
				 
			
 
				-static int bcsr_compare(void *interface_a, void *interface_b)
			
 
				+static int bcsr_compare(void *data_interface_a, void *data_interface_b)
			
 
				 {
			
 
				-	starpu_bcsr_interface_t *bcsr_a = interface_a;
			
 
				-	starpu_bcsr_interface_t *bcsr_b = interface_b;
			
 
				+	starpu_bcsr_interface_t *bcsr_a = data_interface_a;
			
 
				+	starpu_bcsr_interface_t *bcsr_b = data_interface_b;
			
 
				 
			
 
				 	/* Two matrices are considered compatible if they have the same size */
			
 
				 	return ((bcsr_a->nnz == bcsr_b->nnz)
			
@@ -157,50 +157,50 @@ static int bcsr_compare(void *interface_a, void *interface_b)
 
				 /* offer an access to the data parameters */
			
 
				 uint32_t starpu_bcsr_get_nnz(starpu_data_handle handle)
			
 
				 {
			
 
				-	starpu_bcsr_interface_t *interface =
			
 
				+	starpu_bcsr_interface_t *data_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, 0);
			
 
				 
			
 
				-	return interface->nnz;
			
 
				+	return data_interface->nnz;
			
 
				 }
			
 
				 
			
 
				 uint32_t starpu_bcsr_get_nrow(starpu_data_handle handle)
			
 
				 {
			
 
				-	starpu_bcsr_interface_t *interface =
			
 
				+	starpu_bcsr_interface_t *data_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, 0);
			
 
				 
			
 
				-	return interface->nrow;
			
 
				+	return data_interface->nrow;
			
 
				 }
			
 
				 
			
 
				 uint32_t starpu_bcsr_get_firstentry(starpu_data_handle handle)
			
 
				 {
			
 
				-	starpu_bcsr_interface_t *interface =
			
 
				+	starpu_bcsr_interface_t *data_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, 0);
			
 
				 
			
 
				-	return interface->firstentry;
			
 
				+	return data_interface->firstentry;
			
 
				 }
			
 
				 
			
 
				 uint32_t starpu_bcsr_get_r(starpu_data_handle handle)
			
 
				 {
			
 
				-	starpu_bcsr_interface_t *interface =
			
 
				+	starpu_bcsr_interface_t *data_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, 0);
			
 
				 
			
 
				-	return interface->r;
			
 
				+	return data_interface->r;
			
 
				 }
			
 
				 
			
 
				 uint32_t starpu_bcsr_get_c(starpu_data_handle handle)
			
 
				 {
			
 
				-	starpu_bcsr_interface_t *interface =
			
 
				+	starpu_bcsr_interface_t *data_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, 0);
			
 
				 
			
 
				-	return interface->c;
			
 
				+	return data_interface->c;
			
 
				 }
			
 
				 
			
 
				 size_t starpu_bcsr_get_elemsize(starpu_data_handle handle)
			
 
				 {
			
 
				-	starpu_bcsr_interface_t *interface =
			
 
				+	starpu_bcsr_interface_t *data_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, 0);
			
 
				 
			
 
				-	return interface->elemsize;
			
 
				+	return data_interface->elemsize;
			
 
				 }
			
 
				 
			
 
				 uintptr_t starpu_bcsr_get_local_nzval(starpu_data_handle handle)
			
@@ -210,28 +210,28 @@ uintptr_t starpu_bcsr_get_local_nzval(starpu_data_handle handle)
 
				 
			
 
				 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
			
 
				 
			
 
				-	starpu_bcsr_interface_t *interface =
			
 
				+	starpu_bcsr_interface_t *data_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, node);
			
 
				 	
			
 
				-	return interface->nzval;
			
 
				+	return data_interface->nzval;
			
 
				 }
			
 
				 
			
 
				 uint32_t *starpu_bcsr_get_local_colind(starpu_data_handle handle)
			
 
				 {
			
 
				 	/* XXX 0 */
			
 
				-	starpu_bcsr_interface_t *interface =
			
 
				+	starpu_bcsr_interface_t *data_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, 0);
			
 
				 
			
 
				-	return interface->colind;
			
 
				+	return data_interface->colind;
			
 
				 }
			
 
				 
			
 
				 uint32_t *starpu_bcsr_get_local_rowptr(starpu_data_handle handle)
			
 
				 {
			
 
				 	/* XXX 0 */
			
 
				-	starpu_bcsr_interface_t *interface =
			
 
				+	starpu_bcsr_interface_t *data_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, 0);
			
 
				 
			
 
				-	return interface->rowptr;
			
 
				+	return data_interface->rowptr;
			
 
				 }
			
 
				 
			
 
				 
			
@@ -254,21 +254,21 @@ static size_t bcsr_interface_get_size(starpu_data_handle handle)
 
				 /* memory allocation/deallocation primitives for the BCSR interface */
			
 
				 
			
 
				 /* returns the size of the allocated area */
			
 
				-static ssize_t allocate_bcsr_buffer_on_node(void *interface_, uint32_t dst_node)
			
 
				+static ssize_t allocate_bcsr_buffer_on_node(void *data_interface_, uint32_t dst_node)
			
 
				 {
			
 
				 	uintptr_t addr_nzval;
			
 
				 	uint32_t *addr_colind, *addr_rowptr;
			
 
				 	ssize_t allocated_memory;
			
 
				 
			
 
				 	/* we need the 3 arrays to be allocated */
			
 
				-	starpu_bcsr_interface_t *interface = interface_;
			
 
				+	starpu_bcsr_interface_t *bcsr_interface = data_interface_;
			
 
				 
			
 
				-	uint32_t nnz = interface->nnz;
			
 
				-	uint32_t nrow = interface->nrow;
			
 
				-	size_t elemsize = interface->elemsize;
			
 
				+	uint32_t nnz = bcsr_interface->nnz;
			
 
				+	uint32_t nrow = bcsr_interface->nrow;
			
 
				+	size_t elemsize = bcsr_interface->elemsize;
			
 
				 
			
 
				-	uint32_t r = interface->r;
			
 
				-	uint32_t c = interface->c;
			
 
				+	uint32_t r = bcsr_interface->r;
			
 
				+	uint32_t c = bcsr_interface->c;
			
 
				 
			
 
				 	starpu_node_kind kind = _starpu_get_node_kind(dst_node);
			
 
				 
			
@@ -333,9 +333,9 @@ static ssize_t allocate_bcsr_buffer_on_node(void *interface_, uint32_t dst_node)
 
				 		nnz*r*c*elemsize + nnz*sizeof(uint32_t) + (nrow+1)*sizeof(uint32_t);
			
 
				 
			
 
				 	/* update the data properly in consequence */
			
 
				-	interface->nzval = addr_nzval;
			
 
				-	interface->colind = addr_colind;
			
 
				-	interface->rowptr = addr_rowptr;
			
 
				+	bcsr_interface->nzval = addr_nzval;
			
 
				+	bcsr_interface->colind = addr_colind;
			
 
				+	bcsr_interface->rowptr = addr_rowptr;
			
 
				 	
			
 
				 	return allocated_memory;
			
 
				 
			
@@ -381,9 +381,9 @@ fail_nzval:
 
				 	return -ENOMEM;
			
 
				 }
			
 
				 
			
 
				-static void free_bcsr_buffer_on_node(void *interface, uint32_t node)
			
 
				+static void free_bcsr_buffer_on_node(void *data_interface, uint32_t node)
			
 
				 {
			
 
				-	starpu_bcsr_interface_t *bcsr_interface = interface;	
			
 
				+	starpu_bcsr_interface_t *bcsr_interface = data_interface;
			
 
				 
			
 
				 	starpu_node_kind kind = _starpu_get_node_kind(node);
			
 
				 	switch(kind) {
			
@@ -412,7 +412,7 @@ static void free_bcsr_buffer_on_node(void *interface, uint32_t node)
 
				 }
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), enum cudaMemcpyKind kind)
			
 
				+static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
			
 
				 {
			
 
				 	starpu_bcsr_interface_t *src_bcsr = src_interface;
			
 
				 	starpu_bcsr_interface_t *dst_bcsr = dst_interface;
			
@@ -443,24 +443,24 @@ static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				+static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
			
 
				 }
			
 
				 
			
 
				-static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				+static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
			
 
				 }
			
 
				 
			
 
				-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
			
 
				 }
			
 
				 #endif // STARPU_USE_CUDA
			
 
				 
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				+static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	starpu_bcsr_interface_t *src_bcsr = src_interface;
			
 
				 	starpu_bcsr_interface_t *dst_bcsr = dst_interface;
			
@@ -491,7 +491,7 @@ static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				+static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	starpu_bcsr_interface_t *src_bcsr = src_interface;
			
 
				 	starpu_bcsr_interface_t *dst_bcsr = dst_interface;
			
@@ -524,7 +524,7 @@ static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute
 
				 #endif // STARPU_USE_OPENCL
			
 
				 
			
 
				 /* as not all platforms have a BLAS lib readily installed ... */
			
 
				-static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				+static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	starpu_bcsr_interface_t *src_bcsr = src_interface;
			
 
				 	starpu_bcsr_interface_t *dst_bcsr = dst_interface;
			
--- a/src/datawizard/interfaces/block_filters.c
+++ b/src/datawizard/interfaces/block_filters.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -18,7 +18,7 @@
 
				 #include <common/config.h>
			
 
				 #include <datawizard/filters.h>
			
 
				 
			
 
				-void starpu_block_filter_func_block(void *father_interface, void *child_interface, __attribute__((unused)) struct starpu_data_filter *f,
			
 
				+void starpu_block_filter_func_block(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f,
			
 
				                                     unsigned id, unsigned nparts)
			
 
				 {
			
 
				         starpu_block_interface_t *block_father = father_interface;
			
--- a/src/datawizard/interfaces/block_interface.c
+++ b/src/datawizard/interfaces/block_interface.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2009-2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -27,19 +27,19 @@
 
				 #include <starpu_opencl.h>
			
 
				 #include <drivers/opencl/driver_opencl.h>
			
 
				 
			
 
				-static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				+static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				-static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
			
 
				-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
			
 
				-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				+static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				+static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
			
 
				+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
			
 
				+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				-static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), void *_event);
			
 
				-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), void *_event);
			
 
				+static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				+static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event);
			
 
				+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event);
			
 
				 #endif
			
 
				 
			
 
				 static const struct starpu_data_copy_methods block_copy_data_methods_s = {
			
@@ -65,20 +65,22 @@ static const struct starpu_data_copy_methods block_copy_data_methods_s = {
 
				 };
			
 
				 
			
 
				 
			
 
				-static void register_block_handle(starpu_data_handle handle, uint32_t home_node, void *interface);
			
 
				-static ssize_t allocate_block_buffer_on_node(void *interface_, uint32_t dst_node);
			
 
				-static void free_block_buffer_on_node(void *interface, uint32_t node);
			
 
				+static void register_block_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface);
			
 
				+static void *block_handle_to_pointer(starpu_data_handle data_handle, uint32_t node);
			
 
				+static ssize_t allocate_block_buffer_on_node(void *data_interface_, uint32_t dst_node);
			
 
				+static void free_block_buffer_on_node(void *data_interface, uint32_t node);
			
 
				 static size_t block_interface_get_size(starpu_data_handle handle);
			
 
				 static uint32_t footprint_block_interface_crc32(starpu_data_handle handle);
			
 
				-static int block_compare(void *interface_a, void *interface_b);
			
 
				+static int block_compare(void *data_interface_a, void *data_interface_b);
			
 
				 static void display_block_interface(starpu_data_handle handle, FILE *f);
			
 
				 #ifdef STARPU_USE_GORDON
			
 
				-static int convert_block_to_gordon(void *interface, uint64_t *ptr, gordon_strideSize_t *ss);
			
 
				+static int convert_block_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss);
			
 
				 #endif
			
 
				 
			
 
				 static struct starpu_data_interface_ops_t interface_block_ops = {
			
 
				 	.register_data_handle = register_block_handle,
			
 
				 	.allocate_data_on_node = allocate_block_buffer_on_node,
			
 
				+	.handle_to_pointer = block_handle_to_pointer,
			
 
				 	.free_data_on_node = free_block_buffer_on_node,
			
 
				 	.copy_methods = &block_copy_data_methods_s,
			
 
				 	.get_size = block_interface_get_size,
			
@@ -93,7 +95,7 @@ static struct starpu_data_interface_ops_t interface_block_ops = {
 
				 };
			
 
				 
			
 
				 #ifdef STARPU_USE_GORDON
			
 
				-int convert_block_to_gordon(void *interface, uint64_t *ptr, gordon_strideSize_t *ss) 
			
 
				+int convert_block_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss) 
			
 
				 {
			
 
				 	/* TODO */
			
 
				 	STARPU_ABORT();
			
@@ -102,9 +104,19 @@ int convert_block_to_gordon(void *interface, uint64_t *ptr, gordon_strideSize_t
 
				 }
			
 
				 #endif
			
 
				 
			
 
				-static void register_block_handle(starpu_data_handle handle, uint32_t home_node, void *interface)
			
 
				+static void *block_handle_to_pointer(starpu_data_handle handle, uint32_t node)
			
 
				 {
			
 
				-	starpu_block_interface_t *block_interface = interface;
			
 
				+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
			
 
				+
			
 
				+	starpu_block_interface_t *block_interface =
			
 
				+		starpu_data_get_interface_on_node(handle, node);
			
 
				+
			
 
				+	return (void*) block_interface->ptr;
			
 
				+}
			
 
				+
			
 
				+static void register_block_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface)
			
 
				+{
			
 
				+	starpu_block_interface_t *block_interface = data_interface;
			
 
				 
			
 
				 	unsigned node;
			
 
				 	for (node = 0; node < STARPU_MAXNODES; node++)
			
@@ -139,7 +151,7 @@ void starpu_block_data_register(starpu_data_handle *handleptr, uint32_t home_nod
 
				 			uintptr_t ptr, uint32_t ldy, uint32_t ldz, uint32_t nx,
			
 
				 			uint32_t ny, uint32_t nz, size_t elemsize)
			
 
				 {
			
 
				-	starpu_block_interface_t interface = {
			
 
				+	starpu_block_interface_t block_interface = {
			
 
				 		.ptr = ptr,
			
 
				                 .dev_handle = ptr,
			
 
				                 .offset = 0,
			
@@ -151,7 +163,7 @@ void starpu_block_data_register(starpu_data_handle *handleptr, uint32_t home_nod
 
				 		.elemsize = elemsize
			
 
				 	};
			
 
				 
			
 
				-	starpu_data_register(handleptr, home_node, &interface, &interface_block_ops);
			
 
				+	starpu_data_register(handleptr, home_node, &block_interface, &interface_block_ops);
			
 
				 }
			
 
				 
			
 
				 static uint32_t footprint_block_interface_crc32(starpu_data_handle handle)
			
@@ -165,10 +177,10 @@ static uint32_t footprint_block_interface_crc32(starpu_data_handle handle)
 
				 	return hash;
			
 
				 }
			
 
				 
			
 
				-static int block_compare(void *interface_a, void *interface_b)
			
 
				+static int block_compare(void *data_interface_a, void *data_interface_b)
			
 
				 {
			
 
				-	starpu_block_interface_t *block_a = interface_a;
			
 
				-	starpu_block_interface_t *block_b = interface_b;
			
 
				+	starpu_block_interface_t *block_a = data_interface_a;
			
 
				+	starpu_block_interface_t *block_b = data_interface_b;
			
 
				 
			
 
				 	/* Two matricess are considered compatible if they have the same size */
			
 
				 	return ((block_a->nx == block_b->nx)
			
@@ -179,21 +191,21 @@ static int block_compare(void *interface_a, void *interface_b)
 
				 
			
 
				 static void display_block_interface(starpu_data_handle handle, FILE *f)
			
 
				 {
			
 
				-	starpu_block_interface_t *interface;
			
 
				+	starpu_block_interface_t *block_interface;
			
 
				 
			
 
				-	interface = starpu_data_get_interface_on_node(handle, 0);
			
 
				+	block_interface = starpu_data_get_interface_on_node(handle, 0);
			
 
				 
			
 
				-	fprintf(f, "%u\t%u\t%u\t", interface->nx, interface->ny, interface->nz);
			
 
				+	fprintf(f, "%u\t%u\t%u\t", block_interface->nx, block_interface->ny, block_interface->nz);
			
 
				 }
			
 
				 
			
 
				 static size_t block_interface_get_size(starpu_data_handle handle)
			
 
				 {
			
 
				 	size_t size;
			
 
				-	starpu_block_interface_t *interface;
			
 
				+	starpu_block_interface_t *block_interface;
			
 
				 
			
 
				-	interface = starpu_data_get_interface_on_node(handle, 0);
			
 
				+	block_interface = starpu_data_get_interface_on_node(handle, 0);
			
 
				 
			
 
				-	size = interface->nx*interface->ny*interface->nz*interface->elemsize; 
			
 
				+	size = block_interface->nx*block_interface->ny*block_interface->nz*block_interface->elemsize; 
			
 
				 
			
 
				 	return size;
			
 
				 }
			
@@ -201,26 +213,26 @@ static size_t block_interface_get_size(starpu_data_handle handle)
 
				 /* offer an access to the data parameters */
			
 
				 uint32_t starpu_block_get_nx(starpu_data_handle handle)
			
 
				 {
			
 
				-	starpu_block_interface_t *interface =
			
 
				+	starpu_block_interface_t *block_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, 0);
			
 
				 
			
 
				-	return interface->nx;
			
 
				+	return block_interface->nx;
			
 
				 }
			
 
				 
			
 
				 uint32_t starpu_block_get_ny(starpu_data_handle handle)
			
 
				 {
			
 
				-	starpu_block_interface_t *interface =
			
 
				+	starpu_block_interface_t *block_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, 0);
			
 
				 
			
 
				-	return interface->ny;
			
 
				+	return block_interface->ny;
			
 
				 }
			
 
				 
			
 
				 uint32_t starpu_block_get_nz(starpu_data_handle handle)
			
 
				 {
			
 
				-	starpu_block_interface_t *interface =
			
 
				+	starpu_block_interface_t *block_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, 0);
			
 
				 
			
 
				-	return interface->nz;
			
 
				+	return block_interface->nz;
			
 
				 }
			
 
				 
			
 
				 uint32_t starpu_block_get_local_ldy(starpu_data_handle handle)
			
@@ -230,10 +242,10 @@ uint32_t starpu_block_get_local_ldy(starpu_data_handle handle)
 
				 
			
 
				 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
			
 
				 	
			
 
				-	starpu_block_interface_t *interface =
			
 
				+	starpu_block_interface_t *block_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, node);
			
 
				 
			
 
				-	return interface->ldy;
			
 
				+	return block_interface->ldy;
			
 
				 }
			
 
				 
			
 
				 uint32_t starpu_block_get_local_ldz(starpu_data_handle handle)
			
@@ -243,10 +255,10 @@ uint32_t starpu_block_get_local_ldz(starpu_data_handle handle)
 
				 
			
 
				 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
			
 
				 
			
 
				-	starpu_block_interface_t *interface =
			
 
				+	starpu_block_interface_t *block_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, node);
			
 
				 
			
 
				-	return interface->ldz;
			
 
				+	return block_interface->ldz;
			
 
				 }
			
 
				 
			
 
				 uintptr_t starpu_block_get_local_ptr(starpu_data_handle handle)
			
@@ -256,25 +268,25 @@ uintptr_t starpu_block_get_local_ptr(starpu_data_handle handle)
 
				 
			
 
				 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
			
 
				 
			
 
				-	starpu_block_interface_t *interface =
			
 
				+	starpu_block_interface_t *block_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, node);
			
 
				 
			
 
				-	return interface->ptr;
			
 
				+	return block_interface->ptr;
			
 
				 }
			
 
				 
			
 
				 size_t starpu_block_get_elemsize(starpu_data_handle handle)
			
 
				 {
			
 
				-	starpu_block_interface_t *interface =
			
 
				+	starpu_block_interface_t *block_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, 0);
			
 
				 
			
 
				-	return interface->elemsize;
			
 
				+	return block_interface->elemsize;
			
 
				 }
			
 
				 
			
 
				 
			
 
				 /* memory allocation/deallocation primitives for the BLOCK interface */
			
 
				 
			
 
				 /* returns the size of the allocated area */
			
 
				-static ssize_t allocate_block_buffer_on_node(void *interface_, uint32_t dst_node)
			
 
				+static ssize_t allocate_block_buffer_on_node(void *data_interface_, uint32_t dst_node)
			
 
				 {
			
 
				 	uintptr_t addr = 0;
			
 
				 	unsigned fail = 0;
			
@@ -283,7 +295,7 @@ static ssize_t allocate_block_buffer_on_node(void *interface_, uint32_t dst_node
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	cudaError_t status;
			
 
				 #endif
			
 
				-	starpu_block_interface_t *dst_block = interface_;
			
 
				+	starpu_block_interface_t *dst_block = data_interface_;
			
 
				 
			
 
				 	uint32_t nx = dst_block->nx;
			
 
				 	uint32_t ny = dst_block->ny;
			
@@ -350,9 +362,9 @@ static ssize_t allocate_block_buffer_on_node(void *interface_, uint32_t dst_node
 
				 	return allocated_memory;
			
 
				 }
			
 
				 
			
 
				-static void free_block_buffer_on_node(void *interface, uint32_t node)
			
 
				+static void free_block_buffer_on_node(void *data_interface, uint32_t node)
			
 
				 {
			
 
				-	starpu_block_interface_t *block_interface = interface;
			
 
				+	starpu_block_interface_t *block_interface = data_interface;
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	cudaError_t status;
			
@@ -382,7 +394,7 @@ static void free_block_buffer_on_node(void *interface, uint32_t node)
 
				 }
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), enum cudaMemcpyKind kind)
			
 
				+static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
			
 
				 {
			
 
				 	starpu_block_interface_t *src_block = src_interface;
			
 
				 	starpu_block_interface_t *dst_block = dst_interface;
			
@@ -435,7 +447,7 @@ static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-static int copy_cuda_async_common(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream, enum cudaMemcpyKind kind)
			
 
				+static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream, enum cudaMemcpyKind kind)
			
 
				 {
			
 
				 	starpu_block_interface_t *src_block = src_interface;
			
 
				 	starpu_block_interface_t *dst_block = dst_interface;
			
@@ -547,29 +559,29 @@ static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_in
 
				 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
			
 
				 }
			
 
				 
			
 
				-static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				+static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
			
 
				 }
			
 
				 
			
 
				-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
			
 
				 }
			
 
				 
			
 
				-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
			
 
				+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
			
 
				 {
			
 
				 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
			
 
				 }
			
 
				 
			
 
				-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
			
 
				+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
			
 
				 {
			
 
				 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
			
 
				 }
			
 
				 #endif // STARPU_USE_CUDA
			
 
				 
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), void *_event)
			
 
				+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event)
			
 
				 {
			
 
				 	starpu_block_interface_t *src_block = src_interface;
			
 
				 	starpu_block_interface_t *dst_block = dst_interface;
			
@@ -636,7 +648,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node __att
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), void *_event)
			
 
				+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event)
			
 
				 {
			
 
				 	starpu_block_interface_t *src_block = src_interface;
			
 
				 	starpu_block_interface_t *dst_block = dst_interface;
			
@@ -695,12 +707,12 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node __att
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				+static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				         return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				 }
			
 
				 
			
 
				-static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				+static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				         return copy_opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				 }
			
@@ -708,7 +720,7 @@ static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute
 
				 #endif
			
 
				 
			
 
				 /* as not all platform easily have a BLAS lib installed ... */
			
 
				-static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				+static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	starpu_block_interface_t *src_block = src_interface;
			
 
				 	starpu_block_interface_t *dst_block = dst_interface;
			
--- a/src/datawizard/interfaces/csr_filters.c
+++ b/src/datawizard/interfaces/csr_filters.c
@@ -2,7 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -20,7 +20,7 @@
 
				 #include <common/config.h>
			
 
				 #include <datawizard/filters.h>
			
 
				 
			
 
				-void starpu_vertical_block_filter_func_csr(void *father_interface, void *child_interface, __attribute__((unused)) struct starpu_data_filter *f, unsigned id, unsigned nchunks)
			
 
				+void starpu_vertical_block_filter_func_csr(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, unsigned id, unsigned nchunks)
			
 
				 {
			
 
				 	starpu_csr_interface_t *csr_father = father_interface;
			
 
				 	starpu_csr_interface_t *csr_child = child_interface;
			
--- a/src/datawizard/interfaces/csr_interface.c
+++ b/src/datawizard/interfaces/csr_interface.c
@@ -2,7 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -28,15 +28,18 @@
 
				 #include <starpu_opencl.h>
			
 
				 #include <drivers/opencl/driver_opencl.h>
			
 
				 
			
 
				-static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				+static int copy_ram_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				-static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				+static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
 
				+static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
 
				+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
 
				+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
			
 
				+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
			
 
				+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				-static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				+static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
 
				+static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
 
				 #endif
			
 
				 
			
 
				 static const struct starpu_data_copy_methods csr_copy_data_methods_s = {
			
@@ -46,6 +49,9 @@ static const struct starpu_data_copy_methods csr_copy_data_methods_s = {
 
				 	.ram_to_cuda = copy_ram_to_cuda,
			
 
				 	.cuda_to_ram = copy_cuda_to_ram,
			
 
				 	.cuda_to_cuda = copy_cuda_to_cuda,
			
 
				+	.ram_to_cuda_async = copy_ram_to_cuda_async,
			
 
				+	.cuda_to_ram_async = copy_cuda_to_ram_async,
			
 
				+	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 	.ram_to_opencl = copy_ram_to_opencl,
			
@@ -57,11 +63,11 @@ static const struct starpu_data_copy_methods csr_copy_data_methods_s = {
 
				 	.spu_to_spu = NULL
			
 
				 };
			
 
				 
			
 
				-static void register_csr_handle(starpu_data_handle handle, uint32_t home_node, void *interface);
			
 
				-static ssize_t allocate_csr_buffer_on_node(void *interface_, uint32_t dst_node);
			
 
				-static void free_csr_buffer_on_node(void *interface, uint32_t node);
			
 
				+static void register_csr_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface);
			
 
				+static ssize_t allocate_csr_buffer_on_node(void *data_interface_, uint32_t dst_node);
			
 
				+static void free_csr_buffer_on_node(void *data_interface, uint32_t node);
			
 
				 static size_t csr_interface_get_size(starpu_data_handle handle);
			
 
				-static int csr_compare(void *interface_a, void *interface_b);
			
 
				+static int csr_compare(void *data_interface_a, void *data_interface_b);
			
 
				 static uint32_t footprint_csr_interface_crc32(starpu_data_handle handle);
			
 
				 
			
 
				 static struct starpu_data_interface_ops_t interface_csr_ops = {
			
@@ -76,9 +82,9 @@ static struct starpu_data_interface_ops_t interface_csr_ops = {
 
				 	.compare = csr_compare
			
 
				 };
			
 
				 
			
 
				-static void register_csr_handle(starpu_data_handle handle, uint32_t home_node, void *interface)
			
 
				+static void register_csr_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface)
			
 
				 {
			
 
				-	starpu_csr_interface_t *csr_interface = interface;
			
 
				+	starpu_csr_interface_t *csr_interface = data_interface;
			
 
				 
			
 
				 	unsigned node;
			
 
				 	for (node = 0; node < STARPU_MAXNODES; node++)
			
@@ -108,7 +114,7 @@ static void register_csr_handle(starpu_data_handle handle, uint32_t home_node, v
 
				 void starpu_csr_data_register(starpu_data_handle *handleptr, uint32_t home_node,
			
 
				 		uint32_t nnz, uint32_t nrow, uintptr_t nzval, uint32_t *colind, uint32_t *rowptr, uint32_t firstentry, size_t elemsize)
			
 
				 {
			
 
				-	starpu_csr_interface_t interface = {
			
 
				+	starpu_csr_interface_t csr_interface = {
			
 
				 		.nnz = nnz,
			
 
				 		.nrow = nrow,
			
 
				 		.nzval = nzval,
			
@@ -118,7 +124,7 @@ void starpu_csr_data_register(starpu_data_handle *handleptr, uint32_t home_node,
 
				 		.elemsize = elemsize
			
 
				 	};
			
 
				 
			
 
				-	starpu_data_register(handleptr, home_node, &interface, &interface_csr_ops);
			
 
				+	starpu_data_register(handleptr, home_node, &csr_interface, &interface_csr_ops);
			
 
				 }
			
 
				 
			
 
				 static uint32_t footprint_csr_interface_crc32(starpu_data_handle handle)
			
@@ -126,10 +132,10 @@ static uint32_t footprint_csr_interface_crc32(starpu_data_handle handle)
 
				 	return _starpu_crc32_be(starpu_csr_get_nnz(handle), 0);
			
 
				 }
			
 
				 
			
 
				-static int csr_compare(void *interface_a, void *interface_b)
			
 
				+static int csr_compare(void *data_interface_a, void *data_interface_b)
			
 
				 {
			
 
				-	starpu_csr_interface_t *csr_a = interface_a;
			
 
				-	starpu_csr_interface_t *csr_b = interface_b;
			
 
				+	starpu_csr_interface_t *csr_a = data_interface_a;
			
 
				+	starpu_csr_interface_t *csr_b = data_interface_b;
			
 
				 
			
 
				 	/* Two matricess are considered compatible if they have the same size */
			
 
				 	return ((csr_a->nnz == csr_b->nnz)
			
@@ -140,34 +146,34 @@ static int csr_compare(void *interface_a, void *interface_b)
 
				 /* offer an access to the data parameters */
			
 
				 uint32_t starpu_csr_get_nnz(starpu_data_handle handle)
			
 
				 {
			
 
				-	starpu_csr_interface_t *interface =
			
 
				+	starpu_csr_interface_t *csr_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, 0);
			
 
				 
			
 
				-	return interface->nnz;
			
 
				+	return csr_interface->nnz;
			
 
				 }
			
 
				 
			
 
				 uint32_t starpu_csr_get_nrow(starpu_data_handle handle)
			
 
				 {
			
 
				-	starpu_csr_interface_t *interface =
			
 
				+	starpu_csr_interface_t *csr_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, 0);
			
 
				 
			
 
				-	return interface->nrow;
			
 
				+	return csr_interface->nrow;
			
 
				 }
			
 
				 
			
 
				 uint32_t starpu_csr_get_firstentry(starpu_data_handle handle)
			
 
				 {
			
 
				-	starpu_csr_interface_t *interface =
			
 
				+	starpu_csr_interface_t *csr_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, 0);
			
 
				 
			
 
				-	return interface->firstentry;
			
 
				+	return csr_interface->firstentry;
			
 
				 }
			
 
				 
			
 
				 size_t starpu_csr_get_elemsize(starpu_data_handle handle)
			
 
				 {
			
 
				-	starpu_csr_interface_t *interface =
			
 
				+	starpu_csr_interface_t *csr_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, 0);
			
 
				 
			
 
				-	return interface->elemsize;
			
 
				+	return csr_interface->elemsize;
			
 
				 }
			
 
				 
			
 
				 uintptr_t starpu_csr_get_local_nzval(starpu_data_handle handle)
			
@@ -177,10 +183,10 @@ uintptr_t starpu_csr_get_local_nzval(starpu_data_handle handle)
 
				 
			
 
				 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
			
 
				 
			
 
				-	starpu_csr_interface_t *interface =
			
 
				+	starpu_csr_interface_t *csr_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, node);
			
 
				 
			
 
				-	return interface->nzval;
			
 
				+	return csr_interface->nzval;
			
 
				 }
			
 
				 
			
 
				 uint32_t *starpu_csr_get_local_colind(starpu_data_handle handle)
			
@@ -190,10 +196,10 @@ uint32_t *starpu_csr_get_local_colind(starpu_data_handle handle)
 
				 
			
 
				 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
			
 
				 
			
 
				-	starpu_csr_interface_t *interface =
			
 
				+	starpu_csr_interface_t *csr_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, node);
			
 
				 
			
 
				-	return interface->colind;
			
 
				+	return csr_interface->colind;
			
 
				 }
			
 
				 
			
 
				 uint32_t *starpu_csr_get_local_rowptr(starpu_data_handle handle)
			
@@ -203,10 +209,10 @@ uint32_t *starpu_csr_get_local_rowptr(starpu_data_handle handle)
 
				 
			
 
				 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
			
 
				 
			
 
				-	starpu_csr_interface_t *interface =
			
 
				+	starpu_csr_interface_t *csr_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, node);
			
 
				 
			
 
				-	return interface->rowptr;
			
 
				+	return csr_interface->rowptr;
			
 
				 }
			
 
				 
			
 
				 static size_t csr_interface_get_size(starpu_data_handle handle)
			
@@ -225,18 +231,18 @@ static size_t csr_interface_get_size(starpu_data_handle handle)
 
				 /* memory allocation/deallocation primitives for the BLAS interface */
			
 
				 
			
 
				 /* returns the size of the allocated area */
			
 
				-static ssize_t allocate_csr_buffer_on_node(void *interface_, uint32_t dst_node)
			
 
				+static ssize_t allocate_csr_buffer_on_node(void *data_interface_, uint32_t dst_node)
			
 
				 {
			
 
				 	uintptr_t addr_nzval;
			
 
				 	uint32_t *addr_colind, *addr_rowptr;
			
 
				 	ssize_t allocated_memory;
			
 
				 
			
 
				 	/* we need the 3 arrays to be allocated */
			
 
				-	starpu_csr_interface_t *interface = interface_;
			
 
				+	starpu_csr_interface_t *csr_interface = data_interface_;
			
 
				 
			
 
				-	uint32_t nnz = interface->nnz;
			
 
				-	uint32_t nrow = interface->nrow;
			
 
				-	size_t elemsize = interface->elemsize;
			
 
				+	uint32_t nnz = csr_interface->nnz;
			
 
				+	uint32_t nrow = csr_interface->nrow;
			
 
				+	size_t elemsize = csr_interface->elemsize;
			
 
				 
			
 
				 	starpu_node_kind kind = _starpu_get_node_kind(dst_node);
			
 
				 
			
@@ -301,9 +307,9 @@ static ssize_t allocate_csr_buffer_on_node(void *interface_, uint32_t dst_node)
 
				 		nnz*elemsize + nnz*sizeof(uint32_t) + (nrow+1)*sizeof(uint32_t);
			
 
				 
			
 
				 	/* update the data properly in consequence */
			
 
				-	interface->nzval = addr_nzval;
			
 
				-	interface->colind = addr_colind;
			
 
				-	interface->rowptr = addr_rowptr;
			
 
				+	csr_interface->nzval = addr_nzval;
			
 
				+	csr_interface->colind = addr_colind;
			
 
				+	csr_interface->rowptr = addr_rowptr;
			
 
				 	
			
 
				 	return allocated_memory;
			
 
				 
			
@@ -349,9 +355,9 @@ fail_nzval:
 
				 	return -ENOMEM;
			
 
				 }
			
 
				 
			
 
				-static void free_csr_buffer_on_node(void *interface, uint32_t node)
			
 
				+static void free_csr_buffer_on_node(void *data_interface, uint32_t node)
			
 
				 {
			
 
				-	starpu_csr_interface_t *csr_interface = interface;	
			
 
				+	starpu_csr_interface_t *csr_interface = data_interface;
			
 
				 
			
 
				 	starpu_node_kind kind = _starpu_get_node_kind(node);
			
 
				 	switch(kind) {
			
@@ -380,7 +386,7 @@ static void free_csr_buffer_on_node(void *interface, uint32_t node)
 
				 }
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), enum cudaMemcpyKind kind)
			
 
				+static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
			
 
				 {
			
 
				 	starpu_csr_interface_t *src_csr = src_interface;
			
 
				 	starpu_csr_interface_t *dst_csr = dst_interface;
			
@@ -408,24 +414,208 @@ static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				+static int copy_cuda_common_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind, cudaStream_t stream)
			
 
				+{
			
 
				+	starpu_csr_interface_t *src_csr = src_interface;
			
 
				+	starpu_csr_interface_t *dst_csr = dst_interface;
			
 
				+
			
 
				+	uint32_t nnz = src_csr->nnz;
			
 
				+	uint32_t nrow = src_csr->nrow;
			
 
				+	size_t elemsize = src_csr->elemsize;
			
 
				+
			
 
				+	cudaError_t cures;
			
 
				+
			
 
				+	int synchronous_fallback = 0;
			
 
				+
			
 
				+	cures = cudaMemcpyAsync((char *)dst_csr->nzval, (char *)src_csr->nzval, nnz*elemsize, kind, stream);
			
 
				+	if (cures)
			
 
				+	{
			
 
				+		synchronous_fallback = 1;
			
 
				+		cures = cudaMemcpy((char *)dst_csr->nzval, (char *)src_csr->nzval, nnz*elemsize, kind);
			
 
				+		if (STARPU_UNLIKELY(cures))
			
 
				+			STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+	}
			
 
				+
			
 
				+	if (!synchronous_fallback)
			
 
				+	{
			
 
				+		cures = cudaMemcpyAsync((char *)dst_csr->colind, (char *)src_csr->colind, nnz*sizeof(uint32_t), kind, stream);
			
 
				+	}
			
 
				+
			
 
				+	if (synchronous_fallback || cures != cudaSuccess)
			
 
				+	{
			
 
				+		synchronous_fallback = 1;
			
 
				+		cures = cudaMemcpy((char *)dst_csr->colind, (char *)src_csr->colind, nnz*sizeof(uint32_t), kind);
			
 
				+		if (STARPU_UNLIKELY(cures))
			
 
				+			STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+	}
			
 
				+
			
 
				+	if (!synchronous_fallback)
			
 
				+	{
			
 
				+		cures = cudaMemcpyAsync((char *)dst_csr->rowptr, (char *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t), kind, stream);
			
 
				+	}
			
 
				+
			
 
				+	if (synchronous_fallback || cures != cudaSuccess)
			
 
				+	{
			
 
				+		synchronous_fallback = 1;
			
 
				+		cures = cudaMemcpy((char *)dst_csr->rowptr, (char *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t), kind);
			
 
				+		if (STARPU_UNLIKELY(cures))
			
 
				+			STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+	}
			
 
				+	
			
 
				+	if (synchronous_fallback)
			
 
				+	{
			
 
				+		STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
			
 
				+		return 0;
			
 
				+	}
			
 
				+	else {
			
 
				+		return -EAGAIN;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static int copy_cuda_peer(void *src_interface STARPU_ATTRIBUTE_UNUSED, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface STARPU_ATTRIBUTE_UNUSED, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				+{
			
 
				+#ifdef HAVE_CUDA_MEMCPY_PEER
			
 
				+	starpu_csr_interface_t *src_csr = src_interface;
			
 
				+	starpu_csr_interface_t *dst_csr = dst_interface;
			
 
				+
			
 
				+	uint32_t nnz = src_csr->nnz;
			
 
				+	uint32_t nrow = src_csr->nrow;
			
 
				+	size_t elemsize = src_csr->elemsize;
			
 
				+
			
 
				+	int src_dev = starpu_memory_node_to_devid(src_node);
			
 
				+	int dst_dev = starpu_memory_node_to_devid(dst_node);
			
 
				+
			
 
				+	cudaError_t cures;
			
 
				+
			
 
				+	cures = cudaMemcpyPeer((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize);
			
 
				+	if (STARPU_UNLIKELY(cures))
			
 
				+		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+
			
 
				+	cures = cudaMemcpyPeer((char *)dst_csr->colind, dst_dev, (char *)src_csr->colind, src_dev, nnz*sizeof(uint32_t));
			
 
				+	if (STARPU_UNLIKELY(cures))
			
 
				+		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+
			
 
				+	cures = cudaMemcpyPeer((char *)dst_csr->rowptr, dst_dev, (char *)src_csr->rowptr, src_dev, (nrow+1)*sizeof(uint32_t));
			
 
				+	if (STARPU_UNLIKELY(cures))
			
 
				+		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+
			
 
				+	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
			
 
				+
			
 
				+	return 0;
			
 
				+#else
			
 
				+	STARPU_ABORT();
			
 
				+	return 0;
			
 
				+#endif
			
 
				+}
			
 
				+
			
 
				+static int copy_cuda_peer_async(void *src_interface STARPU_ATTRIBUTE_UNUSED, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
			
 
				+				void *dst_interface STARPU_ATTRIBUTE_UNUSED, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream STARPU_ATTRIBUTE_UNUSED)
			
 
				+{
			
 
				+#ifdef HAVE_CUDA_MEMCPY_PEER
			
 
				+	starpu_csr_interface_t *src_csr = src_interface;
			
 
				+	starpu_csr_interface_t *dst_csr = dst_interface;
			
 
				+
			
 
				+	uint32_t nnz = src_csr->nnz;
			
 
				+	uint32_t nrow = src_csr->nrow;
			
 
				+	size_t elemsize = src_csr->elemsize;
			
 
				+
			
 
				+	cudaError_t cures;
			
 
				+
			
 
				+	int src_dev = starpu_memory_node_to_devid(src_node);
			
 
				+	int dst_dev = starpu_memory_node_to_devid(dst_node);
			
 
				+
			
 
				+	int synchronous_fallback = 0;
			
 
				+
			
 
				+	cures = cudaMemcpyPeerAsync((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize, stream);
			
 
				+	if (cures)
			
 
				+	{
			
 
				+		synchronous_fallback = 1;
			
 
				+		cures = cudaMemcpyPeer((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize);
			
 
				+		if (STARPU_UNLIKELY(cures))
			
 
				+			STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+	}
			
 
				+
			
 
				+	if (!synchronous_fallback)
			
 
				+	{
			
 
				+		cures = cudaMemcpyPeerAsync((char *)dst_csr->colind, dst_dev, (char *)src_csr->colind, src_dev, nnz*sizeof(uint32_t), stream);
			
 
				+	}
			
 
				+
			
 
				+	if (synchronous_fallback || cures != cudaSuccess)
			
 
				+	{
			
 
				+		synchronous_fallback = 1;
			
 
				+		cures = cudaMemcpyPeer((char *)dst_csr->colind, dst_dev, (char *)src_csr->colind, src_dev, nnz*sizeof(uint32_t));
			
 
				+		if (STARPU_UNLIKELY(cures))
			
 
				+			STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+	}
			
 
				+
			
 
				+	if (!synchronous_fallback)
			
 
				+	{
			
 
				+		cures = cudaMemcpyPeerAsync((char *)dst_csr->rowptr, dst_dev, (char *)src_csr->rowptr, src_dev, (nrow+1)*sizeof(uint32_t), stream);
			
 
				+	}
			
 
				+
			
 
				+	if (synchronous_fallback || cures != cudaSuccess)
			
 
				+	{
			
 
				+		synchronous_fallback = 1;
			
 
				+		cures = cudaMemcpyPeer((char *)dst_csr->rowptr, dst_dev, (char *)src_csr->rowptr, src_dev, (nrow+1)*sizeof(uint32_t));
			
 
				+		if (STARPU_UNLIKELY(cures))
			
 
				+			STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+	}
			
 
				+	
			
 
				+	if (synchronous_fallback)
			
 
				+	{
			
 
				+		STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
			
 
				+		return 0;
			
 
				+	}
			
 
				+	else {
			
 
				+		return -EAGAIN;
			
 
				+	}
			
 
				+#else
			
 
				+	/* Illegal without Peer transfers */
			
 
				+	STARPU_ABORT();
			
 
				+	return 0;
			
 
				+#endif
			
 
				+}
			
 
				+
			
 
				+static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
			
 
				 {
			
 
				 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
			
 
				 }
			
 
				 
			
 
				-static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				+static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
			
 
				 {
			
 
				 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
			
 
				 }
			
 
				 
			
 
				-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
			
 
				+{
			
 
				+	if (src_node == dst_node)
			
 
				+		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
			
 
				+	else
			
 
				+		return copy_cuda_peer(src_interface, src_node, dst_interface, dst_node);
			
 
				+}
			
 
				+
			
 
				+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
			
 
				+{
			
 
				+	return copy_cuda_common_async(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, stream);
			
 
				+}
			
 
				+
			
 
				+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
			
 
				+{
			
 
				+	return copy_cuda_common_async(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, stream);
			
 
				+}
			
 
				+
			
 
				+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
			
 
				 {
			
 
				-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
			
 
				+	if (src_node == dst_node)
			
 
				+		return copy_cuda_common_async(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, stream);
			
 
				+	else
			
 
				+		return copy_cuda_peer_async(src_interface, src_node, dst_interface, dst_node, stream);
			
 
				 }
			
 
				+
			
 
				 #endif // STARPU_USE_CUDA
			
 
				 
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				+static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	starpu_csr_interface_t *src_csr = src_interface;
			
 
				 	starpu_csr_interface_t *dst_csr = dst_interface;
			
@@ -453,7 +643,7 @@ static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				+static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	starpu_csr_interface_t *src_csr = src_interface;
			
 
				 	starpu_csr_interface_t *dst_csr = dst_interface;
			
@@ -483,7 +673,7 @@ static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute
 
				 #endif // STARPU_USE_OPENCL
			
 
				 
			
 
				 /* as not all platform easily have a BLAS lib installed ... */
			
 
				-static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				+static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	starpu_csr_interface_t *src_csr = src_interface;
			
 
				 	starpu_csr_interface_t *dst_csr = dst_interface;
			
--- a/src/datawizard/interfaces/data_interface.c
+++ b/src/datawizard/interfaces/data_interface.c
@@ -15,8 +15,79 @@
 
				  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				  */
			
 
				 
			
 
				+#include <stdint.h>
			
 
				+
			
 
				 #include <datawizard/datawizard.h>
			
 
				 #include <core/dependencies/data_concurrency.h>
			
 
				+#include <common/uthash.h>
			
 
				+#include <common/starpu_spinlock.h>
			
 
				+
			
 
				+/* Entry in the `registered_handles' hash table.  */
			
 
				+struct handle_entry
			
 
				+{
			
 
				+	UT_hash_handle hh;
			
 
				+	void *pointer;
			
 
				+	starpu_data_handle handle;
			
 
				+};
			
 
				+
			
 
				+/* Hash table mapping host pointers to data handles.  */
			
 
				+static struct handle_entry *registered_handles;
			
 
				+static starpu_spinlock_t    registered_handles_lock;
			
 
				+
			
 
				+void _starpu_data_interface_init()
			
 
				+{
			
 
				+	_starpu_spin_init(&registered_handles_lock);
			
 
				+}
			
 
				+
			
 
				+void _starpu_data_interface_shutdown()
			
 
				+{
			
 
				+	struct handle_entry *entry, *tmp;
			
 
				+
			
 
				+	_starpu_spin_destroy(&registered_handles_lock);
			
 
				+
			
 
				+	HASH_ITER(hh, registered_handles, entry, tmp) {
			
 
				+		HASH_DEL(registered_handles, entry);
			
 
				+		free(entry);
			
 
				+	}
			
 
				+
			
 
				+	registered_handles = NULL;
			
 
				+}
			
 
				+
			
 
				+/* Register the mapping from PTR to HANDLE.  If PTR is already mapped to
			
 
				+ * some handle, the new mapping shadows the previous one.   */
			
 
				+void _starpu_data_register_ram_pointer(starpu_data_handle handle, void *ptr)
			
 
				+{
			
 
				+	struct handle_entry *entry;
			
 
				+
			
 
				+	entry = malloc(sizeof(*entry));
			
 
				+	STARPU_ASSERT(entry != NULL);
			
 
				+
			
 
				+	entry->pointer = ptr;
			
 
				+	entry->handle = handle;
			
 
				+
			
 
				+	_starpu_spin_lock(&registered_handles_lock);
			
 
				+	HASH_ADD_PTR(registered_handles, pointer, entry);
			
 
				+	_starpu_spin_unlock(&registered_handles_lock);
			
 
				+}
			
 
				+
			
 
				+starpu_data_handle starpu_data_lookup(const void *ptr)
			
 
				+{
			
 
				+	starpu_data_handle result;
			
 
				+
			
 
				+	_starpu_spin_lock(&registered_handles_lock);
			
 
				+	{
			
 
				+		struct handle_entry *entry;
			
 
				+
			
 
				+		HASH_FIND_PTR(registered_handles, &ptr, entry);
			
 
				+		if(STARPU_UNLIKELY(entry == NULL))
			
 
				+			result = NULL;
			
 
				+		else
			
 
				+			result = entry->handle;
			
 
				+	}
			
 
				+	_starpu_spin_unlock(&registered_handles_lock);
			
 
				+
			
 
				+	return result;
			
 
				+}
			
 
				 
			
 
				 /* 
			
 
				  * Start monitoring a piece of data
			
@@ -25,6 +96,8 @@
 
				 static void _starpu_register_new_data(starpu_data_handle handle,
			
 
				 					uint32_t home_node, uint32_t wt_mask)
			
 
				 {
			
 
				+	void *ptr;
			
 
				+
			
 
				 	STARPU_ASSERT(handle);
			
 
				 
			
 
				 	/* initialize the new lock */
			
@@ -42,6 +115,7 @@ static void _starpu_register_new_data(starpu_data_handle handle,
 
				 	handle->sibling_index = 0; /* could be anything for the root */
			
 
				 	handle->depth = 1; /* the tree is just a node yet */
			
 
				         handle->rank = -1; /* invalid until set */
			
 
				+	handle->tag = -1; /* invalid until set */
			
 
				 
			
 
				 	handle->is_not_important = 0;
			
 
				 
			
@@ -113,18 +187,29 @@ static void _starpu_register_new_data(starpu_data_handle handle,
 
				 		replicate->state = STARPU_INVALID;
			
 
				 		replicate->refcnt = 0;
			
 
				 		replicate->handle = handle;
			
 
				-		replicate->requested = 0;
			
 
				-		replicate->request = NULL;
			
 
				+
			
 
				+		for (node = 0; node < STARPU_MAXNODES; node++)
			
 
				+		{
			
 
				+			replicate->requested[node] = 0;
			
 
				+			replicate->request[node] = NULL;
			
 
				+		}
			
 
				+
			
 
				 		replicate->relaxed_coherency = 1;
			
 
				 		replicate->initialized = 0;
			
 
				 		replicate->memory_node = starpu_worker_get_memory_node(worker);
			
 
				 
			
 
				 		/* duplicate  the content of the interface on node 0 */
			
 
				 		memcpy(replicate->data_interface, handle->per_node[0].data_interface, handle->ops->interface_size);
			
 
				-	} 
			
 
				+	}
			
 
				 
			
 
				 	/* now the data is available ! */
			
 
				 	_starpu_spin_unlock(&handle->header_lock);
			
 
				+
			
 
				+	ptr = starpu_handle_to_pointer(handle, 0);
			
 
				+	if (ptr != NULL)
			
 
				+	{
			
 
				+		_starpu_data_register_ram_pointer(handle, ptr);
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				 static starpu_data_handle _starpu_data_handle_allocate(struct starpu_data_interface_ops_t *interface_ops)
			
@@ -169,7 +254,7 @@ static starpu_data_handle _starpu_data_handle_allocate(struct starpu_data_interf
 
				 }
			
 
				 
			
 
				 void starpu_data_register(starpu_data_handle *handleptr, uint32_t home_node,
			
 
				-				void *interface,
			
 
				+				void *data_interface,
			
 
				 				struct starpu_data_interface_ops_t *ops)
			
 
				 {
			
 
				 	starpu_data_handle handle =
			
@@ -180,11 +265,30 @@ void starpu_data_register(starpu_data_handle *handleptr, uint32_t home_node,
 
				 
			
 
				 
			
 
				 	/* fill the interface fields with the appropriate method */
			
 
				-	ops->register_data_handle(handle, home_node, interface);
			
 
				+	ops->register_data_handle(handle, home_node, data_interface);
			
 
				 
			
 
				 	_starpu_register_new_data(handle, home_node, 0);
			
 
				 }
			
 
				 
			
 
				+void *starpu_handle_to_pointer(starpu_data_handle handle, uint32_t node)
			
 
				+{
			
 
				+	/* Check whether the operation is supported and the node has actually
			
 
				+	 * been allocated.  */
			
 
				+	if (handle->ops->handle_to_pointer
			
 
				+	    && starpu_data_test_if_allocated_on_node(handle, node))
			
 
				+	{
			
 
				+		return handle->ops->handle_to_pointer(handle, node);
			
 
				+	}
			
 
				+
			
 
				+	return NULL;
			
 
				+}
			
 
				+
			
 
				+void *starpu_handle_get_local_ptr(starpu_data_handle handle)
			
 
				+{
			
 
				+	return starpu_handle_to_pointer(handle,
			
 
				+					_starpu_get_local_memory_node());
			
 
				+}
			
 
				+
			
 
				 int starpu_data_get_rank(starpu_data_handle handle)
			
 
				 {
			
 
				 	return handle->rank;
			
@@ -196,21 +300,52 @@ int starpu_data_set_rank(starpu_data_handle handle, int rank)
 
				         return 0;
			
 
				 }
			
 
				 
			
 
				+int starpu_data_get_tag(starpu_data_handle handle)
			
 
				+{
			
 
				+	return handle->tag;
			
 
				+}
			
 
				+
			
 
				+int starpu_data_set_tag(starpu_data_handle handle, int tag)
			
 
				+{
			
 
				+        handle->tag = tag;
			
 
				+        return 0;
			
 
				+}
			
 
				+
			
 
				 /* 
			
 
				  * Stop monitoring a piece of data
			
 
				  */
			
 
				 
			
 
				 void _starpu_data_free_interfaces(starpu_data_handle handle)
			
 
				 {
			
 
				+	const void *ram_ptr;
			
 
				 	unsigned node;
			
 
				 	unsigned worker;
			
 
				 	unsigned nworkers = starpu_worker_get_count();
			
 
				 
			
 
				+	ram_ptr = starpu_handle_to_pointer(handle, 0);
			
 
				+
			
 
				 	for (node = 0; node < STARPU_MAXNODES; node++)
			
 
				 		free(handle->per_node[node].data_interface);
			
 
				 
			
 
				 	for (worker = 0; worker < nworkers; worker++)
			
 
				 		free(handle->per_worker[worker].data_interface);
			
 
				+
			
 
				+	if (ram_ptr != NULL)
			
 
				+	{
			
 
				+		/* Remove the PTR -> HANDLE mapping.  If a mapping from PTR
			
 
				+		 * to another handle existed before (e.g., when using
			
 
				+		 * filters), it becomes visible again.  */
			
 
				+		struct handle_entry *entry;
			
 
				+
			
 
				+		_starpu_spin_lock(&registered_handles_lock);
			
 
				+		HASH_FIND_PTR(registered_handles, &ram_ptr, entry);
			
 
				+		STARPU_ASSERT(entry != NULL);
			
 
				+
			
 
				+		HASH_DEL(registered_handles, entry);
			
 
				+		free(entry);
			
 
				+
			
 
				+		_starpu_spin_unlock(&registered_handles_lock);
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				 struct unregister_callback_arg {
			
--- a/src/datawizard/interfaces/data_interface.h
+++ b/src/datawizard/interfaces/data_interface.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009-2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -23,6 +23,14 @@
 
				 
			
 
				 /* Some data interfaces or filters use this interface internally */
			
 
				 extern struct starpu_data_interface_ops_t _starpu_interface_matrix_ops;
			
 
				-void _starpu_data_free_interfaces(starpu_data_handle handle);
			
 
				+void _starpu_data_free_interfaces(starpu_data_handle handle)
			
 
				+	STARPU_ATTRIBUTE_INTERNAL;
			
 
				+
			
 
				+extern void _starpu_data_interface_init(void) STARPU_ATTRIBUTE_INTERNAL;
			
 
				+extern void _starpu_data_interface_shutdown(void) STARPU_ATTRIBUTE_INTERNAL;
			
 
				+
			
 
				+extern void _starpu_data_register_ram_pointer(starpu_data_handle handle,
			
 
				+						void *ptr)
			
 
				+	STARPU_ATTRIBUTE_INTERNAL;
			
 
				 
			
 
				 #endif // __DATA_INTERFACE_H__
			
--- a/src/datawizard/interfaces/matrix_filters.c
+++ b/src/datawizard/interfaces/matrix_filters.c
@@ -2,7 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2010  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -23,7 +23,7 @@
 
				 /*
			
 
				  * an example of a dummy partition function : blocks ...
			
 
				  */
			
 
				-void starpu_block_filter_func(void *father_interface, void *child_interface, __attribute__((unused)) struct starpu_data_filter *f, unsigned id, unsigned nchunks)
			
 
				+void starpu_block_filter_func(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, unsigned id, unsigned nchunks)
			
 
				 {
			
 
				        starpu_matrix_interface_t *matrix_father = father_interface;
			
 
				        starpu_matrix_interface_t *matrix_child = child_interface;
			
@@ -54,7 +54,7 @@ void starpu_block_filter_func(void *father_interface, void *child_interface, __a
 
				 	}
			
 
				 }
			
 
				 
			
 
				-void starpu_vertical_block_filter_func(void *father_interface, void *child_interface, __attribute__((unused)) struct starpu_data_filter *f, unsigned id, unsigned nchunks)
			
 
				+void starpu_vertical_block_filter_func(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, unsigned id, unsigned nchunks)
			
 
				 {
			
 
				         starpu_matrix_interface_t *matrix_father = father_interface;
			
 
				         starpu_matrix_interface_t *matrix_child = child_interface;
			
--- a/src/datawizard/interfaces/matrix_interface.c
+++ b/src/datawizard/interfaces/matrix_interface.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -25,19 +25,20 @@
 
				 #include <starpu_opencl.h>
			
 
				 #include <drivers/opencl/driver_opencl.h>
			
 
				 
			
 
				-static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				+static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				-static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
			
 
				-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
			
 
				+static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				+static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
			
 
				+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
			
 
				+//static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				-static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), void *_event);
			
 
				-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), void *_event);
			
 
				+static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				+static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event);
			
 
				+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event);
			
 
				 #endif
			
 
				 
			
 
				 static const struct starpu_data_copy_methods matrix_copy_data_methods_s = {
			
@@ -49,6 +50,7 @@ static const struct starpu_data_copy_methods matrix_copy_data_methods_s = {
 
				 	.ram_to_cuda_async = copy_ram_to_cuda_async,
			
 
				 	.cuda_to_ram_async = copy_cuda_to_ram_async,
			
 
				 	.cuda_to_cuda = copy_cuda_to_cuda,
			
 
				+//	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 	.ram_to_opencl = copy_ram_to_opencl,
			
@@ -62,20 +64,22 @@ static const struct starpu_data_copy_methods matrix_copy_data_methods_s = {
 
				 	.spu_to_spu = NULL
			
 
				 };
			
 
				 
			
 
				-static void register_matrix_handle(starpu_data_handle handle, uint32_t home_node, void *interface);
			
 
				-static ssize_t allocate_matrix_buffer_on_node(void *interface_, uint32_t dst_node);
			
 
				-static void free_matrix_buffer_on_node(void *interface, uint32_t node);
			
 
				+static void register_matrix_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface);
			
 
				+static void *matrix_handle_to_pointer(starpu_data_handle data_handle, uint32_t node);
			
 
				+static ssize_t allocate_matrix_buffer_on_node(void *data_interface_, uint32_t dst_node);
			
 
				+static void free_matrix_buffer_on_node(void *data_interface, uint32_t node);
			
 
				 static size_t matrix_interface_get_size(starpu_data_handle handle);
			
 
				 static uint32_t footprint_matrix_interface_crc32(starpu_data_handle handle);
			
 
				-static int matrix_compare(void *interface_a, void *interface_b);
			
 
				+static int matrix_compare(void *data_interface_a, void *data_interface_b);
			
 
				 static void display_matrix_interface(starpu_data_handle handle, FILE *f);
			
 
				 #ifdef STARPU_USE_GORDON
			
 
				-static int convert_matrix_to_gordon(void *interface, uint64_t *ptr, gordon_strideSize_t *ss); 
			
 
				+static int convert_matrix_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss); 
			
 
				 #endif
			
 
				 
			
 
				 struct starpu_data_interface_ops_t _starpu_interface_matrix_ops = {
			
 
				 	.register_data_handle = register_matrix_handle,
			
 
				 	.allocate_data_on_node = allocate_matrix_buffer_on_node,
			
 
				+	.handle_to_pointer = matrix_handle_to_pointer,
			
 
				 	.free_data_on_node = free_matrix_buffer_on_node,
			
 
				 	.copy_methods = &matrix_copy_data_methods_s,
			
 
				 	.get_size = matrix_interface_get_size,
			
@@ -90,7 +94,7 @@ struct starpu_data_interface_ops_t _starpu_interface_matrix_ops = {
 
				 };
			
 
				 
			
 
				 #ifdef STARPU_USE_GORDON
			
 
				-static int convert_matrix_to_gordon(void *interface, uint64_t *ptr, gordon_strideSize_t *ss) 
			
 
				+static int convert_matrix_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss) 
			
 
				 {
			
 
				 	size_t elemsize = GET_MATRIX_ELEMSIZE(interface);
			
 
				 	uint32_t nx = STARPU_MATRIX_GET_NX(interface);
			
@@ -107,9 +111,9 @@ static int convert_matrix_to_gordon(void *interface, uint64_t *ptr, gordon_strid
 
				 }
			
 
				 #endif
			
 
				 
			
 
				-static void register_matrix_handle(starpu_data_handle handle, uint32_t home_node, void *interface)
			
 
				+static void register_matrix_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface)
			
 
				 {
			
 
				-	starpu_matrix_interface_t *matrix_interface = interface;
			
 
				+	starpu_matrix_interface_t *matrix_interface = data_interface;
			
 
				 
			
 
				 	unsigned node;
			
 
				 	for (node = 0; node < STARPU_MAXNODES; node++)
			
@@ -136,12 +140,23 @@ static void register_matrix_handle(starpu_data_handle handle, uint32_t home_node
 
				 	}
			
 
				 }
			
 
				 
			
 
				+static void *matrix_handle_to_pointer(starpu_data_handle handle, uint32_t node)
			
 
				+{
			
 
				+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
			
 
				+
			
 
				+	starpu_matrix_interface_t *matrix_interface =
			
 
				+		starpu_data_get_interface_on_node(handle, node);
			
 
				+
			
 
				+	return (void*) matrix_interface->ptr;
			
 
				+}
			
 
				+
			
 
				+
			
 
				 /* declare a new data with the matrix interface */
			
 
				 void starpu_matrix_data_register(starpu_data_handle *handleptr, uint32_t home_node,
			
 
				 			uintptr_t ptr, uint32_t ld, uint32_t nx,
			
 
				 			uint32_t ny, size_t elemsize)
			
 
				 {
			
 
				-	starpu_matrix_interface_t interface = {
			
 
				+	starpu_matrix_interface_t matrix_interface = {
			
 
				 		.ptr = ptr,
			
 
				 		.ld = ld,
			
 
				 		.nx = nx,
			
@@ -151,7 +166,7 @@ void starpu_matrix_data_register(starpu_data_handle *handleptr, uint32_t home_no
 
				                 .offset = 0
			
 
				 	};
			
 
				 
			
 
				-	starpu_data_register(handleptr, home_node, &interface, &_starpu_interface_matrix_ops);
			
 
				+	starpu_data_register(handleptr, home_node, &matrix_interface, &_starpu_interface_matrix_ops);
			
 
				 }
			
 
				 
			
 
				 static uint32_t footprint_matrix_interface_crc32(starpu_data_handle handle)
			
@@ -159,10 +174,10 @@ static uint32_t footprint_matrix_interface_crc32(starpu_data_handle handle)
 
				 	return _starpu_crc32_be(starpu_matrix_get_nx(handle), starpu_matrix_get_ny(handle));
			
 
				 }
			
 
				 
			
 
				-static int matrix_compare(void *interface_a, void *interface_b)
			
 
				+static int matrix_compare(void *data_interface_a, void *data_interface_b)
			
 
				 {
			
 
				-	starpu_matrix_interface_t *matrix_a = interface_a;
			
 
				-	starpu_matrix_interface_t *matrix_b = interface_b;
			
 
				+	starpu_matrix_interface_t *matrix_a = data_interface_a;
			
 
				+	starpu_matrix_interface_t *matrix_b = data_interface_b;
			
 
				 
			
 
				 	/* Two matricess are considered compatible if they have the same size */
			
 
				 	return ((matrix_a->nx == matrix_b->nx)
			
@@ -172,19 +187,19 @@ static int matrix_compare(void *interface_a, void *interface_b)
 
				 
			
 
				 static void display_matrix_interface(starpu_data_handle handle, FILE *f)
			
 
				 {
			
 
				-	starpu_matrix_interface_t *interface =
			
 
				+	starpu_matrix_interface_t *matrix_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, 0);
			
 
				 
			
 
				-	fprintf(f, "%u\t%u\t", interface->nx, interface->ny);
			
 
				+	fprintf(f, "%u\t%u\t", matrix_interface->nx, matrix_interface->ny);
			
 
				 }
			
 
				 
			
 
				 static size_t matrix_interface_get_size(starpu_data_handle handle)
			
 
				 {
			
 
				-	starpu_matrix_interface_t *interface =
			
 
				+	starpu_matrix_interface_t *matrix_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, 0);
			
 
				 
			
 
				 	size_t size;
			
 
				-	size = (size_t)interface->nx*interface->ny*interface->elemsize; 
			
 
				+	size = (size_t)matrix_interface->nx*matrix_interface->ny*matrix_interface->elemsize; 
			
 
				 
			
 
				 	return size;
			
 
				 }
			
@@ -192,18 +207,18 @@ static size_t matrix_interface_get_size(starpu_data_handle handle)
 
				 /* offer an access to the data parameters */
			
 
				 uint32_t starpu_matrix_get_nx(starpu_data_handle handle)
			
 
				 {
			
 
				-	starpu_matrix_interface_t *interface =
			
 
				+	starpu_matrix_interface_t *matrix_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, 0);
			
 
				 
			
 
				-	return interface->nx;
			
 
				+	return matrix_interface->nx;
			
 
				 }
			
 
				 
			
 
				 uint32_t starpu_matrix_get_ny(starpu_data_handle handle)
			
 
				 {
			
 
				-	starpu_matrix_interface_t *interface =
			
 
				+	starpu_matrix_interface_t *matrix_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, 0);
			
 
				 
			
 
				-	return interface->ny;
			
 
				+	return matrix_interface->ny;
			
 
				 }
			
 
				 
			
 
				 uint32_t starpu_matrix_get_local_ld(starpu_data_handle handle)
			
@@ -213,10 +228,10 @@ uint32_t starpu_matrix_get_local_ld(starpu_data_handle handle)
 
				 
			
 
				 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
			
 
				 
			
 
				-	starpu_matrix_interface_t *interface =
			
 
				+	starpu_matrix_interface_t *matrix_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, node);
			
 
				 
			
 
				-	return interface->ld;
			
 
				+	return matrix_interface->ld;
			
 
				 }
			
 
				 
			
 
				 uintptr_t starpu_matrix_get_local_ptr(starpu_data_handle handle)
			
@@ -226,24 +241,24 @@ uintptr_t starpu_matrix_get_local_ptr(starpu_data_handle handle)
 
				 
			
 
				 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
			
 
				 
			
 
				-	starpu_matrix_interface_t *interface =
			
 
				+	starpu_matrix_interface_t *matrix_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, node);
			
 
				 
			
 
				-	return interface->ptr;
			
 
				+	return matrix_interface->ptr;
			
 
				 }
			
 
				 
			
 
				 size_t starpu_matrix_get_elemsize(starpu_data_handle handle)
			
 
				 {
			
 
				-	starpu_matrix_interface_t *interface =
			
 
				+	starpu_matrix_interface_t *matrix_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, 0);
			
 
				 
			
 
				-	return interface->elemsize;
			
 
				+	return matrix_interface->elemsize;
			
 
				 }
			
 
				 
			
 
				 /* memory allocation/deallocation primitives for the matrix interface */
			
 
				 
			
 
				 /* returns the size of the allocated area */
			
 
				-static ssize_t allocate_matrix_buffer_on_node(void *interface_, uint32_t dst_node)
			
 
				+static ssize_t allocate_matrix_buffer_on_node(void *data_interface_, uint32_t dst_node)
			
 
				 {
			
 
				 	uintptr_t addr = 0;
			
 
				 	unsigned fail = 0;
			
@@ -253,12 +268,12 @@ static ssize_t allocate_matrix_buffer_on_node(void *interface_, uint32_t dst_nod
 
				 	cudaError_t status;
			
 
				 #endif
			
 
				 
			
 
				-	starpu_matrix_interface_t *interface = interface_;
			
 
				+	starpu_matrix_interface_t *matrix_interface = data_interface_;
			
 
				 
			
 
				-	uint32_t nx = interface->nx;
			
 
				-	uint32_t ny = interface->ny;
			
 
				+	uint32_t nx = matrix_interface->nx;
			
 
				+	uint32_t ny = matrix_interface->ny;
			
 
				 	uint32_t ld = nx; // by default
			
 
				-	size_t elemsize = interface->elemsize;
			
 
				+	size_t elemsize = matrix_interface->elemsize;
			
 
				 
			
 
				 	starpu_node_kind kind = _starpu_get_node_kind(dst_node);
			
 
				 
			
@@ -306,10 +321,10 @@ static ssize_t allocate_matrix_buffer_on_node(void *interface_, uint32_t dst_nod
 
				 		allocated_memory = (size_t)nx*ny*elemsize;
			
 
				 
			
 
				 		/* update the data properly in consequence */
			
 
				-		interface->ptr = addr;
			
 
				-                interface->dev_handle = addr;
			
 
				-                interface->offset = 0;
			
 
				-		interface->ld = ld;
			
 
				+		matrix_interface->ptr = addr;
			
 
				+                matrix_interface->dev_handle = addr;
			
 
				+                matrix_interface->offset = 0;
			
 
				+		matrix_interface->ld = ld;
			
 
				 	} else {
			
 
				 		/* allocation failed */
			
 
				 		allocated_memory = -ENOMEM;
			
@@ -318,9 +333,9 @@ static ssize_t allocate_matrix_buffer_on_node(void *interface_, uint32_t dst_nod
 
				 	return allocated_memory;
			
 
				 }
			
 
				 
			
 
				-static void free_matrix_buffer_on_node(void *interface, uint32_t node)
			
 
				+static void free_matrix_buffer_on_node(void *data_interface, uint32_t node)
			
 
				 {
			
 
				-	starpu_matrix_interface_t *matrix_interface = interface;
			
 
				+	starpu_matrix_interface_t *matrix_interface = data_interface;
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	cudaError_t status;
			
@@ -350,17 +365,48 @@ static void free_matrix_buffer_on_node(void *interface, uint32_t node)
 
				 }
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), enum cudaMemcpyKind kind)
			
 
				+static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind, int is_async, cudaStream_t stream)
			
 
				 {
			
 
				 	starpu_matrix_interface_t *src_matrix = src_interface;
			
 
				 	starpu_matrix_interface_t *dst_matrix = dst_interface;
			
 
				 
			
 
				 	size_t elemsize = src_matrix->elemsize;
			
 
				-
			
 
				 	cudaError_t cures;
			
 
				-	cures = cudaMemcpy2D((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
			
 
				+
			
 
				+#if 0
			
 
				+
			
 
				+	struct cudaMemcpy3DParms p;
			
 
				+	memset(&p, 0, sizeof(p));
			
 
				+
			
 
				+	p.srcPtr = make_cudaPitchedPtr((char *)src_matrix->ptr, src_matrix->ld * elemsize, src_matrix->ld * src_matrix->ny *elemsize, src_matrix->ny);
			
 
				+	p.dstPtr = make_cudaPitchedPtr((char *)dst_matrix->ptr, dst_matrix->ld * elemsize, dst_matrix->ld * src_matrix->ny *elemsize, dst_matrix->ny);
			
 
				+	p.extent = make_cudaExtent(src_matrix->nx, src_matrix->ny, 1);
			
 
				+	p.kind = kind;
			
 
				+
			
 
				+	if (is_async)
			
 
				+	{
			
 
				+		cures = cudaMemcpy3DAsync(&p, stream);
			
 
				+		if (!cures)
			
 
				+			return -EAGAIN;
			
 
				+	}
			
 
				+
			
 
				+	cures = cudaMemcpy3D(&p);
			
 
				+	if (STARPU_UNLIKELY(cures))
			
 
				+		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+#endif
			
 
				+
			
 
				+	if (is_async)
			
 
				+	{
			
 
				+		cures = cudaMemcpy2DAsync((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
			
 
				 			(char *)src_matrix->ptr, src_matrix->ld*elemsize,
			
 
				-			src_matrix->nx*elemsize, src_matrix->ny, kind);
			
 
				+			src_matrix->nx*elemsize, src_matrix->ny, kind, stream);
			
 
				+		if (!cures)
			
 
				+			return -EAGAIN;
			
 
				+	}
			
 
				+
			
 
				+	cures = cudaMemcpy2D((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
			
 
				+		(char *)src_matrix->ptr, src_matrix->ld*elemsize,
			
 
				+		src_matrix->nx*elemsize, src_matrix->ny, kind);
			
 
				 	if (STARPU_UNLIKELY(cures))
			
 
				 		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 
			
@@ -369,85 +415,130 @@ static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-
			
 
				-static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				-{
			
 
				-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
			
 
				-}
			
 
				-
			
 
				-static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				-{
			
 
				-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				-{
			
 
				-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
			
 
				+/* XXX this is broken : we need to find a way to fix that ! */
			
 
				+#if 0
			
 
				+static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, int is_async, cudaStream_t stream)
			
 
				 {
			
 
				 	starpu_matrix_interface_t *src_matrix = src_interface;
			
 
				 	starpu_matrix_interface_t *dst_matrix = dst_interface;
			
 
				 
			
 
				 	size_t elemsize = src_matrix->elemsize;
			
 
				+	cudaError_t cures;
			
 
				 
			
 
				-	cudaError_t cures;	
			
 
				-	cures = cudaMemcpy2DAsync((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
			
 
				-			(char *)src_matrix->ptr, (size_t)src_matrix->ld*elemsize,
			
 
				-			(size_t)src_matrix->nx*elemsize, src_matrix->ny,
			
 
				-			cudaMemcpyDeviceToHost, stream);
			
 
				-	if (cures)
			
 
				-	{
			
 
				-		cures = cudaMemcpy2D((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
			
 
				-			(char *)src_matrix->ptr, (size_t)src_matrix->ld*elemsize,
			
 
				-			(size_t)src_matrix->nx*elemsize, (size_t)src_matrix->ny,
			
 
				-			cudaMemcpyDeviceToHost);
			
 
				+#if 1
			
 
				+	int src_dev = starpu_memory_node_to_devid(src_node);
			
 
				+	int dst_dev = starpu_memory_node_to_devid(dst_node);
			
 
				 
			
 
				-		if (STARPU_UNLIKELY(cures))
			
 
				-			STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+	struct cudaExtent extent = make_cudaExtent(128, 128, 128);
			
 
				 
			
 
				-		return 0;
			
 
				+	cures = cudaSetDevice(src_dev);
			
 
				+	STARPU_ASSERT(cures == cudaSuccess);
			
 
				+
			
 
				+	struct cudaPitchedPtr mem_device1;
			
 
				+	cures = cudaMalloc3D(&mem_device1, extent);
			
 
				+	if (STARPU_UNLIKELY(cures))
			
 
				+		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+
			
 
				+	cures = cudaSetDevice(dst_dev);
			
 
				+	STARPU_ASSERT(cures == cudaSuccess);
			
 
				+
			
 
				+	struct cudaPitchedPtr mem_device2;
			
 
				+	cures = cudaMalloc3D(&mem_device2, extent);
			
 
				+	if (STARPU_UNLIKELY(cures))
			
 
				+		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+
			
 
				+	struct cudaMemcpy3DPeerParms p;
			
 
				+	memset(&p, 0, sizeof(p));
			
 
				+	p.srcDevice = src_dev;
			
 
				+	p.dstDevice = dst_dev;
			
 
				+	p.srcPtr = mem_device1;
			
 
				+	p.dstPtr = mem_device2;
			
 
				+	p.extent = extent;
			
 
				+
			
 
				+	cures = cudaMemcpy3DPeer(&p);
			
 
				+	if (STARPU_UNLIKELY(cures))
			
 
				+		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+
			
 
				+
			
 
				+//make_cudaPitchedPtr((char *)src_matrix->ptr, src_matrix->ld * elemsize, src_matrix->nx, src_matrix->ny);
			
 
				+//make_cudaPitchedPtr((char *)dst_matrix->ptr, dst_matrix->ld * elemsize, src_matrix->nx, dst_matrix->ny);
			
 
				+//make_cudaExtent(src_matrix->nx, src_matrix->ny, 1);
			
 
				+
			
 
				+//	if (is_async)
			
 
				+//	{
			
 
				+//		cures = cudaMemcpy3DPeerAsync(&p, stream);
			
 
				+//		if (!cures)
			
 
				+//			return -EAGAIN;
			
 
				+//	}
			
 
				+
			
 
				+#else
			
 
				+	/* XXX FIXME !!*/
			
 
				+	STARPU_ASSERT(src_matrix->nx == src_matrix->ld);
			
 
				+	STARPU_ASSERT(dst_matrix->nx == dst_matrix->ld);
			
 
				+
			
 
				+	if (is_async)
			
 
				+	{
			
 
				+		cures = cudaMemcpyPeerAsync((char *)dst_matrix->ptr, dst_dev, (char *)src_matrix->ptr, src_dev, dst_matrix->nx*dst_matrix->ny*elemsize, stream);
			
 
				+		if (!cures)
			
 
				+			return -EAGAIN;
			
 
				 	}
			
 
				 
			
 
				+	cures = cudaMemcpyPeer((char *)dst_matrix->ptr, dst_dev, (char *)src_matrix->ptr, src_dev, dst_matrix->nx*dst_matrix->ny*elemsize);
			
 
				+	if (STARPU_UNLIKELY(cures))
			
 
				+		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+#endif
			
 
				+
			
 
				 	STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
			
 
				 
			
 
				-	return -EAGAIN;
			
 
				+	return 0;
			
 
				 }
			
 
				+#endif
			
 
				 
			
 
				-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
			
 
				+static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				-	starpu_matrix_interface_t *src_matrix = src_interface;
			
 
				-	starpu_matrix_interface_t *dst_matrix = dst_interface;
			
 
				+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, 0, 0);
			
 
				+}
			
 
				 
			
 
				-	size_t elemsize = src_matrix->elemsize;
			
 
				+static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				+{
			
 
				+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, 0, 0);
			
 
				+}
			
 
				 
			
 
				-	cudaError_t cures;
			
 
				-	cures = cudaMemcpy2DAsync((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
			
 
				-				(char *)src_matrix->ptr, src_matrix->ld*elemsize,
			
 
				-				src_matrix->nx*elemsize, src_matrix->ny,
			
 
				-				cudaMemcpyHostToDevice, stream);
			
 
				-	if (cures)
			
 
				+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				+{
			
 
				+	if (src_node == dst_node)
			
 
				+		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, 0, 0);
			
 
				+	else
			
 
				 	{
			
 
				-		cures = cudaMemcpy2D((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
			
 
				-				(char *)src_matrix->ptr, src_matrix->ld*elemsize,
			
 
				-				src_matrix->nx*elemsize, src_matrix->ny, cudaMemcpyHostToDevice);
			
 
				-
			
 
				-		if (STARPU_UNLIKELY(cures))
			
 
				-			STARPU_CUDA_REPORT_ERROR(cures);
			
 
				-
			
 
				+		/* XXX not implemented */
			
 
				+		STARPU_ABORT();
			
 
				 		return 0;
			
 
				 	}
			
 
				+}
			
 
				 
			
 
				-	STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
			
 
				+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
			
 
				+{
			
 
				+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, 1, stream);
			
 
				+}
			
 
				 
			
 
				-	return -EAGAIN;
			
 
				+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
			
 
				+{
			
 
				+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, 1, stream);
			
 
				 }
			
 
				 
			
 
				+#if 0
			
 
				+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
			
 
				+{
			
 
				+	if (src_node == dst_node)
			
 
				+		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, 1, stream);
			
 
				+	else
			
 
				+		return copy_cuda_peer(src_interface, src_node, dst_interface, dst_node, 1, stream);
			
 
				+}
			
 
				+#endif
			
 
				 #endif // STARPU_USE_CUDA
			
 
				 
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), void *_event)
			
 
				+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event)
			
 
				 {
			
 
				 	starpu_matrix_interface_t *src_matrix = src_interface;
			
 
				 	starpu_matrix_interface_t *dst_matrix = dst_interface;
			
@@ -467,7 +558,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node __att
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), void *_event)
			
 
				+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event)
			
 
				 {
			
 
				 	starpu_matrix_interface_t *src_matrix = src_interface;
			
 
				 	starpu_matrix_interface_t *dst_matrix = dst_interface;
			
@@ -488,12 +579,12 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node __att
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				+static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				         return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				 }
			
 
				 
			
 
				-static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				+static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				         return copy_opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				 }
			
@@ -501,7 +592,7 @@ static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute
 
				 #endif
			
 
				 
			
 
				 /* as not all platform easily have a  lib installed ... */
			
 
				-static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				+static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	starpu_matrix_interface_t *src_matrix = src_interface;
			
 
				 	starpu_matrix_interface_t *dst_matrix = dst_interface;
			
--- a/src/datawizard/interfaces/variable_interface.c
+++ b/src/datawizard/interfaces/variable_interface.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -25,20 +25,21 @@
 
				 #include <starpu_opencl.h>
			
 
				 #include <drivers/opencl/driver_opencl.h>
			
 
				 
			
 
				-static int copy_ram_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				+static int copy_ram_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				-static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
			
 
				-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
			
 
				-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				+static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				+static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
			
 
				+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
			
 
				+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
			
 
				+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				-static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)), void *_event);
			
 
				-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)), void *_event);
			
 
				+static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				+static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				+static int copy_opencl_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event);
			
 
				+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event);
			
 
				 #endif
			
 
				 
			
 
				 static const struct starpu_data_copy_methods variable_copy_data_methods_s = {
			
@@ -47,9 +48,10 @@ static const struct starpu_data_copy_methods variable_copy_data_methods_s = {
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.ram_to_cuda = copy_ram_to_cuda,
			
 
				 	.cuda_to_ram = copy_cuda_to_ram,
			
 
				+	.cuda_to_cuda = copy_cuda_to_cuda,
			
 
				 	.ram_to_cuda_async = copy_ram_to_cuda_async,
			
 
				 	.cuda_to_ram_async = copy_cuda_to_ram_async,
			
 
				-	.cuda_to_cuda = copy_cuda_to_cuda,
			
 
				+	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 	.ram_to_opencl = copy_ram_to_opencl,
			
@@ -64,20 +66,22 @@ static const struct starpu_data_copy_methods variable_copy_data_methods_s = {
 
				 	.spu_to_spu = NULL
			
 
				 };
			
 
				 
			
 
				-static void register_variable_handle(starpu_data_handle handle, uint32_t home_node, void *interface);
			
 
				-static ssize_t allocate_variable_buffer_on_node(void *interface_, uint32_t dst_node);
			
 
				-static void free_variable_buffer_on_node(void *interface, uint32_t node);
			
 
				+static void register_variable_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface);
			
 
				+static ssize_t allocate_variable_buffer_on_node(void *data_interface_, uint32_t dst_node);
			
 
				+static void *variable_handle_to_pointer(starpu_data_handle data_handle, uint32_t node);
			
 
				+static void free_variable_buffer_on_node(void *data_interface, uint32_t node);
			
 
				 static size_t variable_interface_get_size(starpu_data_handle handle);
			
 
				 static uint32_t footprint_variable_interface_crc32(starpu_data_handle handle);
			
 
				-static int variable_compare(void *interface_a, void *interface_b);
			
 
				+static int variable_compare(void *data_interface_a, void *data_interface_b);
			
 
				 static void display_variable_interface(starpu_data_handle handle, FILE *f);
			
 
				 #ifdef STARPU_USE_GORDON
			
 
				-static int convert_variable_to_gordon(void *interface, uint64_t *ptr, gordon_strideSize_t *ss); 
			
 
				+static int convert_variable_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss); 
			
 
				 #endif
			
 
				 
			
 
				 static struct starpu_data_interface_ops_t interface_variable_ops = {
			
 
				 	.register_data_handle = register_variable_handle,
			
 
				 	.allocate_data_on_node = allocate_variable_buffer_on_node,
			
 
				+	.handle_to_pointer = variable_handle_to_pointer,
			
 
				 	.free_data_on_node = free_variable_buffer_on_node,
			
 
				 	.copy_methods = &variable_copy_data_methods_s,
			
 
				 	.get_size = variable_interface_get_size,
			
@@ -91,7 +95,14 @@ static struct starpu_data_interface_ops_t interface_variable_ops = {
 
				 	.display = display_variable_interface
			
 
				 };
			
 
				 
			
 
				-static void register_variable_handle(starpu_data_handle handle, uint32_t home_node, void *interface)
			
 
				+static void *variable_handle_to_pointer(starpu_data_handle handle, uint32_t node)
			
 
				+{
			
 
				+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
			
 
				+
			
 
				+	return (void*) STARPU_VARIABLE_GET_PTR(starpu_data_get_interface_on_node(handle, node));
			
 
				+}
			
 
				+
			
 
				+static void register_variable_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface)
			
 
				 {
			
 
				 	unsigned node;
			
 
				 	for (node = 0; node < STARPU_MAXNODES; node++)
			
@@ -100,18 +111,18 @@ static void register_variable_handle(starpu_data_handle handle, uint32_t home_no
 
				 			starpu_data_get_interface_on_node(handle, node);
			
 
				 
			
 
				 		if (node == home_node) {
			
 
				-			local_interface->ptr = STARPU_VARIABLE_GET_PTR(interface);
			
 
				+			local_interface->ptr = STARPU_VARIABLE_GET_PTR(data_interface);
			
 
				 		}
			
 
				 		else {
			
 
				 			local_interface->ptr = 0;
			
 
				 		}
			
 
				 
			
 
				-		local_interface->elemsize = STARPU_VARIABLE_GET_ELEMSIZE(interface);
			
 
				+		local_interface->elemsize = STARPU_VARIABLE_GET_ELEMSIZE(data_interface);
			
 
				 	}
			
 
				 }
			
 
				 
			
 
				 #ifdef STARPU_USE_GORDON
			
 
				-int convert_variable_to_gordon(void *interface, uint64_t *ptr, gordon_strideSize_t *ss) 
			
 
				+int convert_variable_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss) 
			
 
				 {
			
 
				 	*ptr = STARPU_VARIABLE_GET_PTR(interface);
			
 
				 	(*ss).size = STARPU_VARIABLE_GET_ELEMSIZE(interface);
			
@@ -138,10 +149,10 @@ static uint32_t footprint_variable_interface_crc32(starpu_data_handle handle)
 
				 	return _starpu_crc32_be(starpu_variable_get_elemsize(handle), 0);
			
 
				 }
			
 
				 
			
 
				-static int variable_compare(void *interface_a, void *interface_b)
			
 
				+static int variable_compare(void *data_interface_a, void *data_interface_b)
			
 
				 {
			
 
				-	starpu_variable_interface_t *variable_a = interface_a;
			
 
				-	starpu_variable_interface_t *variable_b = interface_b;
			
 
				+	starpu_variable_interface_t *variable_a = data_interface_a;
			
 
				+	starpu_variable_interface_t *variable_b = data_interface_b;
			
 
				 
			
 
				 	/* Two variables are considered compatible if they have the same size */
			
 
				 	return (variable_a->elemsize == variable_b->elemsize);
			
@@ -149,18 +160,18 @@ static int variable_compare(void *interface_a, void *interface_b)
 
				 
			
 
				 static void display_variable_interface(starpu_data_handle handle, FILE *f)
			
 
				 {
			
 
				-	starpu_variable_interface_t *interface =
			
 
				+	starpu_variable_interface_t *variable_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, 0);
			
 
				 
			
 
				-	fprintf(f, "%ld\t", (long)interface->elemsize);
			
 
				+	fprintf(f, "%ld\t", (long)variable_interface->elemsize);
			
 
				 }
			
 
				 
			
 
				 static size_t variable_interface_get_size(starpu_data_handle handle)
			
 
				 {
			
 
				-	starpu_variable_interface_t *interface =
			
 
				+	starpu_variable_interface_t *variable_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, 0);
			
 
				 
			
 
				-	return interface->elemsize;
			
 
				+	return variable_interface->elemsize;
			
 
				 }
			
 
				 
			
 
				 uintptr_t starpu_variable_get_local_ptr(starpu_data_handle handle)
			
@@ -181,15 +192,15 @@ size_t starpu_variable_get_elemsize(starpu_data_handle handle)
 
				 /* memory allocation/deallocation primitives for the variable interface */
			
 
				 
			
 
				 /* returns the size of the allocated area */
			
 
				-static ssize_t allocate_variable_buffer_on_node(void *interface_, uint32_t dst_node)
			
 
				+static ssize_t allocate_variable_buffer_on_node(void *data_interface_, uint32_t dst_node)
			
 
				 {
			
 
				-	starpu_variable_interface_t *interface = interface_;
			
 
				+	starpu_variable_interface_t *variable_interface = data_interface_;
			
 
				 
			
 
				 	unsigned fail = 0;
			
 
				 	uintptr_t addr = 0;
			
 
				 	ssize_t allocated_memory;
			
 
				 
			
 
				-	size_t elemsize = interface->elemsize;
			
 
				+	size_t elemsize = variable_interface->elemsize;
			
 
				 
			
 
				 	starpu_node_kind kind = _starpu_get_node_kind(dst_node);
			
 
				 
			
@@ -239,26 +250,26 @@ static ssize_t allocate_variable_buffer_on_node(void *interface_, uint32_t dst_n
 
				 	allocated_memory = elemsize;
			
 
				 
			
 
				 	/* update the data properly in consequence */
			
 
				-	interface->ptr = addr;
			
 
				+	variable_interface->ptr = addr;
			
 
				 	
			
 
				 	return allocated_memory;
			
 
				 }
			
 
				 
			
 
				-static void free_variable_buffer_on_node(void *interface, uint32_t node)
			
 
				+static void free_variable_buffer_on_node(void *data_interface, uint32_t node)
			
 
				 {
			
 
				 	starpu_node_kind kind = _starpu_get_node_kind(node);
			
 
				 	switch(kind) {
			
 
				 		case STARPU_CPU_RAM:
			
 
				-			free((void*)STARPU_VARIABLE_GET_PTR(interface));
			
 
				+			free((void*)STARPU_VARIABLE_GET_PTR(data_interface));
			
 
				 			break;
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 		case STARPU_CUDA_RAM:
			
 
				-			cudaFree((void*)STARPU_VARIABLE_GET_PTR(interface));
			
 
				+			cudaFree((void*)STARPU_VARIABLE_GET_PTR(data_interface));
			
 
				 			break;
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				                 case STARPU_OPENCL_RAM:
			
 
				-                        clReleaseMemObject((void*)STARPU_VARIABLE_GET_PTR(interface));
			
 
				+                        clReleaseMemObject((void*)STARPU_VARIABLE_GET_PTR(data_interface));
			
 
				                         break;
			
 
				 #endif
			
 
				 		default:
			
@@ -267,8 +278,8 @@ static void free_variable_buffer_on_node(void *interface, uint32_t node)
 
				 }
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__((unused)),
			
 
				-				void *dst_interface, unsigned dst_node __attribute__((unused)), enum cudaMemcpyKind kind)
			
 
				+static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
			
 
				+				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
			
 
				 {
			
 
				 	starpu_variable_interface_t *src_variable = src_interface;
			
 
				 	starpu_variable_interface_t *dst_variable = dst_interface;
			
@@ -285,26 +296,50 @@ static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__
 
				 }
			
 
				 
			
 
				 
			
 
				-static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)),
			
 
				-				void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				+static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
			
 
				+				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
			
 
				 }
			
 
				 
			
 
				-static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)),
			
 
				-				void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				+static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
			
 
				+				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
			
 
				 }
			
 
				 
			
 
				-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)),
			
 
				-				void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
			
 
				+				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
			
 
				+	if (src_node == dst_node)
			
 
				+	{
			
 
				+		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
			
 
				+	}
			
 
				+	else {
			
 
				+#ifdef HAVE_CUDA_MEMCPY_PEER
			
 
				+		int src_dev = starpu_memory_node_to_devid(src_node);
			
 
				+		int dst_dev = starpu_memory_node_to_devid(dst_node);
			
 
				+
			
 
				+		starpu_variable_interface_t *src_variable = src_interface;
			
 
				+		starpu_variable_interface_t *dst_variable = dst_interface;
			
 
				+
			
 
				+		cudaError_t cures;
			
 
				+		cures = cudaMemcpyPeer((char *)dst_variable->ptr, dst_dev, (char *)src_variable->ptr, src_dev, src_variable->elemsize);
			
 
				+		if (STARPU_UNLIKELY(cures))
			
 
				+			STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+
			
 
				+		STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
			
 
				+
			
 
				+#else
			
 
				+		/* This is illegal without support for cudaMemcpyPeer */
			
 
				+		STARPU_ABORT();
			
 
				+#endif
			
 
				+		return 0;
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				-static int copy_cuda_async_common(void *src_interface, unsigned src_node __attribute__((unused)),
			
 
				-					void *dst_interface, unsigned dst_node __attribute__((unused)),
			
 
				+static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
			
 
				+					void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED,
			
 
				 					cudaStream_t stream, enum cudaMemcpyKind kind)
			
 
				 {
			
 
				 	starpu_variable_interface_t *src_variable = src_interface;
			
@@ -329,22 +364,63 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node __attri
 
				 }
			
 
				 
			
 
				 
			
 
				-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)),
			
 
				-					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
			
 
				+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
			
 
				+					void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
			
 
				 {
			
 
				 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
			
 
				 }
			
 
				 
			
 
				-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)),
			
 
				-					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
			
 
				+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
			
 
				+					void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
			
 
				 {
			
 
				 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
			
 
				 }
			
 
				+
			
 
				+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,					void *dst_interface, unsigned dst_node, cudaStream_t stream)
			
 
				+{
			
 
				+	if (src_node == dst_node)
			
 
				+	{
			
 
				+		return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToDevice);
			
 
				+	}
			
 
				+	else {
			
 
				+#ifdef HAVE_CUDA_MEMCPY_PEER
			
 
				+		int src_dev = starpu_memory_node_to_devid(src_node);
			
 
				+		int dst_dev = starpu_memory_node_to_devid(dst_node);
			
 
				+
			
 
				+		starpu_variable_interface_t *src_variable = src_interface;
			
 
				+		starpu_variable_interface_t *dst_variable = dst_interface;
			
 
				+
			
 
				+		size_t length = src_variable->elemsize;
			
 
				+
			
 
				+		cudaError_t cures;
			
 
				+		cures = cudaMemcpyPeerAsync((char *)dst_variable->ptr, dst_dev, (char *)src_variable->ptr, src_dev, length, stream);
			
 
				+		if (cures)
			
 
				+		{
			
 
				+			/* synchronous fallback */
			
 
				+			cures = cudaMemcpyPeer((char *)dst_variable->ptr, dst_dev, (char *)src_variable->ptr, src_dev, length);
			
 
				+			if (STARPU_UNLIKELY(cures))
			
 
				+				STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+
			
 
				+			return 0;
			
 
				+		}
			
 
				+
			
 
				+		STARPU_TRACE_DATA_COPY(src_node, dst_node, length);
			
 
				+
			
 
				+		return -EAGAIN;
			
 
				+#else
			
 
				+		/* This is illegal without cudaMemcpyPeer */
			
 
				+		STARPU_ABORT();
			
 
				+		return 0;
			
 
				+#endif
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+
			
 
				 #endif // STARPU_USE_CUDA
			
 
				 
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface,
			
 
				-                                    unsigned dst_node __attribute__((unused)), void *_event)
			
 
				+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface,
			
 
				+                                    unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event)
			
 
				 {
			
 
				 	starpu_variable_interface_t *src_variable = src_interface;
			
 
				 	starpu_variable_interface_t *dst_variable = dst_interface;
			
@@ -360,7 +436,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node __att
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), void *_event)
			
 
				+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event)
			
 
				 {
			
 
				 	starpu_variable_interface_t *src_variable = src_interface;
			
 
				 	starpu_variable_interface_t *dst_variable = dst_interface;
			
@@ -377,17 +453,17 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node __att
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				+static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				         return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				 }
			
 
				 
			
 
				-static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				+static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				         return copy_opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				 }
			
 
				 
			
 
				-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				+static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	cl_int err;
			
 
				 
			
@@ -412,7 +488,7 @@ static int copy_opencl_to_opencl(void *src_interface, unsigned src_node __attrib
 
				 
			
 
				 #endif
			
 
				 
			
 
				-static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				+static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	starpu_variable_interface_t *src_variable = src_interface;
			
 
				 	starpu_variable_interface_t *dst_variable = dst_interface;
			
--- a/src/datawizard/interfaces/vector_filters.c
+++ b/src/datawizard/interfaces/vector_filters.c
@@ -2,7 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -20,7 +20,7 @@
 
				 #include <common/config.h>
			
 
				 #include <datawizard/filters.h>
			
 
				 
			
 
				-void starpu_block_filter_func_vector(void *father_interface, void *child_interface, __attribute__((unused)) struct starpu_data_filter *f, unsigned id, unsigned nchunks)
			
 
				+void starpu_block_filter_func_vector(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, unsigned id, unsigned nchunks)
			
 
				 {
			
 
				         starpu_vector_interface_t *vector_father = father_interface;
			
 
				         starpu_vector_interface_t *vector_child = child_interface;
			
@@ -47,7 +47,7 @@ void starpu_block_filter_func_vector(void *father_interface, void *child_interfa
 
				 }
			
 
				 
			
 
				 
			
 
				-void starpu_vector_divide_in_2_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, __attribute__((unused)) unsigned nchunks)
			
 
				+void starpu_vector_divide_in_2_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, STARPU_ATTRIBUTE_UNUSED unsigned nchunks)
			
 
				 {
			
 
				         /* there cannot be more than 2 chunks */
			
 
				         STARPU_ASSERT(id < 2);
			
@@ -88,7 +88,7 @@ void starpu_vector_divide_in_2_filter_func(void *father_interface, void *child_i
 
				 }
			
 
				 
			
 
				 
			
 
				-void starpu_vector_list_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, __attribute__((unused)) unsigned nchunks)
			
 
				+void starpu_vector_list_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, STARPU_ATTRIBUTE_UNUSED unsigned nchunks)
			
 
				 {
			
 
				         starpu_vector_interface_t *vector_father = father_interface;
			
 
				         starpu_vector_interface_t *vector_child = child_interface;
			
@@ -107,7 +107,7 @@ void starpu_vector_list_filter_func(void *father_interface, void *child_interfac
 
				 	if (vector_father->ptr) {
			
 
				 	  /* compute the current position */
			
 
				 	  unsigned i;
			
 
				-	  for (i = 0; i <= id; i++) 
			
 
				+	  for (i = 0; i < id; i++) 
			
 
				 	    current_pos += length_tab[i];
			
 
				 	  
			
 
				 	  vector_child->ptr = vector_father->ptr + current_pos*elemsize;
			
--- a/src/datawizard/interfaces/vector_interface.c
+++ b/src/datawizard/interfaces/vector_interface.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2009-2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -25,20 +25,21 @@
 
				 #include <starpu_opencl.h>
			
 
				 #include <drivers/opencl/driver_opencl.h>
			
 
				 
			
 
				-static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node);
			
 
				+static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node);
			
 
				-static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node);
			
 
				-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node, cudaStream_t stream);
			
 
				-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node, cudaStream_t stream);
			
 
				-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
			
 
				+static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
			
 
				+static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
			
 
				+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cudaStream_t stream);
			
 
				+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cudaStream_t stream);
			
 
				+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,					void *dst_interface, unsigned dst_node, cudaStream_t stream);
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node);
			
 
				-static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node);
			
 
				-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node);
			
 
				-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node, void *_event);
			
 
				-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node, void *_event);
			
 
				+static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
			
 
				+static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
			
 
				+static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
			
 
				+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, void *_event);
			
 
				+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, void *_event);
			
 
				 #endif
			
 
				 
			
 
				 static const struct starpu_data_copy_methods vector_copy_data_methods_s = {
			
@@ -50,6 +51,7 @@ static const struct starpu_data_copy_methods vector_copy_data_methods_s = {
 
				 	.ram_to_cuda_async = copy_ram_to_cuda_async,
			
 
				 	.cuda_to_ram_async = copy_cuda_to_ram_async,
			
 
				 	.cuda_to_cuda = copy_cuda_to_cuda,
			
 
				+	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 	.ram_to_opencl = copy_ram_to_opencl,
			
@@ -64,20 +66,22 @@ static const struct starpu_data_copy_methods vector_copy_data_methods_s = {
 
				 	.spu_to_spu = NULL
			
 
				 };
			
 
				 
			
 
				-static void register_vector_handle(starpu_data_handle handle, uint32_t home_node, void *interface);
			
 
				-static ssize_t allocate_vector_buffer_on_node(void *interface_, uint32_t dst_node);
			
 
				-static void free_vector_buffer_on_node(void *interface, uint32_t node);
			
 
				+static void register_vector_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface);
			
 
				+static ssize_t allocate_vector_buffer_on_node(void *data_interface_, uint32_t dst_node);
			
 
				+static void *vector_handle_to_pointer(starpu_data_handle data_handle, uint32_t node);
			
 
				+static void free_vector_buffer_on_node(void *data_interface, uint32_t node);
			
 
				 static size_t vector_interface_get_size(starpu_data_handle handle);
			
 
				 static uint32_t footprint_vector_interface_crc32(starpu_data_handle handle);
			
 
				-static int vector_compare(void *interface_a, void *interface_b);
			
 
				+static int vector_compare(void *data_interface_a, void *data_interface_b);
			
 
				 static void display_vector_interface(starpu_data_handle handle, FILE *f);
			
 
				 #ifdef STARPU_USE_GORDON
			
 
				-static int convert_vector_to_gordon(void *interface, uint64_t *ptr, gordon_strideSize_t *ss); 
			
 
				+static int convert_vector_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss); 
			
 
				 #endif
			
 
				 
			
 
				 static struct starpu_data_interface_ops_t interface_vector_ops = {
			
 
				 	.register_data_handle = register_vector_handle,
			
 
				 	.allocate_data_on_node = allocate_vector_buffer_on_node,
			
 
				+	.handle_to_pointer = vector_handle_to_pointer,
			
 
				 	.free_data_on_node = free_vector_buffer_on_node,
			
 
				 	.copy_methods = &vector_copy_data_methods_s,
			
 
				 	.get_size = vector_interface_get_size,
			
@@ -91,9 +95,19 @@ static struct starpu_data_interface_ops_t interface_vector_ops = {
 
				 	.display = display_vector_interface
			
 
				 };
			
 
				 
			
 
				-static void register_vector_handle(starpu_data_handle handle, uint32_t home_node, void *interface)
			
 
				+static void *vector_handle_to_pointer(starpu_data_handle handle, uint32_t node)
			
 
				 {
			
 
				-	starpu_vector_interface_t *vector_interface = interface;
			
 
				+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
			
 
				+
			
 
				+	starpu_vector_interface_t *vector_interface =
			
 
				+		starpu_data_get_interface_on_node(handle, node);
			
 
				+
			
 
				+	return (void*) vector_interface->ptr;
			
 
				+}
			
 
				+
			
 
				+static void register_vector_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface)
			
 
				+{
			
 
				+	starpu_vector_interface_t *vector_interface = data_interface;
			
 
				 
			
 
				 	unsigned node;
			
 
				 	for (node = 0; node < STARPU_MAXNODES; node++)
			
@@ -118,7 +132,7 @@ static void register_vector_handle(starpu_data_handle handle, uint32_t home_node
 
				 }
			
 
				 
			
 
				 #ifdef STARPU_USE_GORDON
			
 
				-int convert_vector_to_gordon(void *interface, uint64_t *ptr, gordon_strideSize_t *ss) 
			
 
				+int convert_vector_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss) 
			
 
				 {
			
 
				 	starpu_vector_interface_t *vector_interface = interface;
			
 
				 	
			
@@ -150,10 +164,10 @@ static uint32_t footprint_vector_interface_crc32(starpu_data_handle handle)
 
				 	return _starpu_crc32_be(starpu_vector_get_nx(handle), 0);
			
 
				 }
			
 
				 
			
 
				-static int vector_compare(void *interface_a, void *interface_b)
			
 
				+static int vector_compare(void *data_interface_a, void *data_interface_b)
			
 
				 {
			
 
				-	starpu_vector_interface_t *vector_a = interface_a;
			
 
				-	starpu_vector_interface_t *vector_b = interface_b;
			
 
				+	starpu_vector_interface_t *vector_a = data_interface_a;
			
 
				+	starpu_vector_interface_t *vector_b = data_interface_b;
			
 
				 
			
 
				 	/* Two vectors are considered compatible if they have the same size */
			
 
				 	return ((vector_a->nx == vector_b->nx)
			
@@ -162,19 +176,19 @@ static int vector_compare(void *interface_a, void *interface_b)
 
				 
			
 
				 static void display_vector_interface(starpu_data_handle handle, FILE *f)
			
 
				 {
			
 
				-	starpu_vector_interface_t *interface =
			
 
				+	starpu_vector_interface_t *vector_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, 0);
			
 
				 
			
 
				-	fprintf(f, "%u\t", interface->nx);
			
 
				+	fprintf(f, "%u\t", vector_interface->nx);
			
 
				 }
			
 
				 
			
 
				 static size_t vector_interface_get_size(starpu_data_handle handle)
			
 
				 {
			
 
				 	size_t size;
			
 
				-	starpu_vector_interface_t *interface =
			
 
				+	starpu_vector_interface_t *vector_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, 0);
			
 
				 
			
 
				-	size = interface->nx*interface->elemsize;
			
 
				+	size = vector_interface->nx*vector_interface->elemsize;
			
 
				 
			
 
				 	return size;
			
 
				 }
			
@@ -182,10 +196,10 @@ static size_t vector_interface_get_size(starpu_data_handle handle)
 
				 /* offer an access to the data parameters */
			
 
				 uint32_t starpu_vector_get_nx(starpu_data_handle handle)
			
 
				 {
			
 
				-	starpu_vector_interface_t *interface =
			
 
				+	starpu_vector_interface_t *vector_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, 0);
			
 
				 
			
 
				-	return interface->nx;
			
 
				+	return vector_interface->nx;
			
 
				 }
			
 
				 
			
 
				 uintptr_t starpu_vector_get_local_ptr(starpu_data_handle handle)
			
@@ -195,33 +209,33 @@ uintptr_t starpu_vector_get_local_ptr(starpu_data_handle handle)
 
				 
			
 
				 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
			
 
				 
			
 
				-	starpu_vector_interface_t *interface =
			
 
				+	starpu_vector_interface_t *vector_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, node);
			
 
				 
			
 
				-	return interface->ptr;
			
 
				+	return vector_interface->ptr;
			
 
				 }
			
 
				 
			
 
				 size_t starpu_vector_get_elemsize(starpu_data_handle handle)
			
 
				 {
			
 
				-	starpu_vector_interface_t *interface =
			
 
				+	starpu_vector_interface_t *vector_interface =
			
 
				 		starpu_data_get_interface_on_node(handle, 0);
			
 
				 
			
 
				-	return interface->elemsize;
			
 
				+	return vector_interface->elemsize;
			
 
				 }
			
 
				 
			
 
				 /* memory allocation/deallocation primitives for the vector interface */
			
 
				 
			
 
				 /* returns the size of the allocated area */
			
 
				-static ssize_t allocate_vector_buffer_on_node(void *interface_, uint32_t dst_node)
			
 
				+static ssize_t allocate_vector_buffer_on_node(void *data_interface_, uint32_t dst_node)
			
 
				 {
			
 
				-	starpu_vector_interface_t *interface = interface_;
			
 
				+	starpu_vector_interface_t *vector_interface = data_interface_;
			
 
				 
			
 
				 	unsigned fail = 0;
			
 
				 	uintptr_t addr = 0;
			
 
				 	ssize_t allocated_memory;
			
 
				 
			
 
				-	uint32_t nx = interface->nx;
			
 
				-	size_t elemsize = interface->elemsize;
			
 
				+	uint32_t nx = vector_interface->nx;
			
 
				+	size_t elemsize = vector_interface->elemsize;
			
 
				 
			
 
				 	starpu_node_kind kind = _starpu_get_node_kind(dst_node);
			
 
				 
			
@@ -271,16 +285,20 @@ static ssize_t allocate_vector_buffer_on_node(void *interface_, uint32_t dst_nod
 
				 	allocated_memory = nx*elemsize;
			
 
				 
			
 
				 	/* update the data properly in consequence */
			
 
				-	interface->ptr = addr;
			
 
				-        interface->dev_handle = addr;
			
 
				-        interface->offset = 0;
			
 
				+	vector_interface->ptr = addr;
			
 
				+        vector_interface->dev_handle = addr;
			
 
				+        vector_interface->offset = 0;
			
 
				 	
			
 
				 	return allocated_memory;
			
 
				 }
			
 
				 
			
 
				-static void free_vector_buffer_on_node(void *interface, uint32_t node)
			
 
				+static void free_vector_buffer_on_node(void *data_interface, uint32_t node)
			
 
				 {
			
 
				-	starpu_vector_interface_t *vector_interface = interface;
			
 
				+	starpu_vector_interface_t *vector_interface = data_interface;
			
 
				+
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	cudaError_t cures;
			
 
				+#endif
			
 
				 
			
 
				 	starpu_node_kind kind = _starpu_get_node_kind(node);
			
 
				 	switch(kind) {
			
@@ -289,7 +307,8 @@ static void free_vector_buffer_on_node(void *interface, uint32_t node)
 
				 			break;
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 		case STARPU_CUDA_RAM:
			
 
				-			cudaFree((void*)vector_interface->ptr);
			
 
				+			cures = cudaFree((void*)vector_interface->ptr);
			
 
				+			STARPU_ASSERT(cures == cudaSuccess);
			
 
				 			break;
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
@@ -303,13 +322,14 @@ static void free_vector_buffer_on_node(void *interface, uint32_t node)
 
				 }
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__((unused)),
			
 
				-				void *dst_interface, unsigned dst_node __attribute__((unused)), enum cudaMemcpyKind kind)
			
 
				+static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
			
 
				+				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
			
 
				 {
			
 
				 	starpu_vector_interface_t *src_vector = src_interface;
			
 
				 	starpu_vector_interface_t *dst_vector = dst_interface;
			
 
				 
			
 
				 	cudaError_t cures;
			
 
				+
			
 
				 	cures = cudaMemcpy((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, kind);
			
 
				 	if (STARPU_UNLIKELY(cures))
			
 
				 		STARPU_CUDA_REPORT_ERROR(cures);
			
@@ -319,33 +339,80 @@ static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+#ifdef HAVE_CUDA_MEMCPY_PEER
			
 
				+static int copy_cuda_peer_common(void *src_interface, unsigned src_node,
			
 
				+				void *dst_interface, unsigned dst_node,
			
 
				+				int is_async, cudaStream_t stream)
			
 
				+{
			
 
				+	cudaError_t cures;
			
 
				 
			
 
				-static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)),
			
 
				-				void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				+	starpu_vector_interface_t *src_vector = src_interface;
			
 
				+	starpu_vector_interface_t *dst_vector = dst_interface;
			
 
				+
			
 
				+	size_t length = src_vector->nx*src_vector->elemsize;
			
 
				+
			
 
				+	int src_dev = starpu_memory_node_to_devid(src_node);
			
 
				+	int dst_dev = starpu_memory_node_to_devid(dst_node);
			
 
				+
			
 
				+	if (is_async)
			
 
				+	{
			
 
				+		cures = cudaMemcpyPeerAsync((char *)dst_vector->ptr, dst_dev,
			
 
				+						(char *)src_vector->ptr, src_dev,
			
 
				+						length, stream);
			
 
				+		if (!cures)
			
 
				+			return -EAGAIN;
			
 
				+	}
			
 
				+
			
 
				+	cures = cudaMemcpyPeer((char *)dst_vector->ptr, dst_dev,
			
 
				+				(char *)src_vector->ptr, src_dev, length);
			
 
				+	if (STARPU_UNLIKELY(cures))
			
 
				+		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+
			
 
				+	STARPU_TRACE_DATA_COPY(src_node, dst_node, length);
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
			
 
				+				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
			
 
				 }
			
 
				 
			
 
				-static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)),
			
 
				-				void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				+static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
			
 
				+				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
			
 
				 }
			
 
				 
			
 
				-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)),
			
 
				-				void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
			
 
				+				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
			
 
				+	if (src_node == dst_node)
			
 
				+	{
			
 
				+		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
			
 
				+	}
			
 
				+	else {
			
 
				+#ifdef HAVE_CUDA_MEMCPY_PEER
			
 
				+		return copy_cuda_peer_common(src_interface, src_node, dst_interface, dst_node, 0, 0);
			
 
				+#else
			
 
				+		/* This is illegal without cudaMemcpyPeer */
			
 
				+		STARPU_ABORT();
			
 
				+		return 0;
			
 
				+#endif
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				-static int copy_cuda_async_common(void *src_interface, unsigned src_node __attribute__((unused)),
			
 
				-					void *dst_interface, unsigned dst_node __attribute__((unused)),
			
 
				+static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
			
 
				+					void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED,
			
 
				 					cudaStream_t stream, enum cudaMemcpyKind kind)
			
 
				 {
			
 
				 	starpu_vector_interface_t *src_vector = src_interface;
			
 
				 	starpu_vector_interface_t *dst_vector = dst_interface;
			
 
				 
			
 
				 	cudaError_t cures;
			
 
				+
			
 
				 	cures = cudaMemcpyAsync((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, kind, stream);
			
 
				 	if (cures)
			
 
				 	{
			
@@ -362,15 +429,31 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node __attri
 
				 	return -EAGAIN;
			
 
				 }
			
 
				 
			
 
				+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,					void *dst_interface, unsigned dst_node, cudaStream_t stream)
			
 
				+{
			
 
				+	if (src_node == dst_node)
			
 
				+	{
			
 
				+		return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToDevice);
			
 
				+	}
			
 
				+	else {
			
 
				+#ifdef HAVE_CUDA_MEMCPY_PEER
			
 
				+		return copy_cuda_peer_common(src_interface, src_node, dst_interface, dst_node, 1, stream);
			
 
				+#else
			
 
				+		/* This is illegal without cudaMemcpyPeer */
			
 
				+		STARPU_ABORT();
			
 
				+		return 0;
			
 
				+#endif
			
 
				+	}
			
 
				+}
			
 
				 
			
 
				-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)),
			
 
				-					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
			
 
				+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
			
 
				+					void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
			
 
				 {
			
 
				 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
			
 
				 }
			
 
				 
			
 
				-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)),
			
 
				-					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
			
 
				+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
			
 
				+					void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
			
 
				 {
			
 
				 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
			
 
				 }
			
@@ -378,8 +461,8 @@ static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attri
 
				 #endif // STARPU_USE_CUDA
			
 
				 
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node __attribute__((unused)),
			
 
				-                                    void *dst_interface, unsigned dst_node __attribute__((unused)), void *_event)
			
 
				+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
			
 
				+                                    void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event)
			
 
				 {
			
 
				 	starpu_vector_interface_t *src_vector = src_interface;
			
 
				 	starpu_vector_interface_t *dst_vector = dst_interface;
			
@@ -396,8 +479,8 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node __att
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)),
			
 
				-                                    void *dst_interface, unsigned dst_node __attribute__((unused)), void *_event)
			
 
				+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
			
 
				+                                    void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event)
			
 
				 {
			
 
				 	starpu_vector_interface_t *src_vector = src_interface;
			
 
				 	starpu_vector_interface_t *dst_vector = dst_interface;
			
@@ -413,20 +496,20 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node __att
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)),
			
 
				-                              void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				+static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
			
 
				+                              void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				         return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				 }
			
 
				 
			
 
				-static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute__((unused)),
			
 
				-				void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				+static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
			
 
				+				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				         return copy_opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				 }
			
 
				 
			
 
				-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)),
			
 
				-                              void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				+static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
			
 
				+                              void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				         int err;
			
 
				 
			
@@ -450,8 +533,8 @@ static int copy_opencl_to_opencl(void *src_interface, unsigned src_node __attrib
 
				 
			
 
				 #endif // STARPU_USE_OPENCL
			
 
				 
			
 
				-static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__((unused)),
			
 
				-					void *dst_interface, unsigned dst_node __attribute__((unused)))
			
 
				+static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
			
 
				+					void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	starpu_vector_interface_t *src_vector = src_interface;
			
 
				 	starpu_vector_interface_t *dst_vector = dst_interface;
			
--- a/src/datawizard/interfaces/void_interface.c
+++ b/src/datawizard/interfaces/void_interface.c
@@ -1,6 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -38,8 +39,10 @@ static const struct starpu_data_copy_methods void_copy_data_methods_s = {
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.ram_to_cuda = dummy_copy,
			
 
				 	.cuda_to_ram = dummy_copy,
			
 
				+	.cuda_to_cuda = dummy_copy,
			
 
				 	.ram_to_cuda_async = dummy_cuda_copy_async,
			
 
				 	.cuda_to_ram_async = dummy_cuda_copy_async,
			
 
				+	.cuda_to_cuda_async = dummy_cuda_copy_async,
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 	.ram_to_opencl = dummy_copy,
			
@@ -47,19 +50,18 @@ static const struct starpu_data_copy_methods void_copy_data_methods_s = {
 
				         .ram_to_opencl_async = dummy_opencl_copy_async,
			
 
				 	.opencl_to_ram_async = dummy_opencl_copy_async,
			
 
				 #endif
			
 
				-	.cuda_to_cuda = dummy_copy,
			
 
				 	.cuda_to_spu = dummy_copy,
			
 
				 	.spu_to_ram = dummy_copy,
			
 
				 	.spu_to_cuda = dummy_copy,
			
 
				 	.spu_to_spu = dummy_copy
			
 
				 };
			
 
				 
			
 
				-static void register_void_handle(starpu_data_handle handle, uint32_t home_node, void *interface);
			
 
				-static ssize_t allocate_void_buffer_on_node(void *interface_, uint32_t dst_node);
			
 
				-static void free_void_buffer_on_node(void *interface, uint32_t node);
			
 
				+static void register_void_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface);
			
 
				+static ssize_t allocate_void_buffer_on_node(void *data_interface_, uint32_t dst_node);
			
 
				+static void free_void_buffer_on_node(void *data_interface, uint32_t node);
			
 
				 static size_t void_interface_get_size(starpu_data_handle handle);
			
 
				 static uint32_t footprint_void_interface_crc32(starpu_data_handle handle);
			
 
				-static int void_compare(void *interface_a, void *interface_b);
			
 
				+static int void_compare(void *data_interface_a, void *data_interface_b);
			
 
				 static void display_void_interface(starpu_data_handle handle, FILE *f);
			
 
				 
			
 
				 static struct starpu_data_interface_ops_t interface_void_ops = {
			
@@ -75,9 +77,9 @@ static struct starpu_data_interface_ops_t interface_void_ops = {
 
				 	.display = display_void_interface
			
 
				 };
			
 
				 
			
 
				-static void register_void_handle(starpu_data_handle handle __attribute__((unused)),
			
 
				-				uint32_t home_node __attribute__((unused)),
			
 
				-				void *interface __attribute__((unused)))
			
 
				+static void register_void_handle(starpu_data_handle handle STARPU_ATTRIBUTE_UNUSED,
			
 
				+				uint32_t home_node STARPU_ATTRIBUTE_UNUSED,
			
 
				+				void *data_interface STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	/* Since there is no real data to register, we don't do anything */
			
 
				 }
			
@@ -89,25 +91,25 @@ void starpu_void_data_register(starpu_data_handle *handleptr)
 
				 }
			
 
				 
			
 
				 
			
 
				-static uint32_t footprint_void_interface_crc32(starpu_data_handle handle __attribute__((unused)))
			
 
				+static uint32_t footprint_void_interface_crc32(starpu_data_handle handle STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-static int void_compare(void *interface_a __attribute__((unused)),
			
 
				-			void *interface_b __attribute__((unused)))
			
 
				+static int void_compare(void *data_interface_a STARPU_ATTRIBUTE_UNUSED,
			
 
				+			void *data_interface_b STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	/* There is no allocation required, and therefore nothing to cache
			
 
				 	 * anyway. */
			
 
				 	return 1;
			
 
				 }
			
 
				 
			
 
				-static void display_void_interface(starpu_data_handle handle __attribute__((unused)), FILE *f)
			
 
				+static void display_void_interface(starpu_data_handle handle STARPU_ATTRIBUTE_UNUSED, FILE *f)
			
 
				 {
			
 
				 	fprintf(f, "void\t");
			
 
				 }
			
 
				 
			
 
				-static size_t void_interface_get_size(starpu_data_handle handle __attribute__((unused)))
			
 
				+static size_t void_interface_get_size(starpu_data_handle handle STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	return 0;
			
 
				 }
			
@@ -115,32 +117,32 @@ static size_t void_interface_get_size(starpu_data_handle handle __attribute__((u
 
				 /* memory allocation/deallocation primitives for the void interface */
			
 
				 
			
 
				 /* returns the size of the allocated area */
			
 
				-static ssize_t allocate_void_buffer_on_node(void *interface __attribute__((unused)),
			
 
				-					uint32_t dst_node __attribute__((unused)))
			
 
				+static ssize_t allocate_void_buffer_on_node(void *data_interface STARPU_ATTRIBUTE_UNUSED,
			
 
				+					uint32_t dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	/* Successfuly allocated 0 bytes */
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-static void free_void_buffer_on_node(void *interface __attribute__((unused)) ,
			
 
				-					uint32_t node __attribute__((unused)))
			
 
				+static void free_void_buffer_on_node(void *data_interface STARPU_ATTRIBUTE_UNUSED ,
			
 
				+					uint32_t node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	/* There is no buffer actually */
			
 
				 }
			
 
				 
			
 
				-static int dummy_copy(void *src_interface __attribute__((unused)),
			
 
				-			unsigned src_node __attribute__((unused)),
			
 
				-			void *dst_interface __attribute__((unused)),
			
 
				-			unsigned dst_node __attribute__((unused)))
			
 
				+static int dummy_copy(void *src_interface STARPU_ATTRIBUTE_UNUSED,
			
 
				+			unsigned src_node STARPU_ATTRIBUTE_UNUSED,
			
 
				+			void *dst_interface STARPU_ATTRIBUTE_UNUSED,
			
 
				+			unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-static int dummy_cuda_copy_async(void *src_interface __attribute__((unused)),
			
 
				-				unsigned src_node __attribute__((unused)),
			
 
				-				void *dst_interface __attribute__((unused)),
			
 
				-				unsigned dst_node __attribute__((unused)),
			
 
				+static int dummy_cuda_copy_async(void *src_interface STARPU_ATTRIBUTE_UNUSED,
			
 
				+				unsigned src_node STARPU_ATTRIBUTE_UNUSED,
			
 
				+				void *dst_interface STARPU_ATTRIBUTE_UNUSED,
			
 
				+				unsigned dst_node STARPU_ATTRIBUTE_UNUSED,
			
 
				 				cudaStream_t stream __attribute__ ((unused)))
			
 
				 {
			
 
				 	return 0;
			
@@ -148,11 +150,11 @@ static int dummy_cuda_copy_async(void *src_interface __attribute__((unused)),
 
				 #endif // STARPU_USE_CUDA
			
 
				 
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-static int dummy_opencl_copy_async(void *src_interface __attribute__((unused)),
			
 
				-					unsigned src_node __attribute__((unused)),
			
 
				-					void *dst_interface __attribute__((unused)),
			
 
				-					unsigned dst_node __attribute__((unused)),
			
 
				-					void *_event __attribute__((unused)))
			
 
				+static int dummy_opencl_copy_async(void *src_interface STARPU_ATTRIBUTE_UNUSED,
			
 
				+					unsigned src_node STARPU_ATTRIBUTE_UNUSED,
			
 
				+					void *dst_interface STARPU_ATTRIBUTE_UNUSED,
			
 
				+					unsigned dst_node STARPU_ATTRIBUTE_UNUSED,
			
 
				+					void *_event STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	return 0;
			
 
				 }
			
--- a/src/datawizard/memalloc.c
+++ b/src/datawizard/memalloc.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009-2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -134,7 +134,9 @@ static void transfer_subtree_to_node(starpu_data_handle handle, unsigned src_nod
 
				 			src_replicate->state = STARPU_INVALID;
			
 
				 			dst_replicate->state = STARPU_OWNER;
			
 
				 
			
 
				+#ifdef STARPU_DEVEL
			
 
				 #warning we should use requests during memory reclaim
			
 
				+#endif
			
 
				 			/* TODO use request !! */
			
 
				 			src_replicate->refcnt++;
			
 
				 			dst_replicate->refcnt++;
			
@@ -201,7 +203,9 @@ static size_t free_memory_on_node(starpu_mem_chunk_t mc, uint32_t node)
 
				 //	while (_starpu_spin_trylock(&handle->header_lock))
			
 
				 //		_starpu_datawizard_progress(_starpu_get_local_memory_node());
			
 
				 
			
 
				+#ifdef STARPU_DEVEL
			
 
				 #warning can we block here ?
			
 
				+#endif
			
 
				 //	_starpu_spin_lock(&handle->header_lock);
			
 
				 
			
 
				 	if (mc->automatically_allocated && 
			
@@ -210,6 +214,18 @@ static size_t free_memory_on_node(starpu_mem_chunk_t mc, uint32_t node)
 
				 		if (handle && !data_was_deleted)
			
 
				 			STARPU_ASSERT(replicate->allocated);
			
 
				 
			
 
				+#if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER)
			
 
				+		if (_starpu_get_node_kind(node) == STARPU_CUDA_RAM)
			
 
				+		{
			
 
				+			/* To facilitate the design of interface, we set the
			
 
				+			 * proper CUDA device in case it is needed. This avoids
			
 
				+			 * having to set it again in the free method of each
			
 
				+			 * interface. */
			
 
				+			cudaError_t err = cudaSetDevice(starpu_memory_node_to_devid(node));
			
 
				+			STARPU_ASSERT(err == cudaSuccess);
			
 
				+		}
			
 
				+#endif
			
 
				+
			
 
				 		mc->ops->free_data_on_node(mc->chunk_interface, node);
			
 
				 
			
 
				 		if (handle && !data_was_deleted)
			
@@ -379,8 +395,8 @@ static unsigned try_to_reuse_mem_chunk(starpu_mem_chunk_t mc, unsigned node, sta
 
				 	return success;
			
 
				 }
			
 
				 
			
 
				-static int _starpu_data_interface_compare(void *interface_a, struct starpu_data_interface_ops_t *ops_a,
			
 
				-						void *interface_b, struct starpu_data_interface_ops_t *ops_b)
			
 
				+static int _starpu_data_interface_compare(void *data_interface_a, struct starpu_data_interface_ops_t *ops_a,
			
 
				+                                          void *data_interface_b, struct starpu_data_interface_ops_t *ops_b)
			
 
				 {
			
 
				 	if (ops_a->interfaceid != ops_b->interfaceid)
			
 
				 		return -1;
			
@@ -674,6 +690,19 @@ static ssize_t _starpu_allocate_interface(starpu_data_handle handle, struct star
 
				 
			
 
				 		STARPU_TRACE_START_ALLOC(dst_node);
			
 
				 		STARPU_ASSERT(replicate->data_interface);
			
 
				+
			
 
				+#if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER)
			
 
				+		if (_starpu_get_node_kind(dst_node) == STARPU_CUDA_RAM)
			
 
				+		{
			
 
				+			/* To facilitate the design of interface, we set the
			
 
				+			 * proper CUDA device in case it is needed. This avoids
			
 
				+			 * having to set it again in the malloc method of each
			
 
				+			 * interface. */
			
 
				+			cudaError_t err = cudaSetDevice(starpu_memory_node_to_devid(dst_node));
			
 
				+			STARPU_ASSERT(err == cudaSuccess);
			
 
				+		}
			
 
				+#endif
			
 
				+
			
 
				 		allocated_memory = handle->ops->allocate_data_on_node(replicate->data_interface, dst_node);
			
 
				 		STARPU_TRACE_END_ALLOC(dst_node);
			
 
				 
			
@@ -721,6 +750,15 @@ int _starpu_allocate_memory_on_node(starpu_data_handle handle, struct starpu_dat
 
				 	replicate->allocated = 1;
			
 
				 	replicate->automatically_allocated = 1;
			
 
				 
			
 
				+	if (dst_node == 0)
			
 
				+	{
			
 
				+		void *ptr = starpu_handle_to_pointer(handle, 0);
			
 
				+		if (ptr != NULL)
			
 
				+		{
			
 
				+			_starpu_data_register_ram_pointer(handle, ptr);
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
--- a/src/datawizard/memory_nodes.c
+++ b/src/datawizard/memory_nodes.c
@@ -81,12 +81,17 @@ inline starpu_node_kind _starpu_get_node_kind(uint32_t node)
 
				 	return descr.nodes[node];
			
 
				 }
			
 
				 
			
 
				+int starpu_memory_node_to_devid(unsigned node)
			
 
				+{
			
 
				+	return descr.devid[node];
			
 
				+}
			
 
				+
			
 
				 unsigned _starpu_get_memory_nodes_count(void)
			
 
				 {
			
 
				 	return descr.nnodes;
			
 
				 }
			
 
				 
			
 
				-unsigned _starpu_register_memory_node(starpu_node_kind kind)
			
 
				+unsigned _starpu_register_memory_node(starpu_node_kind kind, int devid)
			
 
				 {
			
 
				 	unsigned nnodes;
			
 
				 	/* ATOMIC_ADD returns the new value ... */
			
@@ -95,6 +100,8 @@ unsigned _starpu_register_memory_node(starpu_node_kind kind)
 
				 	descr.nodes[nnodes-1] = kind;
			
 
				 	STARPU_TRACE_NEW_MEM_NODE(nnodes-1);
			
 
				 
			
 
				+	descr.devid[nnodes-1] = devid;
			
 
				+
			
 
				 	/* for now, there is no condition associated to that newly created node */
			
 
				 	descr.condition_count[nnodes-1] = 0;
			
 
				 
			
--- a/src/datawizard/memory_nodes.h
+++ b/src/datawizard/memory_nodes.h
@@ -46,6 +46,9 @@ typedef struct {
 
				 	unsigned nnodes;
			
 
				 	starpu_node_kind nodes[STARPU_MAXNODES];
			
 
				 
			
 
				+	/* Get the device id associated to this node, or -1 if not applicable */
			
 
				+	int devid[STARPU_MAXNODES];
			
 
				+
			
 
				 	// TODO move this 2 lists outside starpu_mem_node_descr
			
 
				 	/* Every worker is associated to a condition variable on which the
			
 
				 	 * worker waits when there is task available. It is possible that
			
@@ -65,11 +68,12 @@ void _starpu_init_memory_nodes(void);
 
				 void _starpu_deinit_memory_nodes(void);
			
 
				 void _starpu_set_local_memory_node_key(unsigned *node);
			
 
				 unsigned _starpu_get_local_memory_node(void);
			
 
				-unsigned _starpu_register_memory_node(starpu_node_kind kind);
			
 
				+unsigned _starpu_register_memory_node(starpu_node_kind kind, int devid);
			
 
				 //void _starpu_memory_node_attach_queue(struct starpu_jobq_s *q, unsigned nodeid);
			
 
				 void _starpu_memory_node_register_condition(pthread_cond_t *cond, pthread_mutex_t *mutex, unsigned memory_node);
			
 
				 
			
 
				 starpu_node_kind _starpu_get_node_kind(uint32_t node);
			
 
				+int starpu_memory_node_to_devid(unsigned node);
			
 
				 unsigned _starpu_get_memory_nodes_count(void);
			
 
				 
			
 
				 starpu_mem_node_descr *_starpu_get_memory_node_description(void);
			
--- a/src/datawizard/sort_data_handles.c
+++ b/src/datawizard/sort_data_handles.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -19,6 +19,7 @@
 
				 #include <common/config.h>
			
 
				 
			
 
				 #include <datawizard/filters.h>
			
 
				+#include <datawizard/sort_data_handles.h>
			
 
				 
			
 
				 /* To avoid deadlocks in case we have multiple tasks accessing the same piece
			
 
				  * of data  (eg. task T1 needs A and B, and T2 needs B and A), we need to lock
			
--- a/src/datawizard/user_interactions.c
+++ b/src/datawizard/user_interactions.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2009-2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -31,7 +31,7 @@ int starpu_data_request_allocation(starpu_data_handle handle, uint32_t node)
 
				 
			
 
				 	STARPU_ASSERT(handle);
			
 
				 
			
 
				-	r = _starpu_create_data_request(handle, NULL, &handle->per_node[node], node, 0, 0, 1);
			
 
				+	r = _starpu_create_data_request(handle, NULL, &handle->per_node[node], node, 0, 0);
			
 
				 
			
 
				 	/* we do not increase the refcnt associated to the request since we are
			
 
				 	 * not waiting for its termination */
			
@@ -125,7 +125,9 @@ int starpu_data_acquire_cb(starpu_data_handle handle,
 
				 	PTHREAD_MUTEX_INIT(&wrapper->lock, NULL);
			
 
				 	wrapper->finished = 0;
			
 
				 
			
 
				+#ifdef STARPU_DEVEL
			
 
				 #warning TODO instead of having the is_prefetch argument, _starpu_fetch_data shoud consider two flags: async and detached
			
 
				+#endif
			
 
				 	_starpu_spin_lock(&handle->header_lock);
			
 
				 	handle->per_node[0].refcnt++;
			
 
				 	_starpu_spin_unlock(&handle->header_lock);
			
@@ -305,6 +307,7 @@ static void _prefetch_data_on_node(void *arg)
 
				 
			
 
				 }
			
 
				 
			
 
				+static
			
 
				 int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle handle, unsigned node, unsigned async, starpu_access_mode mode)
			
 
				 {
			
 
				 	STARPU_ASSERT(handle);
			
@@ -413,7 +416,9 @@ void starpu_data_set_default_sequential_consistency_flag(unsigned flag)
 
				 /* Query the status of the handle on the specified memory node. */
			
 
				 void starpu_data_query_status(starpu_data_handle handle, int memory_node, int *is_allocated, int *is_valid, int *is_requested)
			
 
				 {
			
 
				+#ifdef STARPU_DEVEL
			
 
				 #warning FIXME
			
 
				+#endif
			
 
				 //	_starpu_spin_lock(&handle->header_lock);
			
 
				 
			
 
				 	if (is_allocated)
			
@@ -423,7 +428,21 @@ void starpu_data_query_status(starpu_data_handle handle, int memory_node, int *i
 
				 		*is_valid = (handle->per_node[memory_node].state != STARPU_INVALID);
			
 
				 
			
 
				 	if (is_requested)
			
 
				-		*is_requested = handle->per_node[memory_node].requested;
			
 
				+	{
			
 
				+		int requested = 0;
			
 
				+
			
 
				+		unsigned node;
			
 
				+		for (node = 0; node < STARPU_MAXNODES; node++)
			
 
				+		{
			
 
				+			if (handle->per_node[memory_node].requested[node])
			
 
				+			{
			
 
				+				requested = 1;
			
 
				+				break;
			
 
				+			}
			
 
				+		}
			
 
				+
			
 
				+		*is_requested = requested;
			
 
				+	}
			
 
				 
			
 
				 //	_starpu_spin_unlock(&handle->header_lock);
			
 
				 }
			
--- a/src/datawizard/write_back.c
+++ b/src/datawizard/write_back.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -16,6 +16,7 @@
 
				  */
			
 
				 
			
 
				 #include <datawizard/datawizard.h>
			
 
				+#include <datawizard/write_back.h>
			
 
				 
			
 
				 void _starpu_write_through_data(starpu_data_handle handle, uint32_t requesting_node, 
			
 
				 					   uint32_t write_through_mask)
			
@@ -25,9 +26,6 @@ void _starpu_write_through_data(starpu_data_handle handle, uint32_t requesting_n
 
				 		return;
			
 
				 	}
			
 
				 
			
 
				-	while (_starpu_spin_trylock(&handle->header_lock))
			
 
				-		_starpu_datawizard_progress(requesting_node, 1);
			
 
				-
			
 
				 	/* first commit all changes onto the nodes specified by the mask */
			
 
				 	uint32_t node;
			
 
				 	for (node = 0; node < STARPU_MAXNODES; node++)
			
@@ -36,30 +34,23 @@ void _starpu_write_through_data(starpu_data_handle handle, uint32_t requesting_n
 
				 			/* we need to commit the buffer on that node */
			
 
				 			if (node != requesting_node) 
			
 
				 			{
			
 
				-				uint32_t handling_node =
			
 
				-					_starpu_select_node_to_handle_request(requesting_node, node);
			
 
				+				while (_starpu_spin_trylock(&handle->header_lock))
			
 
				+					_starpu_datawizard_progress(requesting_node, 1);
			
 
				 
			
 
				 				starpu_data_request_t r;
			
 
				+				r = create_request_to_fetch_data(handle, &handle->per_node[node],
			
 
				+								STARPU_R, 0, NULL, NULL);
			
 
				 
			
 
				-				/* check that there is not already a similar
			
 
				-				 * request that we should reuse */
			
 
				-				r = _starpu_search_existing_data_request(&handle->per_node[node], STARPU_R);
			
 
				-				if (!r) {
			
 
				-					/* there was no existing request so we create one now */
			
 
				-					r = _starpu_create_data_request(handle, &handle->per_node[requesting_node],
			
 
				-							&handle->per_node[node], handling_node, STARPU_R, 0, 1);
			
 
				-					_starpu_post_data_request(r, handling_node);
			
 
				-				}
			
 
				-				else {
			
 
				-					/* if there is already a similar request, it is
			
 
				-					 * useless to post another one */
			
 
				-					_starpu_spin_unlock(&r->lock);
			
 
				+			        /* If no request was created, the handle was already up-to-date on the
			
 
				+			         * node */
			
 
				+			        if (r)
			
 
				+				{
			
 
				+				        _starpu_spin_unlock(&handle->header_lock);
			
 
				+        				_starpu_wait_data_request_completion(r, 1);
			
 
				 				}
			
 
				 			}
			
 
				 		}
			
 
				 	}
			
 
				-
			
 
				-	_starpu_spin_unlock(&handle->header_lock);
			
 
				 }
			
 
				 
			
 
				 void starpu_data_set_wt_mask(starpu_data_handle handle, uint32_t wt_mask)
			
--- a/src/debug/structures_size.c
+++ b/src/debug/structures_size.c
@@ -19,19 +19,20 @@
 
				 #include <core/workers.h>
			
 
				 #include <datawizard/coherency.h>
			
 
				 #include <profiling/bound.h>
			
 
				+#include <debug/starpu_debug_helpers.h>
			
 
				 
			
 
				 void _starpu_debug_display_structures_size(void)
			
 
				 {
			
 
				-	fprintf(stderr, "struct starpu_task\t\t%d bytes\t(%x)\n",
			
 
				+	fprintf(stderr, "struct starpu_task\t\t%u bytes\t(%x)\n",
			
 
				 			(unsigned) sizeof(struct starpu_task), (unsigned) sizeof(struct starpu_task));
			
 
				-	fprintf(stderr, "struct starpu_job_s\t\t%d bytes\t(%x)\n",
			
 
				+	fprintf(stderr, "struct starpu_job_s\t\t%u bytes\t(%x)\n",
			
 
				 			(unsigned) sizeof(struct starpu_job_s), (unsigned) sizeof(struct starpu_job_s));
			
 
				-	fprintf(stderr, "struct starpu_data_state_t\t%d bytes\t(%x)\n",
			
 
				+	fprintf(stderr, "struct starpu_data_state_t\t%u bytes\t(%x)\n",
			
 
				 			(unsigned) sizeof(struct starpu_data_state_t), (unsigned) sizeof(struct starpu_data_state_t));
			
 
				-	fprintf(stderr, "struct starpu_tag_s\t\t%d bytes\t(%x)\n",
			
 
				+	fprintf(stderr, "struct starpu_tag_s\t\t%u bytes\t(%x)\n",
			
 
				 			(unsigned) sizeof(struct starpu_tag_s), (unsigned) sizeof(struct starpu_tag_s));
			
 
				-	fprintf(stderr, "struct starpu_cg_s\t\t%d bytes\t(%x)\n",
			
 
				+	fprintf(stderr, "struct starpu_cg_s\t\t%u bytes\t(%x)\n",
			
 
				 			(unsigned) sizeof(struct starpu_cg_s), (unsigned) sizeof(struct starpu_cg_s));
			
 
				-	fprintf(stderr, "struct starpu_worker_s\t\t%d bytes\t(%x)\n",
			
 
				+	fprintf(stderr, "struct starpu_worker_s\t\t%u bytes\t(%x)\n",
			
 
				 			(unsigned) sizeof(struct starpu_worker_s), (unsigned) sizeof(struct starpu_worker_s));
			
 
				 }
			
--- a/src/debug/traces/starpu_fxt.c
+++ b/src/debug/traces/starpu_fxt.c
--- a/src/debug/traces/starpu_fxt.h
+++ b/src/debug/traces/starpu_fxt.h
@@ -0,0 +1,63 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __STARPU_FXT_H__
			
 
				+#define __STARPU_FXT_H__
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include <starpu_config.h>
			
 
				+#include <common/config.h>
			
 
				+
			
 
				+#ifdef STARPU_USE_FXT
			
 
				+
			
 
				+#include <search.h>
			
 
				+
			
 
				+#include <sys/types.h>
			
 
				+#include <sys/stat.h>
			
 
				+#include <fcntl.h>
			
 
				+#include <stdio.h>
			
 
				+#include <stdint.h>
			
 
				+#include <stdlib.h>
			
 
				+
			
 
				+#include <common/fxt.h>
			
 
				+#include <common/list.h>
			
 
				+#include "../mpi/starpu_mpi_fxt.h"
			
 
				+#include <starpu.h>
			
 
				+
			
 
				+#define FACTOR  100
			
 
				+
			
 
				+void starpu_fxt_dag_init(char *dag_filename); /* open dag_filename and start the DOT graph; NULL disables DAG output */
			
 
				+void starpu_fxt_dag_terminate(void); /* close the open cluster, the graph, then the file */
			
 
				+void starpu_fxt_dag_add_tag_deps(uint64_t child, uint64_t father); /* emit a DOT edge from tag `father` to tag `child` */
			
 
				+void starpu_fxt_dag_set_tag_done(uint64_t tag, const char *color); /* emit the filled, unlabeled DOT node of `tag` */
			
 
				+void starpu_fxt_dag_add_task_deps(unsigned long dep_prev, unsigned long dep_succ); /* emit a DOT edge from task dep_prev to task dep_succ */
			
 
				+void starpu_fxt_dag_set_task_done(unsigned long job_id, const char *label, const char *color); /* emit the filled DOT node of task job_id */
			
 
				+void starpu_fxt_dag_add_sync_point(void); /* close the current DOT cluster and open the next one */
			
 
				+
			
 
				+/*
			
 
				+ *	MPI
			
 
				+ */
			
 
				+
			
 
				+int starpu_fxt_mpi_find_sync_point(char *filename_in, uint64_t *offset, int *key, int *rank); /* look for the first FUT_MPI_BARRIER event; 0 if found, -1 otherwise */
			
 
				+void starpu_fxt_mpi_add_send_transfer(int src, int dst, int mpi_tag, size_t size, float date); /* record a send in the list of rank src */
			
 
				+void starpu_fxt_mpi_add_recv_transfer(int src, int dst, int mpi_tag, float date); /* record a receive in the list of rank dst */
			
 
				+void starpu_fxt_display_mpi_transfers(struct starpu_fxt_options *options, int *ranks, FILE *out_paje_file); /* match the sends of every input file and write the Paje links */
			
 
				+
			
 
				+void starpu_fxt_write_paje_header(FILE *file); /* write the Paje event definitions and type declarations to file */
			
 
				+
			
 
				+#endif // STARPU_USE_FXT
			
 
				+
			
 
				+#endif // __STARPU_FXT_H__
			
--- a/src/debug/traces/starpu_fxt_dag.c
+++ b/src/debug/traces/starpu_fxt_dag.c
@@ -0,0 +1,107 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <stdio.h>
			
 
				+#include <stdint.h>
			
 
				+#include <common/config.h>
			
 
				+
			
 
				+#ifdef STARPU_USE_FXT
			
 
				+
			
 
				+#include "starpu_fxt.h"
			
 
				+
			
 
				+static FILE *out_file;
			
 
				+static unsigned cluster_cnt;
			
 
				+
			
 
				+/* Start the DOT output of the task graph.
				+ *
				+ * When out_path is NULL, DAG output is disabled and every other
				+ * starpu_fxt_dag_*() function becomes a no-op. Otherwise the file is
				+ * created (or truncated), the graph preamble is written and the first
				+ * cluster (number 0) is opened. The process exits with status 1 if the
				+ * file cannot be opened. */
				+void starpu_fxt_dag_init(char *out_path)
				+{
				+	FILE *dot;
				+
				+	if (out_path == NULL)
				+	{
				+		out_file = NULL;
				+		return;
				+	}
				+
				+	/* create a new file */
				+	dot = fopen(out_path, "w+");
				+	out_file = dot;
				+	if (dot == NULL)
				+	{
				+		fprintf(stderr,"error while opening %s\n", out_path);
				+		perror("fopen");
				+		exit(1);
				+	}
				+
				+	cluster_cnt = 0;
				+
				+	/* Graph preamble */
				+	fputs("digraph G {\n"
				+	      "\tcolor=white\n"
				+	      "\trankdir=LR;\n", dot);
				+
				+	/* Open the first cluster */
				+	fprintf(dot, "subgraph cluster_%u {\n\tcolor=black;\n", cluster_cnt);
				+}
			
 
				+
			
 
				+/* Finish the DOT output started by starpu_fxt_dag_init(): close the cluster
				+ * that is still open, close the graph, then close the file. Does nothing
				+ * when DAG output is disabled. */
				+void starpu_fxt_dag_terminate(void)
				+{
				+	if (!out_file)
				+		return;
				+
				+	/* Close the last cluster */
				+	fprintf(out_file, "}\n");
				+	/* Close the graph */
				+	fprintf(out_file, "}\n");
				+	fclose(out_file);
				+
				+	/* Forget the closed stream, so that any later starpu_fxt_dag_*() call
				+	 * is a no-op instead of writing through a dangling FILE pointer. */
				+	out_file = NULL;
				+}
			
 
				+
			
 
				+/* Emit the DOT edge going from tag `father` to tag `child` (both printed in
				+ * hexadecimal). Does nothing when DAG output is disabled. */
				+void starpu_fxt_dag_add_tag_deps(uint64_t child, uint64_t father)
				+{
				+	unsigned long long from = (unsigned long long)father;
				+	unsigned long long to = (unsigned long long)child;
				+
				+	if (!out_file)
				+		return;
				+
				+	fprintf(out_file, "\t \"tag_%llx\"->\"tag_%llx\"\n", from, to);
				+}
			
 
				+
			
 
				+/* Emit the DOT edge going from task `dep_prev` to task `dep_succ` (both
				+ * printed in hexadecimal). Does nothing when DAG output is disabled. */
				+void starpu_fxt_dag_add_task_deps(unsigned long dep_prev, unsigned long dep_succ)
				+{
				+	if (!out_file)
				+		return;
				+
				+	fprintf(out_file, "\t \"task_%lx\"->\"task_%lx\"\n", dep_prev, dep_succ);
				+}
			
 
				+
			
 
				+/* Emit the DOT node of tag `tag`: filled, with an empty label and the given
				+ * color. Does nothing when DAG output is disabled. */
				+void starpu_fxt_dag_set_tag_done(uint64_t tag, const char *color)
				+{
				+	unsigned long long tag_id = (unsigned long long)tag;
				+
				+	if (!out_file)
				+		return;
				+
				+	fprintf(out_file, "\t \"tag_%llx\" [ style=filled, label=\"\", color=\"%s\"]\n", tag_id, color);
				+}
			
 
				+
			
 
				+/* Emit the DOT node of task `job_id` (printed in hexadecimal): filled, with
				+ * the given label and color. Does nothing when DAG output is disabled. */
				+void starpu_fxt_dag_set_task_done(unsigned long job_id, const char *label, const char *color)
				+{
				+	if (!out_file)
				+		return;
				+
				+	fprintf(out_file,
				+		"\t \"task_%lx\" [ style=filled, label=\"%s\", color=\"%s\"]\n",
				+		job_id, label, color);
				+}
			
 
				+
			
 
				+/* Close the DOT cluster that is currently open and open the next one, whose
				+ * number is the incremented cluster counter. Does nothing when DAG output is
				+ * disabled. */
				+void starpu_fxt_dag_add_sync_point(void)
				+{
				+	if (out_file)
				+	{
				+		cluster_cnt++;
				+
				+		/* "}" ends the previous cluster, the rest starts the new one */
				+		fprintf(out_file, "}\nsubgraph cluster_%u {\n\tcolor=black;\n", cluster_cnt);
				+	}
				+}
			
 
				+
			
 
				+#endif /* STARPU_USE_FXT */
			
--- a/src/debug/traces/starpu_fxt_mpi.c
+++ b/src/debug/traces/starpu_fxt_mpi.c
@@ -0,0 +1,239 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include <common/config.h>
			
 
				+
			
 
				+#ifdef STARPU_USE_FXT
			
 
				+
			
 
				+#include "starpu_fxt.h"
			
 
				+
			
 
				+struct mpi_transfer { /* one MPI send or receive event recorded from a trace */
			
 
				+	unsigned matched; /* 0 when recorded; set to 1 on a receive once it is paired with a send */
			
 
				+	int other_rank; /* src for a recv, dest for a send */
			
 
				+	int mpi_tag; /* MPI tag of the message; the field compared when pairing a send with a receive */
			
 
				+	size_t size; /* size passed in when a send is recorded */
			
 
				+	float date; /* date passed in when the event is recorded */
			
 
				+};
			
 
				+
			
 
				+/* Scan the FxT trace `filename_in` for its first FUT_MPI_BARRIER event.
				+ *
				+ * Returns 0 if a barrier is found, -1 otherwise. In case of success, *offset
				+ * is filled with the timestamp of the barrier, *rank with the first parameter
				+ * of the event and *key with its third parameter; when no barrier is found
				+ * the three outputs are left untouched. `offset` must not be NULL; `rank` and
				+ * `key` are dereferenced as soon as a barrier is found.
				+ *
				+ * The process exits if the trace cannot be opened or closed. */
				+int starpu_fxt_mpi_find_sync_point(char *filename_in, uint64_t *offset, int *key, int *rank)
				+{
				+	STARPU_ASSERT(offset);
				+
				+	/* Open the trace file */
				+	int fd_in = open(filename_in, O_RDONLY);
				+	if (fd_in < 0) {
				+	        perror("open failed :");
				+	        exit(-1);
				+	}
				+
				+	/* Plain automatic variable: nothing needs to survive the call, and a
				+	 * static handle would make the function non-reentrant for no reason. */
				+	fxt_t fut = fxt_fdopen(fd_in);
				+	if (!fut) {
				+	        perror("fxt_fdopen :");
				+	        exit(-1);
				+	}
				+
				+	fxt_blockev_t block = fxt_blockev_enter(fut);
				+
				+	struct fxt_ev_64 ev;
				+	int func_ret = -1;
				+
				+	/* Walk the events until the first barrier or the end of the trace */
				+	while (func_ret != 0) {
				+		int ret = fxt_next_ev(block, FXT_EV_TYPE_64, (struct fxt_ev *)&ev);
				+		if (ret != FXT_EV_OK) {
				+			fprintf(stderr, "no more block ...\n");
				+			break;
				+		}
				+
				+		if (ev.code == FUT_MPI_BARRIER)
				+		{
				+			/* We found the sync point */
				+			*offset = ev.time;
				+			*rank = ev.param[0];
				+			*key = ev.param[2];
				+			func_ret = 0;
				+		}
				+	}
				+
				+	/* Close the trace file.
				+	 * NOTE(review): only the descriptor is closed here; whether the fxt_t
				+	 * handle needs an explicit release call is not visible from this
				+	 * file -- confirm against the FxT API. */
				+	if (close(fd_in))
				+	{
				+	        perror("close failed :");
				+	        exit(-1);
				+	}
				+
				+	return func_ret;
				+}
			
 
				+
			
 
				+/*
			
 
				+ *	Deal with the actual MPI transfers performed with the MPI lib
			
 
				+ */
			
 
				+
			
 
				+/* Per-rank lists of the MPI transfers found in the different traces: sends are indexed by source rank, receives by destination rank */
			
 
				+static struct mpi_transfer *mpi_sends[64] = {NULL};
			
 
				+static struct mpi_transfer *mpi_recvs[64] = {NULL};
			
 
				+
			
 
				+/* Number of slots currently allocated in each list (grown by doubling) */
			
 
				+unsigned mpi_sends_list_size[64] = {0};
			
 
				+unsigned mpi_recvs_list_size[64] = {0};
			
 
				+
			
 
				+/* Number of slots actually filled in each list */
			
 
				+unsigned mpi_sends_used[64] = {0};
			
 
				+unsigned mpi_recvs_used[64] = {0};
			
 
				+
			
 
				+/* Per-rank cursor over the receives already matched at the beginning of the
			
 
				+ * list. Searches for a matching receive start from this cursor rather than
			
 
				+ * from slot 0, thus avoiding a quadratic complexity. */
			
 
				+unsigned mpi_recvs_matched[64] = {0};
			
 
				+
			
 
				+/* Record a send event in the list of rank `src`.
				+ *
				+ * src:     rank that issued the send; selects the list, must be in [0, 64)
				+ * dst:     destination rank, stored as the transfer's other_rank
				+ * mpi_tag: MPI tag of the message
				+ * size:    size of the message
				+ * date:    date of the event
				+ *
				+ * The list grows by doubling; the process exits if it cannot be grown. */
				+void starpu_fxt_mpi_add_send_transfer(int src, int dst, int mpi_tag, size_t size, float date)
				+{
				+	/* The per-rank tables only have 64 entries */
				+	STARPU_ASSERT(src >= 0 && src < 64);
				+
				+	unsigned slot = mpi_sends_used[src];
				+
				+	if (slot >= mpi_sends_list_size[src])
				+	{
				+		/* Grow geometrically, starting from a single slot */
				+		unsigned new_size = (mpi_sends_list_size[src] > 0) ? 2*mpi_sends_list_size[src] : 1;
				+
				+		/* Keep the old pointer until realloc is known to have succeeded */
				+		struct mpi_transfer *grown = realloc(mpi_sends[src], new_size*sizeof(struct mpi_transfer));
				+		if (!grown)
				+		{
				+			perror("realloc");
				+			exit(-1);
				+		}
				+
				+		mpi_sends[src] = grown;
				+		mpi_sends_list_size[src] = new_size;
				+	}
				+
				+	/* Only account for the slot once it is guaranteed to exist */
				+	mpi_sends_used[src] = slot + 1;
				+
				+	mpi_sends[src][slot].matched = 0;
				+	mpi_sends[src][slot].other_rank = dst;
				+	mpi_sends[src][slot].mpi_tag = mpi_tag;
				+	mpi_sends[src][slot].size = size;
				+	mpi_sends[src][slot].date = date;
				+}
			
 
				+
			
 
				+/* Record a receive event in the list of rank `dst`.
				+ *
				+ * src:     rank the message comes from, stored as the transfer's other_rank
				+ * dst:     rank that performed the receive; selects the list, must be in [0, 64)
				+ * mpi_tag: MPI tag of the message
				+ * date:    date of the event
				+ *
				+ * No size is known for a receive, so the size field is set to 0. The list
				+ * grows by doubling; the process exits if it cannot be grown. */
				+void starpu_fxt_mpi_add_recv_transfer(int src, int dst, int mpi_tag, float date)
				+{
				+	/* The per-rank tables only have 64 entries */
				+	STARPU_ASSERT(dst >= 0 && dst < 64);
				+
				+	unsigned slot = mpi_recvs_used[dst];
				+
				+	if (slot >= mpi_recvs_list_size[dst])
				+	{
				+		/* Grow geometrically, starting from a single slot */
				+		unsigned new_size = (mpi_recvs_list_size[dst] > 0) ? 2*mpi_recvs_list_size[dst] : 1;
				+
				+		/* Keep the old pointer until realloc is known to have succeeded */
				+		struct mpi_transfer *grown = realloc(mpi_recvs[dst], new_size*sizeof(struct mpi_transfer));
				+		if (!grown)
				+		{
				+			perror("realloc");
				+			exit(-1);
				+		}
				+
				+		mpi_recvs[dst] = grown;
				+		mpi_recvs_list_size[dst] = new_size;
				+	}
				+
				+	/* Only account for the slot once it is guaranteed to exist */
				+	mpi_recvs_used[dst] = slot + 1;
				+
				+	mpi_recvs[dst][slot].matched = 0;
				+	/* For a receive, the other end of the transfer is the sender */
				+	mpi_recvs[dst][slot].other_rank = src;
				+	mpi_recvs[dst][slot].mpi_tag = mpi_tag;
				+	/* Do not leave the field uninitialized in realloc'ed memory */
				+	mpi_recvs[dst][slot].size = 0;
				+	mpi_recvs[dst][slot].date = date;
				+}
			
 
				+
			
 
				+/* Find the first not-yet-matched receive recorded for rank `dst` whose MPI
				+ * tag equals `mpi_tag`, mark it as matched and return it. Returns NULL when
				+ * there is no such receive. `src` is not taken into account by the search. */
				+static
				+struct mpi_transfer *try_to_match_send_transfer(int src STARPU_ATTRIBUTE_UNUSED, int dst, int mpi_tag)
				+{
				+	unsigned slot;
				+	unsigned all_previous_were_matched = 1;
				+
				+	/* Start after the prefix of receives already known to be matched */
				+	for (slot = mpi_recvs_matched[dst]; slot < mpi_recvs_used[dst]; slot++)
				+	{
				+		struct mpi_transfer *recv = &mpi_recvs[dst][slot];
				+
				+		if (recv->matched)
				+		{
				+			/* Still inside the fully matched prefix: move the
				+			 * cursor one past this slot, so that it is not
				+			 * examined again by later searches */
				+			if (all_previous_were_matched)
				+				mpi_recvs_matched[dst] = slot + 1;
				+			continue;
				+		}
				+
				+		if (recv->mpi_tag == mpi_tag)
				+		{
				+			/* we found a match ! */
				+			recv->matched = 1;
				+			return recv;
				+		}
				+
				+		/* An unmatched receive with another tag ends the prefix */
				+		all_previous_were_matched = 0;
				+	}
				+
				+	/* If we reached that point, we could not find a match */
				+	return NULL;
				+}
			
 
				+
			
 
				+static unsigned long mpi_com_id = 0;
			
 
				+
			
 
				+/* Go through every send recorded for rank `src` and try to pair it with a
				+ * receive of its destination rank carrying the same MPI tag.
				+ *
				+ * For each pair, a link-start line (code 18, at the date of the send) and a
				+ * link-end line (code 19, at the date of the receive) sharing the same
				+ * "mpicom_<id>" key are written to out_paje_file, if it is not NULL. The
				+ * communication id is consumed even when nothing is written. A warning is
				+ * printed on stderr for every send that cannot be paired. */
				+static void display_all_transfers_from_trace(FILE *out_paje_file, int src)
				+{
				+	unsigned slot;
				+	for (slot = 0; slot < mpi_sends_used[src]; slot++)
				+	{
				+		int dst = mpi_sends[src][slot].other_rank;
				+		int mpi_tag = mpi_sends[src][slot].mpi_tag;
				+		float start_date = mpi_sends[src][slot].date;
				+		size_t size = mpi_sends[src][slot].size;
				+
				+		struct mpi_transfer *match;
				+		match = try_to_match_send_transfer(src, dst, mpi_tag);
				+
				+		if (match)
				+		{
				+			float end_date = match->date;
				+
				+			unsigned long id = mpi_com_id++;
				+			/* TODO replace 0 by a MPI program ? */
				+			if (out_paje_file)
				+			{
				+				/* size_t has no guaranteed relation to long: convert
				+				 * explicitly so that the argument matches %lu */
				+				fprintf(out_paje_file, "18	%f	MPIL	MPIroot   %lu	mpi_%d_p	mpicom_%lu\n", start_date, (unsigned long)size, /* XXX */src, id);
				+				fprintf(out_paje_file, "19	%f	MPIL	MPIroot	  %lu	mpi_%d_p	mpicom_%lu\n", end_date, (unsigned long)size, /* XXX */dst, id);
				+			}
				+		}
				+		else
				+		{
				+			fprintf(stderr, "Warning, could not match MPI transfer from %d to %d (tag %x) starting at %f\n",
				+												src, dst, mpi_tag, start_date);
				+		}
				+
				+	}
				+}
			
 
				+
			
 
				+/* Display the MPI transfers of every input trace: for the i-th input file,
				+ * the sends recorded for rank ranks[i] are matched and written to
				+ * out_paje_file. */
				+void starpu_fxt_display_mpi_transfers(struct starpu_fxt_options *options, int *ranks, FILE *out_paje_file)
				+{
				+	unsigned i = 0;
				+
				+	while (i < options->ninputfiles)
				+	{
				+		display_all_transfers_from_trace(out_paje_file, ranks[i]);
				+		i++;
				+	}
				+}
			
 
				+
			
 
				+#endif // STARPU_USE_FXT
			
--- a/src/debug/traces/starpu_paje.c
+++ b/src/debug/traces/starpu_paje.c
@@ -0,0 +1,157 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "starpu_fxt.h"
			
 
				+#include <common/config.h>
			
 
				+
			
 
				+#ifdef STARPU_USE_FXT
			
 
				+
			
 
				+void starpu_fxt_write_paje_header(FILE *file)
			
 
				+{ /* Writes, in order: the Paje event definitions (codes 1 to 15, 18 and 19), the type and value declarations, then the creation of the MPIroot container */
			
 
				+	fprintf(file, "%%EventDef	PajeDefineContainerType	1\n");
			
 
				+	fprintf(file, "%%	Alias	string\n");
			
 
				+	fprintf(file, "%%	ContainerType	string\n");
			
 
				+	fprintf(file, "%%	Name	string\n");
			
 
				+	fprintf(file, "%%EndEventDef\n");
			
 
				+	fprintf(file, "%%EventDef	PajeDefineEventType	2\n");
			
 
				+	fprintf(file, "%%	Alias	string\n");
			
 
				+	fprintf(file, "%%	ContainerType	string\n");
			
 
				+	fprintf(file, "%%	Name	string\n");
			
 
				+	fprintf(file, "%%EndEventDef\n");
			
 
				+	fprintf(file, "%%EventDef	PajeDefineStateType	3\n");
			
 
				+	fprintf(file, "%%	Alias	string\n");
			
 
				+	fprintf(file, "%%	ContainerType	string\n");
			
 
				+	fprintf(file, "%%	Name	string\n");
			
 
				+	fprintf(file, "%%EndEventDef\n");
			
 
				+	fprintf(file, "%%EventDef	PajeDefineVariableType	4\n");
			
 
				+	fprintf(file, "%%	Alias	string\n");
			
 
				+	fprintf(file, "%%	ContainerType	string\n");
			
 
				+	fprintf(file, "%%	Name	string\n");
			
 
				+	fprintf(file, "%%EndEventDef\n");
			
 
				+	fprintf(file, "%%EventDef	PajeDefineLinkType	5\n");
			
 
				+	fprintf(file, "%%	Alias	string\n");
			
 
				+	fprintf(file, "%%	ContainerType	string\n");
			
 
				+	fprintf(file, "%%	SourceContainerType	string\n");
			
 
				+	fprintf(file, "%%	DestContainerType	string\n");
			
 
				+	fprintf(file, "%%	Name	string\n");
			
 
				+	fprintf(file, "%%EndEventDef\n");
			
 
				+	fprintf(file, "%%EventDef	PajeDefineEntityValue	6\n");
			
 
				+	fprintf(file, "%%	Alias	string\n");
			
 
				+	fprintf(file, "%%	EntityType	string\n");
			
 
				+	fprintf(file, "%%	Name	string\n");
			
 
				+	fprintf(file, "%%	Color	color\n");
			
 
				+	fprintf(file, "%%EndEventDef\n");
			
 
				+	fprintf(file, "%%EventDef	PajeCreateContainer	7\n");
			
 
				+	fprintf(file, "%%	Time	date\n");
			
 
				+	fprintf(file, "%%	Alias	string\n");
			
 
				+	fprintf(file, "%%	Type	string\n");
			
 
				+	fprintf(file, "%%	Container	string\n");
			
 
				+	fprintf(file, "%%	Name	string\n");
			
 
				+	fprintf(file, "%%EndEventDef\n");
			
 
				+	fprintf(file, "%%EventDef	PajeDestroyContainer	8\n");
			
 
				+	fprintf(file, "%%	Time	date\n");
			
 
				+	fprintf(file, "%%	Name	string\n");
			
 
				+	fprintf(file, "%%	Type	string\n");
			
 
				+	fprintf(file, "%%EndEventDef\n");
			
 
				+	fprintf(file, "%%EventDef	PajeNewEvent	9\n");
			
 
				+	fprintf(file, "%%	Time	date\n");
			
 
				+	fprintf(file, "%%	Type	string\n");
			
 
				+	fprintf(file, "%%	Container	string\n");
			
 
				+	fprintf(file, "%%	Value	string\n");
			
 
				+	fprintf(file, "%%EndEventDef\n");
			
 
				+	fprintf(file, "%%EventDef PajeSetState 10\n");
			
 
				+	fprintf(file, "%%	Time	date\n");
			
 
				+	fprintf(file, "%%	Type	string\n");
			
 
				+	fprintf(file, "%%	Container	string\n");
			
 
				+	fprintf(file, "%%	Value	string\n");
			
 
				+	fprintf(file, "%%EndEventDef\n");
			
 
				+	fprintf(file, "%%EventDef	PajePushState	11\n");
			
 
				+	fprintf(file, "%%	Time	date\n");
			
 
				+	fprintf(file, "%%	Type	string\n");
			
 
				+	fprintf(file, "%%	Container	string\n");
			
 
				+	fprintf(file, "%%	Value	string\n");
			
 
				+	fprintf(file, "%%EndEventDef\n");
			
 
				+	fprintf(file, "%%EventDef	PajePopState	12\n");
			
 
				+	fprintf(file, "%%	Time	date\n");
			
 
				+	fprintf(file, "%%	Type	string\n");
			
 
				+	fprintf(file, "%%	Container	string\n");
			
 
				+	fprintf(file, "%%EndEventDef\n");
			
 
				+	fprintf(file, "%%EventDef	PajeSetVariable	13\n");
			
 
				+	fprintf(file, "%%	Time	date\n");
			
 
				+	fprintf(file, "%%	Type	string\n");
			
 
				+	fprintf(file, "%%	Container	string\n");
			
 
				+	fprintf(file, "%%	Value	double\n");
			
 
				+	fprintf(file, "%%EndEventDef\n");
			
 
				+	fprintf(file, "%%EventDef	PajeAddVariable	14\n");
			
 
				+	fprintf(file, "%%	Time	date\n");
			
 
				+	fprintf(file, "%%	Type	string\n");
			
 
				+	fprintf(file, "%%	Container	string\n");
			
 
				+	fprintf(file, "%%	Value	double\n");
			
 
				+	fprintf(file, "%%EndEventDef\n");
			
 
				+	fprintf(file, "%%EventDef	PajeSubVariable	15\n");
			
 
				+	fprintf(file, "%%	Time	date\n");
			
 
				+	fprintf(file, "%%	Type	string\n");
			
 
				+	fprintf(file, "%%	Container	string\n");
			
 
				+	fprintf(file, "%%	Value	double\n");
			
 
				+	fprintf(file, "%%EndEventDef\n");
			
 
				+	fprintf(file, "%%EventDef	PajeStartLink	18\n"); /* 18 is the code display_all_transfers_from_trace() writes at the start of an MPI link line */
			
 
				+	fprintf(file, "%%	Time	date\n");
			
 
				+	fprintf(file, "%%	Type	string\n");
			
 
				+	fprintf(file, "%%	Container	string\n");
			
 
				+	fprintf(file, "%%	Value	string\n");
			
 
				+	fprintf(file, "%%	SourceContainer	string\n");
			
 
				+	fprintf(file, "%%	Key	string\n");
			
 
				+	fprintf(file, "%%EndEventDef\n");
			
 
				+	fprintf(file, "%%EventDef	PajeEndLink	19\n"); /* 19 is the code display_all_transfers_from_trace() writes at the end of an MPI link line */
			
 
				+	fprintf(file, "%%	Time	date\n");
			
 
				+	fprintf(file, "%%	Type	string\n");
			
 
				+	fprintf(file, "%%	Container	string\n");
			
 
				+	fprintf(file, "%%	Value	string\n");
			
 
				+	fprintf(file, "%%	DestContainer	string\n");
			
 
				+	fprintf(file, "%%	Key	string\n");
			
 
				+	fprintf(file, "%%EndEventDef\n");
			
 
				+
			
 
				+	fprintf(file, "                                        \n \
			
 
				+	1       MPIP      0       \"MPI Program\"                      	\n \
			
 
				+	1       P      MPIP       \"Program\"                      	\n \
			
 
				+	1       Mn      P       \"Memory Node\"                         \n \
			
 
				+	1       T      Mn       \"Worker\"                               \n \
			
 
				+	1       Sc       P       \"Scheduler State\"                        \n \
			
 
				+	2       event   T       \"event type\"				\n \
			
 
				+	3       S       T       \"Thread State\"                        \n \
			
 
				+	3       MS       Mn       \"Memory Node State\"                        \n \
			
 
				+	4       ntask    Sc       \"Number of tasks\"                        \n \
			
 
				+	4       bw      Mn       \"Bandwidth\"                        \n \
			
 
				+	6       I       S      Initializing       \"0.0 .7 1.0\"            \n \
			
 
				+	6       D       S      Deinitializing       \"0.0 .1 .7\"            \n \
			
 
				+	6       Fi       S      FetchingInput       \"1.0 .1 1.0\"            \n \
			
 
				+	6       Po       S      PushingOutput       \"0.1 1.0 1.0\"            \n \
			
 
				+	6       C       S       Callback       \".0 .3 .8\"            \n \
			
 
				+	6       B       S       Blocked         \".9 .1 .0\"		\n \
			
 
				+	6       Sl       S      Sleeping         \".9 .1 .0\"		\n \
			
 
				+	6       P       S       Progressing         \".4 .1 .6\"		\n \
			
 
				+	6       A       MS      Allocating         \".4 .1 .0\"		\n \
			
 
				+	6       Ar       MS      AllocatingReuse       \".1 .1 .8\"		\n \
			
 
				+	6       R       MS      Reclaiming         \".0 .1 .4\"		\n \
			
 
				+	6       Co       MS     DriverCopy         \".3 .5 .1\"		\n \
			
 
				+	6       No       MS     Nothing         \".0 .0 .0\"		\n \
			
 
				+	5       MPIL     MPIP	P	P      MPIL\n \
			
 
				+	5       L       P	Mn	Mn      L\n");
			
 
				+
			
 
				+	fprintf(file, "7      0.0 MPIroot      MPIP      0       root\n"); /* code 7 (PajeCreateContainer): creates the MPIroot container of type MPIP at time 0.0 */
			
 
				+}
			
 
				+
			
 
				+#endif
			
--- a/src/drivers/cpu/driver_cpu.c
+++ b/src/drivers/cpu/driver_cpu.c
@@ -2,7 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -18,8 +18,6 @@
 
				 
			
 
				 #include <math.h>
			
 
				 #include <starpu.h>
			
 
				-#include <starpu_profiling.h>
			
 
				-#include <profiling/profiling.h>
			
 
				 #include <drivers/driver_common/driver_common.h>
			
 
				 #include <common/utils.h>
			
 
				 #include <core/debug.h>
			
@@ -40,9 +38,6 @@ static int execute_job_on_cpu(starpu_job_t j, struct starpu_worker_s *cpu_args,
 
				 	STARPU_ASSERT(cl);
			
 
				 	STARPU_ASSERT(cl->cpu_func);
			
 
				 
			
 
				-	if (cl->model && cl->model->benchmarking)
			
 
				-		calibrate_model = 1;
			
 
				-
			
 
				 	if (rank == 0)
			
 
				 	{
			
 
				 		ret = _starpu_fetch_task_input(task, 0);
			
@@ -50,7 +45,6 @@ static int execute_job_on_cpu(starpu_job_t j, struct starpu_worker_s *cpu_args,
 
				 		{
			
 
				 			/* there was not enough memory so the codelet cannot be executed right now ... */
			
 
				 			/* push the codelet back and try another one ... */
			
 
				-			STARPU_ASSERT(ret == 0);
			
 
				 			return -EAGAIN;
			
 
				 		}
			
 
				 	}
			
@@ -58,52 +52,27 @@ static int execute_job_on_cpu(starpu_job_t j, struct starpu_worker_s *cpu_args,
 
				 	if (is_parallel_task)
			
 
				 		PTHREAD_BARRIER_WAIT(&j->before_work_barrier);
			
 
				 
			
 
				-	STARPU_TRACE_START_CODELET_BODY(j);
			
 
				-
			
 
				-	struct starpu_task_profiling_info *profiling_info;
			
 
				-	int profiling = starpu_profiling_status_get();
			
 
				-
			
 
				-	if (rank == 0)
			
 
				-	{
			
 
				-		profiling_info = task->profiling_info;
			
 
				-	
			
 
				-		if ((profiling && profiling_info) || calibrate_model)
			
 
				-		{
			
 
				-			starpu_clock_gettime(&codelet_start);
			
 
				-			_starpu_worker_register_executing_start_date(workerid, &codelet_start);
			
 
				-		}
			
 
				+	_starpu_driver_start_job(cpu_args, j, &codelet_start, rank);
			
 
				 
			
 
				-	}
			
 
				-	
			
 
				-	cpu_args->status = STATUS_EXECUTING;
			
 
				-	task->status = STARPU_TASK_RUNNING;	
			
 
				-	
			
 
				 	/* In case this is a Fork-join parallel task, the worker does not
			
 
				 	 * execute the kernel at all. */
			
 
				 	if ((rank == 0) || (cl->type != STARPU_FORKJOIN))
			
 
				 	{
			
 
				 		cl_func func = cl->cpu_func;
			
 
				-		func(task->interface, task->cl_arg);
			
 
				+		STARPU_ASSERT(func);
			
 
				+		func(task->interfaces, task->cl_arg);
			
 
				 	}
			
 
				-	
			
 
				-	if (is_parallel_task)
			
 
				-		PTHREAD_BARRIER_WAIT(&j->after_work_barrier);
			
 
				 
			
 
				-	STARPU_TRACE_END_CODELET_BODY(j);
			
 
				+	_starpu_driver_end_job(cpu_args, j, &codelet_end, rank);
			
 
				 
			
 
				-	cpu_args->status = STATUS_UNKNOWN;
			
 
				+	if (is_parallel_task)
			
 
				+		PTHREAD_BARRIER_WAIT(&j->after_work_barrier);
			
 
				 
			
 
				 	if (rank == 0)
			
 
				 	{
			
 
				-		cl->per_worker_stats[workerid]++;
			
 
				-		
			
 
				-		if ((profiling && profiling_info) || calibrate_model)
			
 
				-			starpu_clock_gettime(&codelet_end);
			
 
				-
			
 
				-		_starpu_push_task_output(task, 0);
			
 
				-
			
 
				-		_starpu_driver_update_job_feedback(j, cpu_args, profiling_info,
			
 
				+		_starpu_driver_update_job_feedback(j, cpu_args,
			
 
				 				perf_arch, &codelet_start, &codelet_end);
			
 
				+		_starpu_push_task_output(task, 0);
			
 
				 	}
			
 
				 
			
 
				 	return 0;
			
@@ -163,17 +132,6 @@ void *_starpu_cpu_worker(void *arg)
 
				 		{
			
 
				 			PTHREAD_MUTEX_LOCK(sched_mutex);
			
 
				 			if (_starpu_worker_can_block(memnode)){
			
 
				-/* 			struct starpu_sched_ctx **sched_ctx = cpu_arg->sched_ctx; */
			
 
				-/* 			int i = 0; */
			
 
				-/* 			int sleep = 0; */
			
 
				-/* 			for(i = 0; i < cpu_arg->nctxs; i++){ */
			
 
				-/* 			  if(sched_ctx[i]->sched_ctx_id  == 2 ){ */
			
 
				-/* 			    sleep = 1; */
			
 
				-/* 			    break; */
			
 
				-/* 			  } */
			
 
				-/* 			} */
			
 
				-
			
 
				-/* 			if(sleep) */
			
 
				 				_starpu_block_worker(workerid, sched_cond, sched_mutex);
			
 
				 			}
			
 
				 
			
@@ -228,7 +186,7 @@ void *_starpu_cpu_worker(void *arg)
 
				 
			
 
				 		struct starpu_sched_ctx *local_sched_ctx = _starpu_get_sched_ctx(j->task->sched_ctx);
			
 
				 
			
 
				-                res = execute_job_on_cpu(j, cpu_arg, is_parallel_task, rank, perf_arch);
			
 
				+        res = execute_job_on_cpu(j, cpu_arg, is_parallel_task, rank, perf_arch);
			
 
				 
			
 
				 		_starpu_set_current_task(NULL);
			
 
				 
			
@@ -259,4 +217,5 @@ void *_starpu_cpu_worker(void *arg)
 
				 	STARPU_TRACE_WORKER_DEINIT_END(STARPU_FUT_CPU_KEY);
			
 
				 
			
 
				 	pthread_exit(NULL);
			
 
				+	return NULL;
			
 
				 }
			
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -2,7 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -18,7 +18,6 @@
 
				 
			
 
				 #include <starpu.h>
			
 
				 #include <starpu_cuda.h>
			
 
				-#include <starpu_profiling.h>
			
 
				 #include <common/utils.h>
			
 
				 #include <common/config.h>
			
 
				 #include <core/debug.h>
			
@@ -26,7 +25,6 @@
 
				 #include "driver_cuda.h"
			
 
				 #include <core/sched_policy.h>
			
 
				 #include <core/sched_ctx.h>
			
 
				-#include <profiling/profiling.h>
			
 
				 
			
 
				 /* the number of CUDA devices */
			
 
				 static int ncudagpus;
			
@@ -109,9 +107,7 @@ static void init_context(int devid)
 
				 		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 
			
 
				 	/* force CUDA to initialize the context for real */
			
 
				-	cures = cudaFree(0);
			
 
				-	if (STARPU_UNLIKELY(cures))
			
 
				-		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+	cudaFree(0);
			
 
				 
			
 
				 	limit_gpu_mem_if_needed(devid);
			
 
				 
			
@@ -147,7 +143,11 @@ unsigned _starpu_get_cuda_device_count(void)
 
				 	cures = cudaGetDeviceCount(&cnt);
			
 
				 	if (STARPU_UNLIKELY(cures))
			
 
				 		 return 0;
			
 
				-	
			
 
				+
			
 
				+	if (cnt > STARPU_MAXCUDADEVS) {
			
 
				+		fprintf(stderr, "# Warning: %d CUDA devices available. Only %d enabled. Use configure option --enable-maxcudadev=xxx to update the maximum value of supported CUDA devices.\n", cnt, STARPU_MAXCUDADEVS);
			
 
				+		cnt = STARPU_MAXCUDADEVS;
			
 
				+	}
			
 
				 	return (unsigned)cnt;
			
 
				 }
			
 
				 
			
@@ -161,6 +161,7 @@ static int execute_job_on_cuda(starpu_job_t j, struct starpu_worker_s *args)
 
				 {
			
 
				 	int ret;
			
 
				 	uint32_t mask = 0;
			
 
				+	cudaError_t cures;
			
 
				 
			
 
				 	STARPU_ASSERT(j);
			
 
				 	struct starpu_task *task = j->task;
			
@@ -178,9 +179,8 @@ static int execute_job_on_cuda(starpu_job_t j, struct starpu_worker_s *args)
 
				 		calibrate_model = 1;
			
 
				 
			
 
				 	ret = _starpu_fetch_task_input(task, mask);
			
 
				-
			
 
				 	if (ret != 0) {
			
 
				-		/* there was not enough memory, so th input of
			
 
				+		/* there was not enough memory, so the input of
			
 
				 		 * the codelet cannot be fetched ... put the 
			
 
				 		 * codelet back, and try it later */
			
 
				 		return -EAGAIN;
			
@@ -188,44 +188,28 @@ static int execute_job_on_cuda(starpu_job_t j, struct starpu_worker_s *args)
 
				 
			
 
				 	if (calibrate_model)
			
 
				 	{
			
 
				-		cudaError_t cures = cudaStreamSynchronize(starpu_cuda_get_local_transfer_stream());
			
 
				+		cures = cudaStreamSynchronize(starpu_cuda_get_local_transfer_stream());
			
 
				 		if (STARPU_UNLIKELY(cures))
			
 
				 			STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 	}
			
 
				 
			
 
				-	STARPU_TRACE_START_CODELET_BODY(j);
			
 
				+	_starpu_driver_start_job(args, j, &codelet_start, 0);
			
 
				 
			
 
				-	struct starpu_task_profiling_info *profiling_info;
			
 
				-	int profiling = starpu_profiling_status_get();
			
 
				-	profiling_info = task->profiling_info;
			
 
				-
			
 
				-	if ((profiling && profiling_info) || calibrate_model)
			
 
				-	{
			
 
				-		starpu_clock_gettime(&codelet_start);
			
 
				-		_starpu_worker_register_executing_start_date(workerid, &codelet_start);
			
 
				-	}
			
 
				-
			
 
				-	args->status = STATUS_EXECUTING;
			
 
				-	task->status = STARPU_TASK_RUNNING;	
			
 
				+#ifdef HAVE_CUDA_MEMCPY_PEER
			
 
				+	/* We make sure we do manipulate the proper device */
			
 
				+	cures = cudaSetDevice(args->devid);
			
 
				+#endif
			
 
				 
			
 
				 	cl_func func = cl->cuda_func;
			
 
				 	STARPU_ASSERT(func);
			
 
				-	func(task->interface, task->cl_arg);
			
 
				-
			
 
				-	cl->per_worker_stats[workerid]++;
			
 
				+	func(task->interfaces, task->cl_arg);
			
 
				 
			
 
				+	_starpu_driver_end_job(args, j, &codelet_end, 0);
			
 
				 
			
 
				-	if ((profiling && profiling_info) || calibrate_model)
			
 
				-		starpu_clock_gettime(&codelet_end);
			
 
				-
			
 
				-	STARPU_TRACE_END_CODELET_BODY(j);	
			
 
				-	args->status = STATUS_UNKNOWN;
			
 
				+	_starpu_driver_update_job_feedback(j, args, args->perf_arch, &codelet_start, &codelet_end);
			
 
				 
			
 
				 	_starpu_push_task_output(task, mask);
			
 
				 
			
 
				-	_starpu_driver_update_job_feedback(j, args, profiling_info, args->perf_arch,
			
 
				-			&codelet_start, &codelet_end);
			
 
				-
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
@@ -260,8 +244,11 @@ void *_starpu_cuda_worker(void *arg)
 
				 	struct cudaDeviceProp prop;
			
 
				 	cudaGetDeviceProperties(&prop, devid);
			
 
				 	strncpy(devname, prop.name, 128);
			
 
				-	snprintf(args->name, 32, "CUDA %d (%s)", args->devid, devname);
			
 
				-
			
 
				+#if CUDA_VERSION >= 3020
			
 
				+	snprintf(args->name, 48, "CUDA %d (%s %02x:%02x.0)", args->devid, devname, prop.pciBusID, prop.pciDeviceID);
			
 
				+#else
			
 
				+	snprintf(args->name, 48, "CUDA %d (%s)", args->devid, devname);
			
 
				+#endif
			
 
				 	_STARPU_DEBUG("cuda (%s) dev id %d thread is ready to run on CPU %d !\n", devname, devid, args->bindid);
			
 
				 
			
 
				 	STARPU_TRACE_WORKER_INIT_END
			
@@ -277,7 +264,7 @@ void *_starpu_cuda_worker(void *arg)
 
				 	int res;
			
 
				 
			
 
				 	pthread_cond_t *sched_cond = args->sched_cond;
			
 
				-        pthread_mutex_t *sched_mutex = args->sched_mutex;
			
 
				+    pthread_mutex_t *sched_mutex = args->sched_mutex;
			
 
				 
			
 
				 	while (_starpu_machine_is_running())
			
 
				 	{
			
@@ -287,7 +274,7 @@ void *_starpu_cuda_worker(void *arg)
 
				 
			
 
				 		task = _starpu_pop_task(args);
			
 
				 
			
 
				-                if (!task) 
			
 
				+        if (!task) 
			
 
				 		{
			
 
				 			PTHREAD_MUTEX_LOCK(sched_mutex);
			
 
				 			if (_starpu_worker_can_block(memnode))
			
@@ -369,3 +356,43 @@ void *_starpu_cuda_worker(void *arg)
 
				 	return NULL;
			
 
				 
			
 
				 }
			
 
				+
			
 
				+void starpu_cublas_report_error(const char *func, cublasStatus status)
			
 
				+{	/* Report a failed cuBLAS call: print `func` and a textual name for `status` on stderr, then hit assert(0). */
			
 
				+	const char *errormsg;	/* only ever points at string literals, hence const */
			
 
				+	switch (status) {
			
 
				+		case CUBLAS_STATUS_SUCCESS:
			
 
				+			errormsg = "success";
			
 
				+			break;
			
 
				+		case CUBLAS_STATUS_NOT_INITIALIZED:
			
 
				+			errormsg = "not initialized";
			
 
				+			break;
			
 
				+		case CUBLAS_STATUS_ALLOC_FAILED:
			
 
				+			errormsg = "alloc failed";
			
 
				+			break;
			
 
				+		case CUBLAS_STATUS_INVALID_VALUE:
			
 
				+			errormsg = "invalid value";
			
 
				+			break;
			
 
				+		case CUBLAS_STATUS_ARCH_MISMATCH:
			
 
				+			errormsg = "arch mismatch";
			
 
				+			break;
			
 
				+		case CUBLAS_STATUS_EXECUTION_FAILED:
			
 
				+			errormsg = "execution failed";
			
 
				+			break;
			
 
				+		case CUBLAS_STATUS_INTERNAL_ERROR:
			
 
				+			errormsg = "internal error";
			
 
				+			break;
			
 
				+		default:	/* any status value not listed above */
			
 
				+			errormsg = "unknown error";
			
 
				+			break;
			
 
				+	}
			
 
				+	fprintf(stderr, "oops in %s ... %s \n", func, errormsg);	/* stderr: a buffered stdout message may be lost when assert aborts */
			
 
				+	assert(0);	/* NOTE(review): expands to nothing under NDEBUG, in which case this function returns - confirm build flags */
			
 
				+}
			
 
				+
			
 
				+void starpu_cuda_report_error(const char *func, cudaError_t status)
			
 
				+{	/* Report a failed CUDA runtime call: print `func` and the cudaGetErrorString() text for `status` on stderr, then hit assert(0). */
			
 
				+	const char *errormsg = cudaGetErrorString(status);
			
 
				+	fprintf(stderr, "oops in %s ... %s \n", func, errormsg);	/* stderr: a buffered stdout message may be lost when assert aborts */
			
 
				+	assert(0);	/* NOTE(review): expands to nothing under NDEBUG, in which case this function returns - confirm build flags */
			
 
				+}
			
--- a/src/drivers/driver_common/driver_common.c
+++ b/src/drivers/driver_common/driver_common.c
@@ -22,12 +22,73 @@
 
				 #include <common/utils.h>
			
 
				 #include <core/debug.h>
			
 
				 #include <drivers/driver_common/driver_common.h>
			
 
				+#include <starpu_top.h>
			
 
				 
			
 
				+void _starpu_driver_start_job(struct starpu_worker_s *args, starpu_job_t j, struct timespec *codelet_start, int rank)
			
 
				+{	/* Bookkeeping done before a codelet body runs: flags worker `args` and the task of job `j` as running; for rank 0 also bumps the codelet's per-worker counter and may timestamp *codelet_start. */
			
 
				+	struct starpu_task *task = j->task;
			
 
				+	struct starpu_codelet_t *cl = task->cl;
			
 
				+	struct starpu_task_profiling_info *profiling_info;	/* assigned and read only inside the rank == 0 branch */
			
 
				+	int profiling = starpu_profiling_status_get();
			
 
				+	int starpu_top=starpu_top_status_get();
			
 
				+	int workerid = args->workerid;
			
 
				+	unsigned calibrate_model = 0;	/* becomes 1 when the codelet has a model with its benchmarking field set */
			
 
				+
			
 
				+	if (cl->model && cl->model->benchmarking)	/* NOTE(review): _starpu_driver_update_job_feedback tests _starpu_get_calibrate_flag() instead - confirm the two are meant to differ */
			
 
				+		calibrate_model = 1;
			
 
				+
			
 
				+	args->status = STATUS_EXECUTING;
			
 
				+	task->status = STARPU_TASK_RUNNING;	
			
 
				+
			
 
				+	if (rank == 0) {	/* presumably rank 0 is the lead worker of a parallel task - confirm against the CPU driver */
			
 
				+		cl->per_worker_stats[workerid]++;
			
 
				+
			
 
				+		profiling_info = task->profiling_info;
			
 
				+	
			
 
				+		if ((profiling && profiling_info) || calibrate_model || starpu_top)	/* read the clock only when something will consume the start time */
			
 
				+		{
			
 
				+			starpu_clock_gettime(codelet_start);
			
 
				+			_starpu_worker_register_executing_start_date(workerid, codelet_start);
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	if (starpu_top)	/* NOTE(review): runs for every rank, but *codelet_start is only written above when rank == 0 - confirm callers with rank != 0 */
			
 
				+		starputop_task_started(task,workerid,codelet_start);
			
 
				+
			
 
				+	STARPU_TRACE_START_CODELET_BODY(j);
			
 
				+}
			
 
				+
			
 
				+void _starpu_driver_end_job(struct starpu_worker_s *args, starpu_job_t j, struct timespec *codelet_end, int rank)
			
 
				+{	/* Bookkeeping done after a codelet body returns: emits the end-of-body trace, may timestamp *codelet_end for rank 0, notifies starpu_top if enabled, and resets the worker status. */
			
 
				+	struct starpu_task *task = j->task;
			
 
				+	struct starpu_codelet_t *cl = task->cl;
			
 
				+	struct starpu_task_profiling_info *profiling_info = task->profiling_info;
			
 
				+	int profiling = starpu_profiling_status_get();
			
 
				+	int starpu_top=starpu_top_status_get();
			
 
				+	int workerid = args->workerid;
			
 
				+	unsigned calibrate_model = 0;	/* becomes 1 when the codelet has a model with its benchmarking field set */
			
 
				+	enum starpu_perf_archtype archtype STARPU_ATTRIBUTE_UNUSED = args->perf_arch;	/* only consumed by the trace macro below; presumably flagged unused for builds where that macro expands to nothing - confirm */
			
 
				+
			
 
				+	STARPU_TRACE_END_CODELET_BODY(j, archtype);
			
 
				+
			
 
				+	if (cl->model && cl->model->benchmarking)
			
 
				+		calibrate_model = 1;
			
 
				+
			
 
				+	if (rank == 0) {	/* same condition as in _starpu_driver_start_job, so the end time is taken whenever the start time was */
			
 
				+		if ((profiling && profiling_info) || calibrate_model || starpu_top)
			
 
				+			starpu_clock_gettime(codelet_end);
			
 
				+	}
			
 
				+
			
 
				+	if (starpu_top)	/* NOTE(review): runs for every rank, but *codelet_end is only written above when rank == 0 - confirm callers with rank != 0 */
			
 
				+	  starputop_task_ended(task,workerid,codelet_end);
			
 
				+
			
 
				+	args->status = STATUS_UNKNOWN;
			
 
				+}
			
 
				 void _starpu_driver_update_job_feedback(starpu_job_t j, struct starpu_worker_s *worker_args,
			
 
				-					struct starpu_task_profiling_info *profiling_info,
			
 
				 					enum starpu_perf_archtype perf_arch,
			
 
				 					struct timespec *codelet_start, struct timespec *codelet_end)
			
 
				 {
			
 
				+	struct starpu_task_profiling_info *profiling_info = j->task->profiling_info;
			
 
				 	struct timespec measured_ts;
			
 
				 	double measured;
			
 
				 	int workerid = worker_args->workerid;
			
@@ -36,7 +97,7 @@ void _starpu_driver_update_job_feedback(starpu_job_t j, struct starpu_worker_s *
 
				 	int profiling = starpu_profiling_status_get();
			
 
				 	int updated = 0;
			
 
				 
			
 
				-	if (cl->model && cl->model->benchmarking)
			
 
				+	if (cl->model && _starpu_get_calibrate_flag())
			
 
				 		calibrate_model = 1;
			
 
				 
			
 
				 	if (profiling_info || calibrate_model)
			
@@ -61,8 +122,9 @@ void _starpu_driver_update_job_feedback(starpu_job_t j, struct starpu_worker_s *
 
				 		if (calibrate_model)
			
 
				 			_starpu_update_perfmodel_history(j, j->task->cl->model,  perf_arch, worker_args->devid, measured);
			
 
				 	}
			
 
				+
			
 
				 	if (!updated)
			
 
				-		_starpu_worker_update_profiling_info_executing(workerid, 0, 1, 0, 0, 0);
			
 
				+		_starpu_worker_update_profiling_info_executing(workerid, NULL, 1, 0, 0, 0);
			
 
				 
			
 
				 	if (profiling_info && profiling_info->power_consumed && cl->power_model && cl->power_model->benchmarking) {
			
 
				 		_starpu_update_perfmodel_history(j, j->task->cl->power_model,  perf_arch, worker_args->devid, profiling_info->power_consumed);
			
--- a/src/drivers/driver_common/driver_common.h
+++ b/src/drivers/driver_common/driver_common.h
@@ -20,13 +20,14 @@
 
				 
			
 
				 #include <sys/time.h>
			
 
				 #include <starpu.h>
			
 
				-#include <starpu_profiling.h>
			
 
				 #include <core/jobs.h>
			
 
				-#include <profiling/profiling.h>
			
 
				 #include <common/utils.h>
			
 
				 
			
 
				+void _starpu_driver_start_job(struct starpu_worker_s *args, starpu_job_t j,
			
 
				+		struct timespec *codelet_start, int rank);
			
 
				+void _starpu_driver_end_job(struct starpu_worker_s *args, starpu_job_t j,
			
 
				+		struct timespec *codelet_end, int rank);
			
 
				 void _starpu_driver_update_job_feedback(starpu_job_t j, struct starpu_worker_s *worker_args,
			
 
				-		struct starpu_task_profiling_info *profiling_info,
			
 
				 		enum starpu_perf_archtype perf_arch,
			
 
				 		struct timespec *codelet_start, struct timespec *codelet_end);
			
 
				 
			
--- a/src/drivers/gordon/driver_gordon.c
+++ b/src/drivers/gordon/driver_gordon.c
@@ -211,7 +211,7 @@ static void gordon_callback_list_func(void *arg)
 
				 		}
			
 
				 
			
 
				 		_starpu_push_task_output(j->task, 0);
			
 
				-		_starpu_handle_job_termination(j, 0, worker->sched_ctx);
			
 
				+		_starpu_handle_job_termination(j, 0);
			
 
				 		//starpu_wake_all_blocked_workers();
			
 
				 
			
 
				 		task_cnt++;
			
@@ -337,7 +337,9 @@ void *gordon_worker_inject(struct starpu_worker_set_s *arg)
 
				 		else {
			
 
				 #ifndef NOCHAIN
			
 
				 			int ret = 0;
			
 
				+#ifdef STARPU_DEVEL
			
 
				 #warning we should look into the local job list here !
			
 
				+#endif
			
 
				 
			
 
				 			struct starpu_job_list_s *list = _starpu_pop_every_task();
			
 
				 			/* XXX 0 is hardcoded */
			
@@ -390,7 +392,7 @@ void *gordon_worker_inject(struct starpu_worker_set_s *arg)
 
				 #else
			
 
				 			/* gordon should accept a little more work */
			
 
				 			starpu_job_t j;
			
 
				-			j =  _starpu_pop_task(arg->current_sched_ctx);
			
 
				+			j =  _starpu_pop_task(arg);
			
 
				 	//		_STARPU_DEBUG("pop task %p\n", j);
			
 
				 			if (j) {
			
 
				 				if (STARPU_GORDON_MAY_PERFORM(j)) {
			
@@ -399,7 +401,7 @@ void *gordon_worker_inject(struct starpu_worker_set_s *arg)
 
				 					inject_task(j, &arg->workers[0]);
			
 
				 				}
			
 
				 				else {
			
 
				-				  _starpu_push_task(j, 0, arg->current_sched_ctx);
			
 
				+					_starpu_push_task(j, 0);
			
 
				 				}
			
 
				 			}
			
 
				 #endif
			
--- a/src/drivers/opencl/driver_opencl.c
+++ b/src/drivers/opencl/driver_opencl.c
@@ -2,7 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -18,7 +18,6 @@
 
				 
			
 
				 #include <math.h>
			
 
				 #include <starpu.h>
			
 
				-#include <starpu_profiling.h>
			
 
				 #include <common/config.h>
			
 
				 #include <common/utils.h>
			
 
				 #include <core/debug.h>
			
@@ -27,13 +26,13 @@
 
				 #include "driver_opencl.h"
			
 
				 #include "driver_opencl_utils.h"
			
 
				 #include <common/utils.h>
			
 
				-#include <profiling/profiling.h>
			
 
				 
			
 
				 static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;
			
 
				 
			
 
				 static cl_context contexts[STARPU_MAXOPENCLDEVS];
			
 
				 static cl_device_id devices[STARPU_MAXOPENCLDEVS];
			
 
				 static cl_command_queue queues[STARPU_MAXOPENCLDEVS];
			
 
				+static cl_command_queue transfer_queues[STARPU_MAXOPENCLDEVS];
			
 
				 static cl_uint nb_devices = -1;
			
 
				 static int init_done = 0;
			
 
				 extern char *_starpu_opencl_program_dir;
			
@@ -122,9 +121,17 @@ cl_int _starpu_opencl_init_context(int devid)
 
				         contexts[devid] = clCreateContext(NULL, 1, &devices[devid], NULL, NULL, &err);
			
 
				         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
 
				-        // Create queue for the given device
			
 
				+        // Create execution queue for the given device
			
 
				         queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], 0, &err);
			
 
				         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+        // Create transfer queue for the given device
			
 
				+        cl_command_queue_properties props;
			
 
				+        clGetDeviceInfo(devices[devid], CL_DEVICE_QUEUE_PROPERTIES, sizeof(props), &props, NULL);
			
 
				+        props &= CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
			
 
				+        transfer_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], props, &err);
			
 
				+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				 	PTHREAD_MUTEX_UNLOCK(&big_lock);
			
 
				 
			
 
				 	limit_gpu_mem_if_needed(devid);
			
@@ -148,6 +155,9 @@ cl_int _starpu_opencl_deinit_context(int devid)
 
				         err = clReleaseCommandQueue(queues[devid]);
			
 
				         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
 
				+        err = clReleaseCommandQueue(transfer_queues[devid]);
			
 
				+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				         contexts[devid] = NULL;
			
 
				 
			
 
				 	PTHREAD_MUTEX_UNLOCK(&big_lock);
			
@@ -176,7 +186,7 @@ cl_int _starpu_opencl_copy_ram_to_opencl_async_sync(void *ptr, cl_mem buffer, si
 
				         cl_bool blocking;
			
 
				 
			
 
				         blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
			
 
				-        err = clEnqueueWriteBuffer(queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
			
 
				+        err = clEnqueueWriteBuffer(transfer_queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
			
 
				         if (STARPU_LIKELY(err == CL_SUCCESS)) {
			
 
				                 *ret = (event == NULL) ? 0 : -EAGAIN;
			
 
				                 return CL_SUCCESS;
			
@@ -184,7 +194,7 @@ cl_int _starpu_opencl_copy_ram_to_opencl_async_sync(void *ptr, cl_mem buffer, si
 
				         else {
			
 
				                 if (event != NULL) {
			
 
				                         /* The asynchronous copy has failed, try to copy synchronously */
			
 
				-                        err = clEnqueueWriteBuffer(queues[worker->devid], buffer, CL_TRUE, offset, size, ptr, 0, NULL, NULL);
			
 
				+                        err = clEnqueueWriteBuffer(transfer_queues[worker->devid], buffer, CL_TRUE, offset, size, ptr, 0, NULL, NULL);
			
 
				                 }
			
 
				                 if (STARPU_LIKELY(err == CL_SUCCESS)) {
			
 
				                         *ret = 0;
			
@@ -204,7 +214,7 @@ cl_int _starpu_opencl_copy_ram_to_opencl(void *ptr, cl_mem buffer, size_t size,
 
				         cl_bool blocking;
			
 
				 
			
 
				         blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
			
 
				-        err = clEnqueueWriteBuffer(queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
			
 
				+        err = clEnqueueWriteBuffer(transfer_queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
			
 
				         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
 
				         return CL_SUCCESS;
			
@@ -217,7 +227,7 @@ cl_int _starpu_opencl_copy_opencl_to_ram_async_sync(cl_mem buffer, void *ptr, si
 
				         cl_bool blocking;
			
 
				 
			
 
				         blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
			
 
				-        err = clEnqueueReadBuffer(queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
			
 
				+        err = clEnqueueReadBuffer(transfer_queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
			
 
				         if (STARPU_LIKELY(err == CL_SUCCESS)) {
			
 
				                 *ret = (event == NULL) ? 0 : -EAGAIN;
			
 
				                 return CL_SUCCESS;
			
@@ -225,7 +235,7 @@ cl_int _starpu_opencl_copy_opencl_to_ram_async_sync(cl_mem buffer, void *ptr, si
 
				         else {
			
 
				                 if (event != NULL)
			
 
				                         /* The asynchronous copy has failed, try to copy synchronously */
			
 
				-                        err = clEnqueueReadBuffer(queues[worker->devid], buffer, CL_TRUE, offset, size, ptr, 0, NULL, NULL);
			
 
				+                        err = clEnqueueReadBuffer(transfer_queues[worker->devid], buffer, CL_TRUE, offset, size, ptr, 0, NULL, NULL);
			
 
				                 if (STARPU_LIKELY(err == CL_SUCCESS)) {
			
 
				                         *ret = 0;
			
 
				                         return CL_SUCCESS;
			
@@ -246,7 +256,7 @@ cl_int _starpu_opencl_copy_opencl_to_ram(cl_mem buffer, void *ptr, size_t size,
 
				         cl_bool blocking;
			
 
				 
			
 
				         blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
			
 
				-        err = clEnqueueReadBuffer(queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
			
 
				+        err = clEnqueueReadBuffer(transfer_queues[worker->devid], buffer, blocking, offset, size, ptr, 0, NULL, event);
			
 
				         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
 
				         return CL_SUCCESS;
			
@@ -262,7 +272,7 @@ cl_int _starpu_opencl_copy_rect_opencl_to_ram(cl_mem buffer, void *ptr, const si
 
				         cl_bool blocking;
			
 
				 
			
 
				         blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
			
 
				-        err = clEnqueueReadBufferRect(queues[worker->devid], buffer, blocking, buffer_origin, host_origin, region, buffer_row_pitch,
			
 
				+        err = clEnqueueReadBufferRect(transfer_queues[worker->devid], buffer, blocking, buffer_origin, host_origin, region, buffer_row_pitch,
			
 
				                                       buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, 0, NULL, event);
			
 
				         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
@@ -278,7 +288,7 @@ cl_int _starpu_opencl_copy_rect_ram_to_opencl(void *ptr, cl_mem buffer, const si
 
				         cl_bool blocking;
			
 
				 
			
 
				         blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
			
 
				-        err = clEnqueueWriteBufferRect(queues[worker->devid], buffer, blocking, buffer_origin, host_origin, region, buffer_row_pitch,
			
 
				+        err = clEnqueueWriteBufferRect(transfer_queues[worker->devid], buffer, blocking, buffer_origin, host_origin, region, buffer_row_pitch,
			
 
				                                        buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, 0, NULL, event);
			
 
				         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
@@ -300,7 +310,7 @@ void _starpu_opencl_init(void)
 
				 
			
 
				                 // Get Platforms
			
 
				                 err = clGetPlatformIDs(STARPU_OPENCL_PLATFORM_MAX, platform_id, &nb_platforms);
			
 
				-                if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+                if (err != CL_SUCCESS) nb_platforms=0;
			
 
				                 _STARPU_DEBUG("Platforms detected: %d\n", nb_platforms);
			
 
				 
			
 
				                 // Get devices
			
@@ -308,28 +318,40 @@ void _starpu_opencl_init(void)
 
				                 {
			
 
				                         for (i=0; i<nb_platforms; i++) {
			
 
				                                 cl_uint num;
			
 
				-
			
 
				+				int platform_valid = 1;
			
 
				+				char name[1024], vendor[1024];
			
 
				+
			
 
				+				err = clGetPlatformInfo(platform_id[i], CL_PLATFORM_NAME, 1024, name, NULL);
			
 
				+				if (err != CL_SUCCESS) {
			
 
				+					STARPU_OPENCL_REPORT_ERROR_WITH_MSG("clGetPlatformInfo NAME", err);
			
 
				+					platform_valid = 0;
			
 
				+				}
			
 
				+				else {
			
 
				+					err = clGetPlatformInfo(platform_id[i], CL_PLATFORM_VENDOR, 1024, vendor, NULL);
			
 
				+					if (err != CL_SUCCESS) {
			
 
				+						STARPU_OPENCL_REPORT_ERROR_WITH_MSG("clGetPlatformInfo VENDOR", err);
			
 
				+						platform_valid = 0;
			
 
				+					}
			
 
				+				}
			
 
				 #ifdef STARPU_VERBOSE
			
 
				-                                {
			
 
				-                                        char name[1024], vendor[1024];
			
 
				-                                        err = clGetPlatformInfo(platform_id[i], CL_PLATFORM_NAME, 1024, name, NULL);
			
 
				-                                        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-                                        err = clGetPlatformInfo(platform_id[i], CL_PLATFORM_VENDOR, 1024, vendor, NULL);
			
 
				-                                        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-                                        _STARPU_DEBUG("Platform: %s - %s\n", name, vendor);
			
 
				-                                }
			
 
				+				if (platform_valid)
			
 
				+					_STARPU_DEBUG("Platform: %s - %s\n", name, vendor);
			
 
				+				else
			
 
				+					_STARPU_DEBUG("Platform invalid\n");
			
 
				 #endif
			
 
				-                                err = clGetDeviceIDs(platform_id[i], device_type, STARPU_MAXOPENCLDEVS-nb_devices, &devices[nb_devices], &num);
			
 
				-                                if (err == CL_DEVICE_NOT_FOUND) {
			
 
				-                                        _STARPU_DEBUG("  No devices detected on this platform\n");
			
 
				-                                }
			
 
				-                                else {
			
 
				-                                        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-                                        _STARPU_DEBUG("  %d devices detected\n", num);
			
 
				-                                        nb_devices += num;
			
 
				-                                }
			
 
				-                        }
			
 
				-                }
			
 
				+				if (platform_valid) {
			
 
				+					err = clGetDeviceIDs(platform_id[i], device_type, STARPU_MAXOPENCLDEVS-nb_devices, &devices[nb_devices], &num);
			
 
				+					if (err == CL_DEVICE_NOT_FOUND) {
			
 
				+						_STARPU_DEBUG("  No devices detected on this platform\n");
			
 
				+					}
			
 
				+					else {
			
 
				+						if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+						_STARPU_DEBUG("  %d devices detected\n", num);
			
 
				+						nb_devices += num;
			
 
				+					}
			
 
				+				}
			
 
				+			}
			
 
				+		}
			
 
				 
			
 
				                 // Get location of OpenCl kernel source files
			
 
				                 _starpu_opencl_program_dir = getenv("STARPU_OPENCL_PROGRAM_DIR");
			
@@ -338,6 +360,7 @@ void _starpu_opencl_init(void)
 
				                 for(i=0 ; i<nb_devices ; i++) {
			
 
				                         contexts[i] = NULL;
			
 
				                         queues[i] = NULL;
			
 
				+                        transfer_queues[i] = NULL;
			
 
				                 }
			
 
				 
			
 
				                 init_done=1;
			
@@ -404,7 +427,7 @@ void *_starpu_opencl_worker(void *arg)
 
				 
			
 
				 		task = _starpu_pop_task(args);
			
 
				 		
			
 
				-                if (task == NULL) 
			
 
				+        if (task == NULL) 
			
 
				 		{
			
 
				 			if (_starpu_worker_can_block(memnode))
			
 
				 				_starpu_block_worker(workerid, args->sched_cond, args->sched_mutex);
			
@@ -491,16 +514,11 @@ static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *ar
 
				 
			
 
				 	struct timespec codelet_start, codelet_end;
			
 
				 
			
 
				-	unsigned calibrate_model = 0;
			
 
				 	int workerid = args->workerid;
			
 
				-
			
 
				 	STARPU_ASSERT(task);
			
 
				 	struct starpu_codelet_t *cl = task->cl;
			
 
				 	STARPU_ASSERT(cl);
			
 
				 
			
 
				-	if (cl->model && cl->model->benchmarking)
			
 
				-		calibrate_model = 1;
			
 
				-
			
 
				 	ret = _starpu_fetch_task_input(task, mask);
			
 
				 	if (ret != 0) {
			
 
				 		/* there was not enough memory, so the input of
			
@@ -509,37 +527,18 @@ static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *ar
 
				 		return -EAGAIN;
			
 
				 	}
			
 
				 
			
 
				-	STARPU_TRACE_START_CODELET_BODY(j);
			
 
				-
			
 
				-	struct starpu_task_profiling_info *profiling_info;
			
 
				-	int profiling = starpu_profiling_status_get();
			
 
				-	profiling_info = task->profiling_info;
			
 
				-
			
 
				-	if ((profiling && profiling_info) || calibrate_model)
			
 
				-	{
			
 
				-		starpu_clock_gettime(&codelet_start);
			
 
				-		_starpu_worker_register_executing_start_date(workerid, &codelet_start);
			
 
				-	}
			
 
				-
			
 
				-	args->status = STATUS_EXECUTING;
			
 
				-	task->status = STARPU_TASK_RUNNING;	
			
 
				+	_starpu_driver_start_job(args, j, &codelet_start, 0);
			
 
				 
			
 
				 	cl_func func = cl->opencl_func;
			
 
				 	STARPU_ASSERT(func);
			
 
				-	func(task->interface, task->cl_arg);
			
 
				+	func(task->interfaces, task->cl_arg);
			
 
				 
			
 
				-	cl->per_worker_stats[workerid]++;
			
 
				+	_starpu_driver_end_job(args, j, &codelet_end, 0);
			
 
				 
			
 
				-	if ((profiling && profiling_info) || calibrate_model)
			
 
				-		starpu_clock_gettime(&codelet_end);
			
 
				-
			
 
				-	STARPU_TRACE_END_CODELET_BODY(j);
			
 
				-	args->status = STATUS_UNKNOWN;
			
 
				+	_starpu_driver_update_job_feedback(j, args, args->perf_arch,
			
 
				+							&codelet_start, &codelet_end);
			
 
				 
			
 
				 	_starpu_push_task_output(task, mask);
			
 
				 
			
 
				-	_starpu_driver_update_job_feedback(j, args, profiling_info, args->perf_arch,
			
 
				-							&codelet_start, &codelet_end);
			
 
				-
			
 
				 	return EXIT_SUCCESS;
			
 
				 }
			
--- a/src/drivers/opencl/driver_opencl_utils.c
+++ b/src/drivers/opencl/driver_opencl_utils.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -29,6 +29,10 @@
 
				 #include "driver_opencl_utils.h"
			
 
				 #include "driver_opencl.h"
			
 
				 
			
 
				+#ifdef HAVE_CL_CL_EXT_H
			
 
				+#include <CL/cl_ext.h>
			
 
				+#endif
			
 
				+
			
 
				 char *_starpu_opencl_program_dir;
			
 
				 
			
 
				 #define _STARPU_STRINGIFY_(x) #x
			
@@ -121,7 +125,8 @@ char *_starpu_opencl_load_program_source(const char *filename)
 
				         return source;
			
 
				 }
			
 
				 
			
 
				-int starpu_opencl_load_opencl_from_string(const char *opencl_program_source, struct starpu_opencl_program *opencl_programs)
			
 
				+int starpu_opencl_load_opencl_from_string(const char *opencl_program_source, struct starpu_opencl_program *opencl_programs,
			
 
				+					  const char* build_options)
			
 
				 {
			
 
				         unsigned int dev;
			
 
				         unsigned int nb_devices;
			
@@ -150,7 +155,7 @@ int starpu_opencl_load_opencl_from_string(const char *opencl_program_source, str
 
				                 if (!program || err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
 
				                 // Build the program executable
			
 
				-                err = clBuildProgram(program, 1, &device, "-Werror -cl-mad-enable", NULL, NULL);
			
 
				+                err = clBuildProgram(program, 1, &device, build_options, NULL, NULL);
			
 
				                 if (err != CL_SUCCESS) {
			
 
				                         size_t len;
			
 
				                         static char buffer[4096];
			
@@ -168,10 +173,16 @@ int starpu_opencl_load_opencl_from_string(const char *opencl_program_source, str
 
				         return EXIT_SUCCESS;
			
 
				 }
			
 
				 
			
 
				-int starpu_opencl_load_opencl_from_file(const char *source_file_name, struct starpu_opencl_program *opencl_programs)
			
 
				+int starpu_opencl_load_opencl_from_file(const char *source_file_name, struct starpu_opencl_program *opencl_programs,
			
 
				+					const char* build_options)
			
 
				 {
			
 
				+	int nb_devices;
			
 
				         char located_file_name[1024];
			
 
				 
			
 
				+	// Do not try to load and compile the file if there is no devices
			
 
				+	nb_devices = _starpu_opencl_get_device_count();
			
 
				+	if (nb_devices == 0) return EXIT_SUCCESS;
			
 
				+
			
 
				         // Locate source file
			
 
				         _starpu_opencl_locate_file(source_file_name, located_file_name);
			
 
				         _STARPU_DEBUG("Source file name : <%s>\n", located_file_name);
			
@@ -181,7 +192,7 @@ int starpu_opencl_load_opencl_from_file(const char *source_file_name, struct sta
 
				         if(!opencl_program_source)
			
 
				                 _STARPU_ERROR("Failed to load compute program from file <%s>!\n", located_file_name);
			
 
				 
			
 
				-        return starpu_opencl_load_opencl_from_string(opencl_program_source, opencl_programs);
			
 
				+        return starpu_opencl_load_opencl_from_string(opencl_program_source, opencl_programs, build_options);
			
 
				 }
			
 
				 
			
 
				 cl_int starpu_opencl_unload_opencl(struct starpu_opencl_program *opencl_programs)
			
@@ -198,7 +209,7 @@ cl_int starpu_opencl_unload_opencl(struct starpu_opencl_program *opencl_programs
 
				         return CL_SUCCESS;
			
 
				 }
			
 
				 
			
 
				-int starpu_opencl_collect_stats(cl_event event __attribute__((unused)))
			
 
				+int starpu_opencl_collect_stats(cl_event event STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 #if defined(CL_PROFILING_CLOCK_CYCLE_COUNT)||defined(CL_PROFILING_STALL_CYCLE_COUNT)||defined(CL_PROFILING_POWER_CONSUMED)
			
 
				 	struct starpu_task *task = starpu_get_current_task();
			
@@ -243,3 +254,161 @@ int starpu_opencl_collect_stats(cl_event event __attribute__((unused)))
 
				 
			
 
				 	return 0;
			
 
				 }
			
 
				+
			
 
				+void starpu_opencl_display_error(const char *func, const char* msg, cl_int status)
			
 
				+{
			
 
				+	const char *errormsg;
			
 
				+	switch (status) {
			
 
				+	case CL_SUCCESS:
			
 
				+		errormsg = "success";
			
 
				+		break;
			
 
				+	case CL_DEVICE_NOT_FOUND:
			
 
				+		errormsg = "Device not found";
			
 
				+		break;
			
 
				+	case CL_DEVICE_NOT_AVAILABLE:
			
 
				+		errormsg = "Device not available";
			
 
				+		break;
			
 
				+	case CL_COMPILER_NOT_AVAILABLE:
			
 
				+		errormsg = "Compiler not available";
			
 
				+		break;
			
 
				+	case CL_MEM_OBJECT_ALLOCATION_FAILURE:
			
 
				+		errormsg = "Memory object allocation failure";
			
 
				+		break;
			
 
				+	case CL_OUT_OF_RESOURCES:
			
 
				+		errormsg = "Out of resources";
			
 
				+		break;
			
 
				+	case CL_OUT_OF_HOST_MEMORY:
			
 
				+		errormsg = "Out of host memory";
			
 
				+		break;
			
 
				+	case CL_PROFILING_INFO_NOT_AVAILABLE:
			
 
				+		errormsg = "Profiling info not available";
			
 
				+		break;
			
 
				+	case CL_MEM_COPY_OVERLAP:
			
 
				+		errormsg = "Memory copy overlap";
			
 
				+		break;
			
 
				+	case CL_IMAGE_FORMAT_MISMATCH:
			
 
				+		errormsg = "Image format mismatch";
			
 
				+		break;
			
 
				+	case CL_IMAGE_FORMAT_NOT_SUPPORTED:
			
 
				+		errormsg = "Image format not supported";
			
 
				+		break;
			
 
				+	case CL_BUILD_PROGRAM_FAILURE:
			
 
				+		errormsg = "Build program failure";
			
 
				+		break;
			
 
				+	case CL_MAP_FAILURE:
			
 
				+		errormsg = "Map failure";
			
 
				+		break;
			
 
				+	case CL_INVALID_VALUE:
			
 
				+		errormsg = "Invalid value";
			
 
				+		break;
			
 
				+	case CL_INVALID_DEVICE_TYPE:
			
 
				+		errormsg = "Invalid device type";
			
 
				+		break;
			
 
				+	case CL_INVALID_PLATFORM:
			
 
				+		errormsg = "Invalid platform";
			
 
				+		break;
			
 
				+	case CL_INVALID_DEVICE:
			
 
				+		errormsg = "Invalid device";
			
 
				+		break;
			
 
				+	case CL_INVALID_CONTEXT:
			
 
				+		errormsg = "Invalid context";
			
 
				+		break;
			
 
				+	case CL_INVALID_QUEUE_PROPERTIES:
			
 
				+		errormsg = "Invalid queue properties";
			
 
				+		break;
			
 
				+	case CL_INVALID_COMMAND_QUEUE:
			
 
				+		errormsg = "Invalid command queue";
			
 
				+		break;
			
 
				+	case CL_INVALID_HOST_PTR:
			
 
				+		errormsg = "Invalid host pointer";
			
 
				+		break;
			
 
				+	case CL_INVALID_MEM_OBJECT:
			
 
				+		errormsg = "Invalid memory object";
			
 
				+		break;
			
 
				+	case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
			
 
				+		errormsg = "Invalid image format descriptor";
			
 
				+		break;
			
 
				+	case CL_INVALID_IMAGE_SIZE:
			
 
				+		errormsg = "Invalid image size";
			
 
				+		break;
			
 
				+	case CL_INVALID_SAMPLER:
			
 
				+		errormsg = "Invalid sampler";
			
 
				+		break;
			
 
				+	case CL_INVALID_BINARY:
			
 
				+		errormsg = "Invalid binary";
			
 
				+		break;
			
 
				+	case CL_INVALID_BUILD_OPTIONS:
			
 
				+		errormsg = "Invalid build options";
			
 
				+		break;
			
 
				+	case CL_INVALID_PROGRAM:
			
 
				+		errormsg = "Invalid program";
			
 
				+		break;
			
 
				+	case CL_INVALID_PROGRAM_EXECUTABLE:
			
 
				+		errormsg = "Invalid program executable";
			
 
				+		break;
			
 
				+	case CL_INVALID_KERNEL_NAME:
			
 
				+		errormsg = "Invalid kernel name";
			
 
				+		break;
			
 
				+	case CL_INVALID_KERNEL_DEFINITION:
			
 
				+		errormsg = "Invalid kernel definition";
			
 
				+		break;
			
 
				+	case CL_INVALID_KERNEL:
			
 
				+		errormsg = "Invalid kernel";
			
 
				+		break;
			
 
				+	case CL_INVALID_ARG_INDEX:
			
 
				+		errormsg = "Invalid argument index";
			
 
				+		break;
			
 
				+	case CL_INVALID_ARG_VALUE:
			
 
				+		errormsg = "Invalid argument value";
			
 
				+		break;
			
 
				+	case CL_INVALID_ARG_SIZE:
			
 
				+		errormsg = "Invalid argument size";
			
 
				+		break;
			
 
				+	case CL_INVALID_KERNEL_ARGS:
			
 
				+		errormsg = "Invalid kernel arguments";
			
 
				+		break;
			
 
				+	case CL_INVALID_WORK_DIMENSION:
			
 
				+		errormsg = "Invalid work dimension";
			
 
				+		break;
			
 
				+	case CL_INVALID_WORK_GROUP_SIZE:
			
 
				+		errormsg = "Invalid work group size";
			
 
				+		break;
			
 
				+	case CL_INVALID_WORK_ITEM_SIZE:
			
 
				+		errormsg = "Invalid work item size";
			
 
				+		break;
			
 
				+	case CL_INVALID_GLOBAL_OFFSET:
			
 
				+		errormsg = "Invalid global offset";
			
 
				+		break;
			
 
				+	case CL_INVALID_EVENT_WAIT_LIST:
			
 
				+		errormsg = "Invalid event wait list";
			
 
				+		break;
			
 
				+	case CL_INVALID_EVENT:
			
 
				+		errormsg = "Invalid event";
			
 
				+		break;
			
 
				+	case CL_INVALID_OPERATION:
			
 
				+		errormsg = "Invalid operation";
			
 
				+		break;
			
 
				+	case CL_INVALID_GL_OBJECT:
			
 
				+		errormsg = "Invalid GL object";
			
 
				+		break;
			
 
				+	case CL_INVALID_BUFFER_SIZE:
			
 
				+		errormsg = "Invalid buffer size";
			
 
				+		break;
			
 
				+	case CL_INVALID_MIP_LEVEL:
			
 
				+		errormsg = "Invalid MIP level";
			
 
				+		break;
			
 
				+#ifdef CL_PLATFORM_NOT_FOUND_KHR
			
 
				+	case CL_PLATFORM_NOT_FOUND_KHR:
			
 
				+		errormsg = "Platform not found";
			
 
				+		break;
			
 
				+#endif
			
 
				+	default:
			
 
				+		errormsg = "unknown error";
			
 
				+		break;
			
 
				+	}
			
 
				+	if (msg)
			
 
				+		printf("oops in %s (%s) ... <%s> (%d) \n", func, msg, errormsg, status);
			
 
				+	else
			
 
				+		printf("oops in %s ... <%s> (%d) \n", func, errormsg, status);
			
 
				+
			
 
				+}
			
--- a/src/profiling/bound.c
+++ b/src/profiling/bound.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -171,16 +171,13 @@ static void new_task(starpu_job_t j)
 
				 	if (j->bound_task)
			
 
				 		return;
			
 
				 
			
 
				-	if (STARPU_UNLIKELY(!j->footprint_is_computed))
			
 
				-		_starpu_compute_buffers_footprint(j);
			
 
				-
			
 
				 	t = malloc(sizeof(*t));
			
 
				 	memset(t, 0, sizeof(*t));
			
 
				 	t->id = j->job_id;
			
 
				 	t->tag_id = j->task->tag_id;
			
 
				 	t->use_tag = j->task->use_tag;
			
 
				 	t->cl = j->task->cl;
			
 
				-	t->footprint = j->footprint;
			
 
				+	t->footprint = _starpu_compute_buffers_footprint(j);
			
 
				 	t->priority = j->task->priority;
			
 
				 	t->deps = NULL;
			
 
				 	t->depsn = 0;
			
@@ -209,8 +206,7 @@ void _starpu_bound_record(starpu_job_t j)
 
				 	} else {
			
 
				 		struct bound_task_pool *tp;
			
 
				 
			
 
				-		if (STARPU_UNLIKELY(!j->footprint_is_computed))
			
 
				-			_starpu_compute_buffers_footprint(j);
			
 
				+		_starpu_compute_buffers_footprint(j);
			
 
				 
			
 
				 		if (last && last->cl == j->task->cl && last->footprint == j->footprint)
			
 
				 			tp = last;
			
@@ -756,7 +752,7 @@ static glp_prob *_starpu_bound_glp_resolve(int integer)
 
				 		for (w = 0; w < nw; w++)
			
 
				 			for (t = 0, tp = task_pools; tp; t++, tp = tp->next) {
			
 
				 				char name[32];
			
 
				-				snprintf(name, sizeof(name), "w%ut%un", w, t);
			
 
				+				snprintf(name, sizeof(name), "w%dt%dn", w, t);
			
 
				 				glp_set_col_name(lp, colnum(w, t), name);
			
 
				 				if (integer)
			
 
				 					glp_set_col_kind(lp, colnum(w, t), GLP_IV);
			
@@ -857,9 +853,9 @@ void starpu_bound_print(FILE *output, int integer __attribute__ ((unused))) {
 
				 			fprintf(output, "%s key %x\n", tp->cl->model->symbol, (unsigned) tp->footprint);
			
 
				 			for (w = 0; w < nw; w++)
			
 
				 				if (integer)
			
 
				-					fprintf(output, "\tw%ut%un %f", w, t, glp_mip_col_val(lp, colnum(w, t)));
			
 
				+					fprintf(output, "\tw%dt%dn %f", w, t, glp_mip_col_val(lp, colnum(w, t)));
			
 
				 				else
			
 
				-					fprintf(output, "\tw%ut%un %f", w, t, glp_get_col_prim(lp, colnum(w, t)));
			
 
				+					fprintf(output, "\tw%dt%dn %f", w, t, glp_get_col_prim(lp, colnum(w, t)));
			
 
				 			fprintf(output, "\n");
			
 
				 		}
			
 
				 
			
--- a/src/profiling/profiling.c
+++ b/src/profiling/profiling.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -105,7 +105,7 @@ void _starpu_profiling_init(void)
 
				 		profiling = 1;
			
 
				 }
			
 
				 
			
 
				-void starpu_profiling_terminate(void)
			
 
				+void _starpu_profiling_terminate(void)
			
 
				 {
			
 
				 
			
 
				 }
			
@@ -236,7 +236,8 @@ void _starpu_worker_update_profiling_info_executing(int workerid, struct timespe
 
				 	{
			
 
				 		PTHREAD_MUTEX_LOCK(&worker_info_mutex[workerid]);
			
 
				 
			
 
				-		starpu_timespec_accumulate(&worker_info[workerid].executing_time, executing_time);
			
 
				+		if (executing_time)
			
 
				+			starpu_timespec_accumulate(&worker_info[workerid].executing_time, executing_time);
			
 
				 
			
 
				 		worker_info[workerid].used_cycles += used_cycles;
			
 
				 		worker_info[workerid].stall_cycles += stall_cycles;
			
--- a/src/profiling/profiling.h
+++ b/src/profiling/profiling.h
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -68,4 +68,6 @@ void _starpu_profiling_set_task_push_end_time(struct starpu_task *task);
 
				 /* This function needs to be called before other starpu_profile_* functions */
			
 
				 void _starpu_profiling_init(void);
			
 
				 
			
 
				+void _starpu_profiling_terminate(void);
			
 
				+
			
 
				 #endif // __PROFILING_H__
			
--- a/src/profiling/profiling_helpers.c
+++ b/src/profiling/profiling_helpers.c
@@ -55,6 +55,7 @@ void starpu_worker_profiling_helper_display_summary(void)
 
				 	int profiling = starpu_profiling_status_get();
			
 
				 	fprintf(stderr, "\nWorker statistics:\n");
			
 
				 	fprintf(stderr,   "******************\n");
			
 
				+	double overall_time = 0;
			
 
				 
			
 
				 	int workerid;
			
 
				 	int worker_cnt = starpu_worker_get_count();
			
@@ -70,20 +71,32 @@ void starpu_worker_profiling_helper_display_summary(void)
 
				 			double total_time = starpu_timing_timespec_to_us(&info.total_time) / 1000.;
			
 
				 			double executing_time = starpu_timing_timespec_to_us(&info.executing_time) / 1000.;
			
 
				 			double sleeping_time = starpu_timing_timespec_to_us(&info.sleeping_time) / 1000.;
			
 
				+			if (total_time > overall_time)
			
 
				+				overall_time = total_time;
			
 
				 
			
 
				 			fprintf(stderr, "%-32s\n", name);
			
 
				 			fprintf(stderr, "\t%d task(s)\n\ttotal: %.2lf ms executing: %.2lf ms sleeping: %.2lf\n", info.executed_tasks, total_time, executing_time, sleeping_time);
			
 
				 			if (info.used_cycles || info.stall_cycles)
			
 
				 				fprintf(stderr, "\t%lu Mcy %lu Mcy stall\n", info.used_cycles/1000000, info.stall_cycles/1000000);
			
 
				 			if (info.power_consumed)
			
 
				-				fprintf(stderr, "\t%lf J consumed\n", info.power_consumed);
			
 
				+				fprintf(stderr, "\t%f J consumed\n", info.power_consumed);
			
 
				 		} else {
			
 
				-			fprintf(stderr, "\t%-32s\tapproximately %d task(s)\n", name, info.executed_tasks);
			
 
				+			fprintf(stderr, "\t%-32s\t%d task(s)\n", name, info.executed_tasks);
			
 
				 		}
			
 
				 
			
 
				 		sum_consumed += info.power_consumed;
			
 
				 	}
			
 
				 
			
 
				+	if (profiling) {
			
 
				+		const char *strval_idle_power = getenv("STARPU_IDLE_POWER");
			
 
				+		if (strval_idle_power) {
			
 
				+			double idle_power = atof(strval_idle_power); /* Watt */
			
 
				+			double idle_consumption = idle_power * overall_time / 1000.; /* J */
			
 
				+
			
 
				+			fprintf(stderr, "Idle consumption: %.2lf J\n", idle_consumption);
			
 
				+			sum_consumed += idle_consumption;
			
 
				+		}
			
 
				+	}
			
 
				 	if (profiling && sum_consumed)
			
 
				 		fprintf(stderr, "Total consumption: %.2lf J\n", sum_consumed);
			
 
				 }
			
--- a/src/sched_policies/deque_modeling_policy_data_aware.c
+++ b/src/sched_policies/deque_modeling_policy_data_aware.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -214,6 +214,7 @@ static struct starpu_task *dmda_pop_every_task(unsigned sched_ctx_id)
 
				 	return new_list;
			
 
				 }
			
 
				 
			
 
				+static
			
 
				 int _starpu_fifo_push_sorted_task(struct starpu_fifo_taskq_s *fifo_queue, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task)
			
 
				 {
			
 
				 	struct starpu_task_list *list = &fifo_queue->taskq;
			
@@ -301,17 +302,12 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
				 	if (starpu_get_prefetch_flag())
			
 
				 		starpu_prefetch_task_input_on_node(task, memory_node);
			
 
				 
			
 
				-	switch (prio) {
			
 
				-		case 1:
			
 
				-			return _starpu_fifo_push_prio_task(dt->queue_array[best_workerid_ctx],
			
 
				-				sched_ctx->sched_mutex[best_workerid_ctx], sched_ctx->sched_cond[best_workerid_ctx], task);
			
 
				-		case 2:
			
 
				-			return _starpu_fifo_push_sorted_task(dt->queue_array[best_workerid_ctx],
			
 
				-				sched_ctx->sched_mutex[best_workerid_ctx], sched_ctx->sched_cond[best_workerid_ctx], task);
			
 
				-		default:
			
 
				-			return _starpu_fifo_push_task(dt->queue_array[best_workerid_ctx],
			
 
				-				sched_ctx->sched_mutex[best_workerid_ctx], sched_ctx->sched_cond[best_workerid_ctx], task);
			
 
				-	}
			
 
				+	if (prio)
			
 
				+		return _starpu_fifo_push_sorted_task(dt->queue_array[best_workerid_ctx],
			
 
				+			sched_ctx->sched_mutex[best_workerid_ctx], sched_ctx->sched_cond[best_workerid_ctx], task);
			
 
				+	else
			
 
				+		return _starpu_fifo_push_task(dt->queue_array[best_workerid_ctx],
			
 
				+			sched_ctx->sched_mutex[best_workerid_ctx], sched_ctx->sched_cond[best_workerid_ctx], task);
			
 
				 }
			
 
				 
			
 
				 static int _dm_push_task(struct starpu_task *task, unsigned prio, struct starpu_sched_ctx *sched_ctx)
			
@@ -335,7 +331,7 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, struct starpu_
 
				 	unsigned nworkers = sched_ctx->nworkers_in_ctx;
			
 
				 	for (worker_in_ctx = 0; worker_in_ctx < nworkers; worker_in_ctx++)
			
 
				 	{
			
 
				-                worker = sched_ctx->workerid[worker_in_ctx];
			
 
				+        worker = sched_ctx->workerid[worker_in_ctx];
			
 
				 		double exp_end;
			
 
				 		
			
 
				 		fifo = dt->queue_array[worker_in_ctx];
			
@@ -421,7 +417,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, struct starp
 
				 
			
 
				 	double best_exp_end = 10e240;
			
 
				 	double model_best = 0.0;
			
 
				-	double penality_best = 0.0;
			
 
				+	//double penality_best = 0.0;
			
 
				 
			
 
				 	int ntasks_best = -1;
			
 
				 	double ntasks_best_end = 0.0;
			
@@ -432,7 +428,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, struct starp
 
				 
			
 
				 	for (worker_in_ctx = 0; worker_in_ctx < nworkers_in_ctx; worker_in_ctx++)
			
 
				 	{
			
 
				-                worker = sched_ctx->workerid[worker_in_ctx];
			
 
				+        worker = sched_ctx->workerid[worker_in_ctx];
			
 
				 
			
 
				 		fifo = dt->queue_array[worker_in_ctx];
			
 
				 
			
@@ -528,7 +524,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, struct starp
 
				 				best = worker;
			
 
				 				best_in_ctx = worker_in_ctx;
			
 
				 
			
 
				-	//			_STARPU_DEBUG("best fitness (worker %d) %le = alpha*(%le) + beta(%le) +gamma(%le)\n", worker, best_fitness, exp_end[worker] - best_exp_end, local_data_penalty[worker], local_power[worker]);
			
 
				+	//			_STARPU_DEBUG("best fitness (worker %d) %e = alpha*(%e) + beta(%e) +gamma(%e)\n", worker, best_fitness, exp_end[worker] - best_exp_end, local_data_penalty[worker], local_power[worker]);
			
 
				 			}
			
 
				 		}
			
 
				 	}
			
@@ -542,12 +538,12 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, struct starp
 
				 		 * so we force this measurement */
			
 
				 		best = forced_best;
			
 
				 		model_best = 0.0;
			
 
				-		penality_best = 0.0;
			
 
				+		//penality_best = 0.0;
			
 
				 	}
			
 
				 	else 
			
 
				 	{
			
 
				-		model_best = local_task_length[best_in_ctx];
			
 
				-		penality_best = local_data_penalty[best_in_ctx];
			
 
				+		model_best = local_task_length[best];
			
 
				+		//penality_best = local_data_penalty[best];
			
 
				 	}
			
 
				 
			
 
				 	/* we should now have the best worker in variable "best" */
			
@@ -560,12 +556,6 @@ static int dmda_push_sorted_task(struct starpu_task *task, unsigned sched_ctx_id
 
				 	return _dmda_push_task(task, 2, sched_ctx);
			
 
				 }
			
 
				 
			
 
				-static int dm_push_prio_task(struct starpu_task *task, unsigned sched_ctx_id)
			
 
				-{
			
 
				-	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx(sched_ctx_id);
			
 
				-	return _dm_push_task(task, 1, sched_ctx);
			
 
				-}
			
 
				-
			
 
				 static int dm_push_task(struct starpu_task *task, unsigned sched_ctx_id)
			
 
				 {
			
 
				 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx(sched_ctx_id);
			
@@ -575,12 +565,6 @@ static int dm_push_task(struct starpu_task *task, unsigned sched_ctx_id)
 
				 	return _dm_push_task(task, 0, sched_ctx);
			
 
				 }
			
 
				 
			
 
				-static int dmda_push_prio_task(struct starpu_task *task, unsigned sched_ctx_id)
			
 
				-{
			
 
				-	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx(sched_ctx_id);
			
 
				-	return _dmda_push_task(task, 1, sched_ctx);
			
 
				-}
			
 
				-
			
 
				 static int dmda_push_task(struct starpu_task *task, unsigned sched_ctx_id)
			
 
				 {
			
 
				 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx(sched_ctx_id);
			
@@ -690,7 +674,6 @@ struct starpu_sched_policy_s _starpu_sched_dm_policy = {
 
				 	.init_sched = initialize_dmda_policy,
			
 
				 	.deinit_sched = deinitialize_dmda_policy,
			
 
				 	.push_task = dm_push_task, 
			
 
				-	.push_prio_task = dm_push_prio_task,
			
 
				 	.pop_task = dmda_pop_task,
			
 
				 	.post_exec_hook = NULL,
			
 
				 	.pop_every_task = dmda_pop_every_task,
			
@@ -703,7 +686,6 @@ struct starpu_sched_policy_s _starpu_sched_dmda_policy = {
 
				 	.init_sched = initialize_dmda_policy,
			
 
				 	.deinit_sched = deinitialize_dmda_policy,
			
 
				 	.push_task = dmda_push_task, 
			
 
				-	.push_prio_task = dmda_push_prio_task, 
			
 
				 	.pop_task = dmda_pop_task,
			
 
				 	.post_exec_hook = NULL,
			
 
				 	.pop_every_task = dmda_pop_every_task,
			
@@ -716,7 +698,6 @@ struct starpu_sched_policy_s _starpu_sched_dmda_sorted_policy = {
 
				 	.init_sched = initialize_dmda_sorted_policy,
			
 
				 	.deinit_sched = deinitialize_dmda_policy,
			
 
				 	.push_task = dmda_push_sorted_task, 
			
 
				-	.push_prio_task = dmda_push_sorted_task, 
			
 
				 	.pop_task = dmda_pop_ready_task,
			
 
				 	.post_exec_hook = NULL,
			
 
				 	.pop_every_task = dmda_pop_every_task,
			
@@ -729,7 +710,6 @@ struct starpu_sched_policy_s _starpu_sched_dmda_ready_policy = {
 
				 	.init_sched = initialize_dmda_policy,
			
 
				 	.deinit_sched = deinitialize_dmda_policy,
			
 
				 	.push_task = dmda_push_task, 
			
 
				-	.push_prio_task = dmda_push_prio_task, 
			
 
				 	.pop_task = dmda_pop_ready_task,
			
 
				 	.post_exec_hook = NULL,
			
 
				 	.pop_every_task = dmda_pop_every_task,
			
--- a/src/sched_policies/detect_combined_workers.c
+++ b/src/sched_policies/detect_combined_workers.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -14,8 +14,8 @@
 
				  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				  */
			
 
				 
			
 
				-#include <starpu.h>
			
 
				 #include <common/config.h>
			
 
				+#include <starpu.h>
			
 
				 #include <common/utils.h>
			
 
				 #include <core/workers.h>
			
 
				 
			
@@ -83,7 +83,10 @@ static int find_combinations_with_hwloc_rec(hwloc_obj_t obj, int *worker_array,
 
				 	}
			
 
				 	
			
 
				 	/* If there is at least 2 children that are valid, we combined them. */
			
 
				-	if (cpu_children_cnt > 1 && worker_cnt_rec > 0)
			
 
				+	int maxsize = starpu_get_env_number("STARPU_MAX_WORKERSIZE");
			
 
				+	int minsize = starpu_get_env_number("STARPU_MIN_WORKERSIZE");
			
 
				+
			
 
				+	if (cpu_children_cnt > 1 && worker_cnt_rec > 0 && worker_cnt_rec <= maxsize && worker_cnt_rec >= minsize)
			
 
				 		starpu_combined_worker_assign_workerid(worker_cnt_rec, worker_array_rec);
			
 
				 
			
 
				 	return (cpu_children_cnt == obj->arity);
			
@@ -101,7 +104,9 @@ static void find_combinations_with_hwloc(struct starpu_machine_topology_s *topol
 
				 	root = hwloc_get_obj_by_depth(topology->hwtopology, HWLOC_OBJ_SYSTEM, 0); 
			
 
				 	find_combinations_with_hwloc_rec(root, worker_array, &worker_cnt);
			
 
				 }
			
 
				+
			
 
				 #else
			
 
				+
			
 
				 static void find_combinations_without_hwloc(struct starpu_machine_topology_s *topology)
			
 
				 {
			
 
				 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
			
@@ -140,11 +145,40 @@ static void find_combinations_without_hwloc(struct starpu_machine_topology_s *to
 
				 }
			
 
				 #endif
			
 
				 
			
 
				+static void combine_all_cpu_workers(struct starpu_machine_topology_s *topology)
			
 
				+{
			
 
				+	struct starpu_machine_config_s *config = _starpu_get_machine_config();
			
 
				+
			
 
				+	int cpu_workers[STARPU_NMAXWORKERS];
			
 
				+	unsigned ncpus = 0;
			
 
				+
			
 
				+	unsigned i;
			
 
				+	for (i = 0; i < topology->nworkers; i++)
			
 
				+	{
			
 
				+		if (config->workers[i].perf_arch == STARPU_CPU_DEFAULT)
			
 
				+			cpu_workers[ncpus++] = i;
			
 
				+	}
			
 
				+
			
 
				+	if (ncpus > 0)
			
 
				+	{
			
 
				+		int ret;
			
 
				+		ret = starpu_combined_worker_assign_workerid(ncpus, cpu_workers);
			
 
				+		STARPU_ASSERT(ret >= 0);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				 void _starpu_sched_find_worker_combinations(struct starpu_machine_topology_s *topology)
			
 
				 {
			
 
				+	struct starpu_machine_config_s *config = _starpu_get_machine_config();
			
 
				+
			
 
				+	if (config->user_conf && config->user_conf->single_combined_worker)
			
 
				+		combine_all_cpu_workers(topology);
			
 
				+	else {
			
 
				 #ifdef STARPU_HAVE_HWLOC
			
 
				-	find_combinations_with_hwloc(topology);
			
 
				+		find_combinations_with_hwloc(topology);
			
 
				+		//find_combinations_without_hwloc(topology);
			
 
				 #else
			
 
				-	find_combinations_without_hwloc(topology);
			
 
				+		find_combinations_without_hwloc(topology);
			
 
				 #endif
			
 
				+	}
			
 
				 }
			
--- a/src/sched_policies/eager_central_policy.c
+++ b/src/sched_policies/eager_central_policy.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -97,13 +97,6 @@ static int push_task_eager_policy(struct starpu_task *task, unsigned sched_ctx_i
 
				 	return _starpu_fifo_push_task(fifo, sched_ctx->sched_mutex[0], sched_ctx->sched_cond[0], task);
			
 
				 }
			
 
				 
			
 
				-static int push_prio_task_eager_policy(struct starpu_task *task, unsigned sched_ctx_id)
			
 
				-{
			
 
				-	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx(sched_ctx_id);
			
 
				-	struct starpu_fifo_taskq_s *fifo = (struct starpu_fifo_taskq_s*)sched_ctx->policy_data;
			
 
				-	return _starpu_fifo_push_prio_task(fifo, sched_ctx->sched_mutex[0], sched_ctx->sched_cond[0], task);
			
 
				-}
			
 
				-
			
 
				 static struct starpu_task *pop_every_task_eager_policy(unsigned sched_ctx_id)
			
 
				 {
			
 
				 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx(sched_ctx_id);
			
@@ -136,26 +129,9 @@ struct starpu_sched_policy_s _starpu_sched_eager_policy = {
 
				 	.init_sched_for_workers = initialize_eager_center_policy_for_workers,
			
 
				 	.deinit_sched = deinitialize_eager_center_policy,
			
 
				 	.push_task = push_task_eager_policy,
			
 
				-	.push_task_notify = NULL,
			
 
				-	.push_prio_task = push_prio_task_eager_policy,
			
 
				 	.pop_task = pop_task_eager_policy,
			
 
				 	.post_exec_hook = NULL,
			
 
				 	.pop_every_task = pop_every_task_eager_policy,
			
 
				 	.policy_name = "eager",
			
 
				 	.policy_description = "greedy policy"
			
 
				 };
			
 
				-
			
 
				-struct starpu_sched_policy_s _starpu_sched_no_prio_policy = {
			
 
				-	.init_sched = initialize_eager_center_policy,
			
 
				-	.init_sched_for_workers = initialize_eager_center_policy_for_workers,
			
 
				-	.deinit_sched = deinitialize_eager_center_policy,
			
 
				-	.push_task = push_task_eager_policy,
			
 
				-	.push_task_notify = NULL,
			
 
				-	/* we use the same method in spite of the priority */
			
 
				-	.push_prio_task = push_task_eager_policy,
			
 
				-	.pop_task = pop_task_eager_policy,
			
 
				-	.post_exec_hook = NULL,
			
 
				-	.pop_every_task = pop_every_task_eager_policy,
			
 
				-	.policy_name = "no-prio",
			
 
				-	.policy_description = "eager without priority"
			
 
				-};
			
--- a/src/sched_policies/eager_central_priority_policy.c
+++ b/src/sched_policies/eager_central_priority_policy.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -200,7 +200,6 @@ struct starpu_sched_policy_s _starpu_sched_prio_policy = {
 
				 	.deinit_sched = deinitialize_eager_center_priority_policy,
			
 
				 	/* we always use priorities in that policy */
			
 
				 	.push_task = _starpu_priority_push_task,
			
 
				-	.push_prio_task = _starpu_priority_push_task,
			
 
				 	.pop_task = _starpu_priority_pop_task,
			
 
				 	.post_exec_hook = NULL,
			
 
				 	.pop_every_task = NULL,
			
--- a/src/sched_policies/fifo_queues.c
+++ b/src/sched_policies/fifo_queues.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -46,26 +46,14 @@ void _starpu_destroy_fifo(struct starpu_fifo_taskq_s *fifo)
 
				 	free(fifo);
			
 
				 }
			
 
				 
			
 
				-int _starpu_fifo_push_prio_task(struct starpu_fifo_taskq_s *fifo_queue, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task)
			
 
				-{
			
 
				-	PTHREAD_MUTEX_LOCK(sched_mutex);
			
 
				-
			
 
				-	STARPU_TRACE_JOB_PUSH(task, 0);
			
 
				-	starpu_task_list_push_back(&fifo_queue->taskq, task);
			
 
				-	fifo_queue->ntasks++;
			
 
				-	fifo_queue->nprocessed++;
			
 
				-
			
 
				-	PTHREAD_COND_SIGNAL(sched_cond);
			
 
				-	PTHREAD_MUTEX_UNLOCK(sched_mutex);
			
 
				-
			
 
				-	return 0;
			
 
				-}
			
 
				+/* TODO: revert front/back? */
			
 
				 
			
 
				 int _starpu_fifo_push_task(struct starpu_fifo_taskq_s *fifo_queue, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task)
			
 
				 {
			
 
				 	PTHREAD_MUTEX_LOCK(sched_mutex);
			
 
				 
			
 
				 	STARPU_TRACE_JOB_PUSH(task, 0);
			
 
				+	/* TODO: if prio, put at back */
			
 
				 	starpu_task_list_push_front(&fifo_queue->taskq, task);
			
 
				 	fifo_queue->ntasks++;
			
 
				 	fifo_queue->nprocessed++;
			
@@ -94,6 +82,7 @@ struct starpu_task *_starpu_fifo_pop_task(struct starpu_fifo_taskq_s *fifo_queue
 
				 		
			
 
				 		STARPU_TRACE_JOB_POP(task, 0);
			
 
				 	}
			
 
				+	
			
 
				 	return task;
			
 
				 }
			
 
				 
			
--- a/src/sched_policies/fifo_queues.h
+++ b/src/sched_policies/fifo_queues.h
@@ -42,7 +42,6 @@ struct starpu_fifo_taskq_s*_starpu_create_fifo(void);
 
				 void _starpu_destroy_fifo(struct starpu_fifo_taskq_s *fifo);
			
 
				 
			
 
				 int _starpu_fifo_push_task(struct starpu_fifo_taskq_s *fifo, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task);
			
 
				-int _starpu_fifo_push_prio_task(struct starpu_fifo_taskq_s *fifo, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task);
			
 
				 
			
 
				 struct starpu_task *_starpu_fifo_pop_task(struct starpu_fifo_taskq_s *fifo, int workerid);
			
 
				 struct starpu_task *_starpu_fifo_pop_every_task(struct starpu_fifo_taskq_s *fifo, pthread_mutex_t *sched_mutex, int workerid);
			
--- a/src/sched_policies/heft.c
+++ b/src/sched_policies/heft.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -24,6 +24,7 @@
 
				 #include <core/perfmodel/perfmodel.h>
			
 
				 #include <starpu_parameters.h>
			
 
				 #include <starpu_task_bundle.h>
			
 
				+#include <starpu_top.h>
			
 
				 
			
 
				 typedef struct {
			
 
				 	double alpha;
			
@@ -37,6 +38,21 @@ double exp_end[STARPU_NMAXWORKERS];
 
				 double exp_len[STARPU_NMAXWORKERS];
			
 
				 double ntasks[STARPU_NMAXWORKERS];
			
 
				 
			
 
				+
			
 
				+const float alpha_minimum=0;
			
 
				+const float alpha_maximum=10.0;
			
 
				+const float beta_minimum=0;
			
 
				+const float beta_maximum=10.0;
			
 
				+const float gamma_minimum=0;
			
 
				+const float gamma_maximum=10000.0;
			
 
				+const float idle_power_minimum=0;
			
 
				+const float idle_power_maximum=10000.0;
			
 
				+
			
 
				+void param_modified(struct starputop_param_t* d){
			
 
				+	//just to show parameter modification
			
 
				+	fprintf(stderr,"%s has been modified : alpha=%f|beta=%f|gamma=%f|idle_power=%f !\n", 
			
 
				+		d->name, alpha,beta,_gamma,idle_power);
			
 
				+}
			
 
				 static void heft_init_for_workers(unsigned sched_ctx_id, unsigned nnew_workers)
			
 
				 {
			
 
				 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx(sched_ctx_id);
			
@@ -103,6 +119,11 @@ static void heft_init(unsigned sched_ctx_id)
 
				 	if (strval_idle_power)
			
 
				 		hd->idle_power = atof(strval_idle_power);
			
 
				 
			
 
				+	starputop_register_parameter_float("HEFT_ALPHA", &hd->alpha, alpha_minimum,alpha_maximum,param_modified);
			
 
				+	starputop_register_parameter_float("HEFT_BETA", &hd->beta, beta_minimum,beta_maximum,param_modified);
			
 
				+	starputop_register_parameter_float("HEFT_GAMMA", &hd->_gamma, gamma_minimum,gamma_maximum,param_modified);
			
 
				+	starputop_register_parameter_float("HEFT_IDLE_POWER", &hd->idle_power, idle_power_minimum,idle_power_maximum,param_modified);
			
 
				+
			
 
				 	unsigned workerid_ctx;
			
 
				 
			
 
				 	for (workerid_ctx = 0; workerid_ctx < nworkers; workerid_ctx++)
			
@@ -181,28 +202,32 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
				 	PTHREAD_MUTEX_LOCK(best_worker->sched_mutex);
			
 
				 	exp_end[best_workerid] += predicted;
			
 
				 	exp_len[best_workerid] += predicted;
			
 
				-
			
 
				 	ntasks[best_workerid]++;
			
 
				 	PTHREAD_MUTEX_UNLOCK(best_worker->sched_mutex);
			
 
				 
			
 
				 	task->predicted = predicted;
			
 
				 
			
 
				+	if (starpu_top_status_get())
			
 
				+		starputop_task_prevision(task, best_workerid, 
			
 
				+					(unsigned long long)(exp_end[best_workerid]-predicted)/1000,
			
 
				+					(unsigned long long)exp_end[best_workerid]/1000);
			
 
				+
			
 
				 	if (starpu_get_prefetch_flag())
			
 
				 	{
			
 
				 		unsigned memory_node = starpu_worker_get_memory_node(best_workerid);
			
 
				 		starpu_prefetch_task_input_on_node(task, memory_node);
			
 
				 	}
			
 
				-	
			
 
				+
			
 
				 	return starpu_push_local_task(best_workerid, task, prio);
			
 
				 }
			
 
				 
			
 
				 static void compute_all_performance_predictions(struct starpu_task *task,
			
 
				-						double *local_task_length, double *exp_end,
			
 
				-						double *max_exp_endp, double *best_exp_endp,
			
 
				-						double *local_data_penalty,
			
 
				-						double *local_power, int *forced_best,
			
 
				-						struct starpu_task_bundle *bundle,
			
 
				-						struct starpu_sched_ctx *sched_ctx )
			
 
				+					double *local_task_length, double *exp_end,
			
 
				+					double *max_exp_endp, double *best_exp_endp,
			
 
				+					double *local_data_penalty,
			
 
				+					double *local_power, int *forced_best,
			
 
				+					struct starpu_task_bundle *bundle,
			
 
				+					struct starpu_sched_ctx *sched_ctx )
			
 
				 {
			
 
				   int calibrating = 0;
			
 
				   double max_exp_end = DBL_MIN;
			
@@ -223,7 +248,7 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 
				       exp_start[worker] = STARPU_MAX(exp_start[worker], starpu_timing_now());
			
 
				       exp_end[worker_in_ctx] = exp_start[worker] + exp_len[worker];
			
 
				       if (exp_end[worker_in_ctx] > max_exp_end)
			
 
				- 	max_exp_end = exp_end[worker_in_ctx];
			
 
				+ 		max_exp_end = exp_end[worker_in_ctx];
			
 
				 
			
 
				       if (!starpu_worker_may_execute_task(worker, task))
			
 
				 	{
			
@@ -246,8 +271,6 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 
				 	local_power[worker_in_ctx] = starpu_task_expected_power(task, perf_arch);
			
 
				       }
			
 
				 
			
 
				-      //      printf("%d: local task len = %2.2f perf model %d\n", worker, local_task_length[worker_in_ctx], task->cl->model->type);
			
 
				-
			
 
				       double ntasks_end = ntasks[worker] / starpu_worker_get_relative_speedup(perf_arch);
			
 
				 
			
 
				       if (ntasks_best == -1
			
@@ -318,9 +341,9 @@ static int _heft_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 
				 	struct starpu_task_bundle *bundle = task->bundle;
			
 
				 
			
 
				 	compute_all_performance_predictions(task, local_task_length, exp_end,
			
 
				-					    &max_exp_end, &best_exp_end,
			
 
				-					    local_data_penalty,
			
 
				-					    local_power, &forced_best, bundle, sched_ctx);
			
 
				+					&max_exp_end, &best_exp_end,
			
 
				+					local_data_penalty,
			
 
				+					local_power, &forced_best, bundle, sched_ctx);
			
 
				 
			
 
				 	/* If there is no prediction available for that task with that arch we
			
 
				 	 * want to speed-up calibration time so we force this measurement */
			
@@ -400,11 +423,6 @@ static int _heft_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 
				 	return push_task_on_best_worker(task, best, model_best, prio);
			
 
				 }
			
 
				 
			
 
				-static int heft_push_prio_task(struct starpu_task *task, unsigned sched_ctx_id)
			
 
				-{
			
 
				-        return _heft_push_task(task, 1, sched_ctx_id);
			
 
				-}
			
 
				-
			
 
				 static int heft_push_task(struct starpu_task *task, unsigned sched_ctx_id)
			
 
				 {
			
 
				 	if (task->priority > 0)
			
@@ -424,13 +442,11 @@ struct starpu_sched_policy_s heft_policy = {
 
				 	.init_sched = heft_init,
			
 
				 	.deinit_sched = heft_deinit,
			
 
				 	.push_task = heft_push_task, 
			
 
				-	.push_prio_task = heft_push_prio_task, 
			
 
				 	.push_task_notify = heft_push_task_notify,
			
 
				 	.pop_task = NULL,
			
 
				 	.pop_every_task = NULL,
			
 
				 	.post_exec_hook = heft_post_exec_hook,
			
 
				 	.policy_name = "heft",
			
 
				 	.policy_description = "Heterogeneous Earliest Finish Task",
			
 
				-	.init_sched_for_workers = heft_init_for_workers
			
 
				-	
			
 
				+	.init_sched_for_workers = heft_init_for_workers	
			
 
				 };
			
--- a/src/sched_policies/parallel_greedy.c
+++ b/src/sched_policies/parallel_greedy.c
@@ -16,6 +16,7 @@
 
				 
			
 
				 #include <core/workers.h>
			
 
				 #include <sched_policies/fifo_queues.h>
			
 
				+#include <common/barrier.h>
			
 
				 
			
 
				 /* the former is the actual queue, the latter some container */
			
 
				 static struct starpu_fifo_taskq_s *fifo;
			
@@ -42,7 +43,7 @@ static void initialize_pgreedy_policy(unsigned sched_ctx_id)
 
				 	fifo = _starpu_create_fifo();
			
 
				 
			
 
				 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
			
 
				-        struct starpu_machine_topology_s *topology = &config->topology;
			
 
				+    struct starpu_machine_topology_s *topology = &config->topology;
			
 
				 
			
 
				 	_starpu_sched_find_worker_combinations(topology);
			
 
				 
			
@@ -244,7 +245,6 @@ struct starpu_sched_policy_s _starpu_sched_pgreedy_policy = {
 
				 	.init_sched = initialize_pgreedy_policy,
			
 
				 	.deinit_sched = deinitialize_pgreedy_policy,
			
 
				 	.push_task = push_task_pgreedy_policy,
			
 
				-	.push_prio_task = push_task_pgreedy_policy,
			
 
				 	.pop_task = pop_task_pgreedy_policy,
			
 
				 	.post_exec_hook = NULL,
			
 
				 	.pop_every_task = NULL,
			
--- a/src/sched_policies/parallel_heft.c
+++ b/src/sched_policies/parallel_heft.c
@@ -19,17 +19,15 @@
 
				 #include <float.h>
			
 
				 #include <limits.h>
			
 
				 #include <core/workers.h>
			
 
				-#include <sched_policies/fifo_queues.h>
			
 
				 #include <core/perfmodel/perfmodel.h>
			
 
				 #include <starpu_parameters.h>
			
 
				+#include <common/barrier.h>
			
 
				 
			
 
				 static pthread_mutex_t big_lock;
			
 
				 
			
 
				 static unsigned nworkers, ncombinedworkers;
			
 
				-static enum starpu_perf_archtype applicable_perf_archtypes[STARPU_NARCH_VARIATIONS];
			
 
				-static unsigned napplicable_perf_archtypes = 0;
			
 
				-
			
 
				-static struct starpu_fifo_taskq_s *queue_array[STARPU_NMAXWORKERS];
			
 
				+//static enum starpu_perf_archtype applicable_perf_archtypes[STARPU_NARCH_VARIATIONS];
			
 
				+//static unsigned napplicable_perf_archtypes = 0;
			
 
				 
			
 
				 static pthread_cond_t sched_cond[STARPU_NMAXWORKERS];
			
 
				 static pthread_mutex_t sched_mutex[STARPU_NMAXWORKERS];
			
@@ -39,25 +37,33 @@ static double beta = STARPU_DEFAULT_BETA;
 
				 static double _gamma = STARPU_DEFAULT_GAMMA;
			
 
				 static double idle_power = 0.0;
			
 
				 
			
 
				-static struct starpu_task *parallel_heft_pop_task(void)
			
 
				+static double worker_exp_start[STARPU_NMAXWORKERS];
			
 
				+static double worker_exp_end[STARPU_NMAXWORKERS];
			
 
				+static double worker_exp_len[STARPU_NMAXWORKERS];
			
 
				+static int ntasks[STARPU_NMAXWORKERS];
			
 
				+
			
 
				+static void parallel_heft_post_exec_hook(struct starpu_task *task)
			
 
				 {
			
 
				-	struct starpu_task *task;
			
 
				+	if (!task->cl || task->execute_on_a_specific_worker)
			
 
				+		return;
			
 
				 
			
 
				 	int workerid = starpu_worker_get_id();
			
 
				-	struct starpu_fifo_taskq_s *fifo = queue_array[workerid];
			
 
				-	task = _starpu_fifo_pop_task(fifo, -1);
			
 
				-	if (task) {
			
 
				-		double model = task->predicted;
			
 
				+	double model = task->predicted;
			
 
				 	
			
 
				-		fifo->exp_len -= model;
			
 
				-		fifo->exp_start = starpu_timing_now() + model;
			
 
				-		fifo->exp_end = fifo->exp_start + fifo->exp_len;
			
 
				-	}
			
 
				-
			
 
				-	return task;
			
 
				+	if (model < 0.0)
			
 
				+		model = 0.0;
			
 
				+	
			
 
				+	/* Once we have executed the task, we can update the predicted amount
			
 
				+	 * of work. */
			
 
				+	PTHREAD_MUTEX_LOCK(&sched_mutex[workerid]);
			
 
				+	worker_exp_len[workerid] -= model;
			
 
				+	worker_exp_start[workerid] = starpu_timing_now();
			
 
				+	worker_exp_end[workerid] = worker_exp_start[workerid] + worker_exp_len[workerid];
			
 
				+	ntasks[workerid]--;
			
 
				+	PTHREAD_MUTEX_UNLOCK(&sched_mutex[workerid]);
			
 
				 }
			
 
				 
			
 
				-static int push_task_on_best_worker(struct starpu_task *task, int best_workerid, double predicted, int prio)
			
 
				+static int push_task_on_best_worker(struct starpu_task *task, int best_workerid, double exp_end_predicted, int prio)
			
 
				 {
			
 
				 	/* make sure someone coule execute that task ! */
			
 
				 	STARPU_ASSERT(best_workerid != -1);
			
@@ -72,33 +78,20 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
				 	if (starpu_get_prefetch_flag())
			
 
				 		starpu_prefetch_task_input_on_node(task, memory_node);
			
 
				 
			
 
				+	int ret = 0;
			
 
				+
			
 
				+	PTHREAD_MUTEX_LOCK(&big_lock);
			
 
				+
			
 
				 	if (is_basic_worker)
			
 
				 	{
			
 
				-		PTHREAD_MUTEX_LOCK(&big_lock);
			
 
				-
			
 
				-		struct starpu_fifo_taskq_s *fifo;
			
 
				-		fifo = queue_array[best_workerid];
			
 
				-	
			
 
				-		fifo->exp_end += predicted;
			
 
				-		fifo->exp_len += predicted;
			
 
				-	
			
 
				-		task->predicted = predicted;
			
 
				+		task->predicted = exp_end_predicted - worker_exp_end[best_workerid];
			
 
				+		worker_exp_len[best_workerid] += exp_end_predicted - worker_exp_end[best_workerid];
			
 
				+		worker_exp_end[best_workerid] = exp_end_predicted;
			
 
				+		worker_exp_start[best_workerid] = exp_end_predicted - worker_exp_len[best_workerid];
			
 
				 	
			
 
				-		int ret;
			
 
				+		ntasks[best_workerid]++;
			
 
				 
			
 
				-		if (prio)
			
 
				-		{
			
 
				-			ret = _starpu_fifo_push_prio_task(queue_array[best_workerid],
			
 
				-				&sched_mutex[best_workerid], &sched_cond[best_workerid], task);
			
 
				-		}
			
 
				-		else {
			
 
				-			ret = _starpu_fifo_push_task(queue_array[best_workerid],
			
 
				-				&sched_mutex[best_workerid], &sched_cond[best_workerid], task);
			
 
				-		}
			
 
				-
			
 
				-		PTHREAD_MUTEX_UNLOCK(&big_lock);
			
 
				-
			
 
				-		return ret;
			
 
				+		ret = starpu_push_local_task(best_workerid, task, prio);
			
 
				 	}
			
 
				 	else {
			
 
				 		/* This is a combined worker so we create task aliases */
			
@@ -107,11 +100,6 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
				 		int worker_size = combined_worker->worker_size;
			
 
				 		int *combined_workerid = combined_worker->combined_workerid;
			
 
				 
			
 
				-		int ret = 0;
			
 
				-		int i;
			
 
				-		
			
 
				-		task->predicted = predicted;
			
 
				-
			
 
				 		starpu_job_t j = _starpu_get_job_associated_to_task(task);
			
 
				 		j->task_size = worker_size;
			
 
				 		j->combined_workerid = best_workerid;
			
@@ -120,36 +108,28 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
				 		PTHREAD_BARRIER_INIT(&j->before_work_barrier, NULL, worker_size);
			
 
				 		PTHREAD_BARRIER_INIT(&j->after_work_barrier, NULL, worker_size);
			
 
				 
			
 
				-		PTHREAD_MUTEX_LOCK(&big_lock);
			
 
				-
			
 
				+		int i;
			
 
				 		for (i = 0; i < worker_size; i++)
			
 
				 		{
			
 
				 			struct starpu_task *alias = _starpu_create_task_alias(task);
			
 
				 			int local_worker = combined_workerid[i];
			
 
				 
			
 
				-			struct starpu_fifo_taskq_s *fifo;
			
 
				-			fifo = queue_array[local_worker];
			
 
				-		
			
 
				-			fifo->exp_end += predicted;
			
 
				-			fifo->exp_len += predicted;
			
 
				-		
			
 
				-			alias->predicted = predicted;
			
 
				+			alias->predicted = exp_end_predicted - worker_exp_end[local_worker];
			
 
				+	
			
 
				+			worker_exp_len[local_worker] += exp_end_predicted - worker_exp_end[local_worker];
			
 
				+			worker_exp_end[local_worker] = exp_end_predicted;
			
 
				+			worker_exp_start[local_worker] = exp_end_predicted - worker_exp_len[local_worker];
			
 
				 		
			
 
				-			if (prio)
			
 
				-			{
			
 
				-				ret |= _starpu_fifo_push_prio_task(queue_array[local_worker],
			
 
				-					&sched_mutex[local_worker], &sched_cond[local_worker], alias);
			
 
				-			}
			
 
				-			else {
			
 
				-				ret |= _starpu_fifo_push_task(queue_array[local_worker],
			
 
				-					&sched_mutex[local_worker], &sched_cond[local_worker], alias);
			
 
				-			}
			
 
				+			ntasks[local_worker]++;
			
 
				+	
			
 
				+			ret |= starpu_push_local_task(local_worker, alias, prio);
			
 
				 		}
			
 
				 
			
 
				-		PTHREAD_MUTEX_UNLOCK(&big_lock);
			
 
				-
			
 
				-		return ret;
			
 
				 	}
			
 
				+
			
 
				+	PTHREAD_MUTEX_UNLOCK(&big_lock);
			
 
				+
			
 
				+	return ret;
			
 
				 }
			
 
				 
			
 
				 static double compute_expected_end(int workerid, double length)
			
@@ -157,9 +137,7 @@ static double compute_expected_end(int workerid, double length)
 
				 	if (workerid < (int)nworkers)
			
 
				 	{
			
 
				 		/* This is a basic worker */
			
 
				-		struct starpu_fifo_taskq_s *fifo;
			
 
				-		fifo = queue_array[workerid];
			
 
				-		return (fifo->exp_start + fifo->exp_len + length);
			
 
				+		return worker_exp_start[workerid] + worker_exp_len[workerid] + length;
			
 
				 	}
			
 
				 	else {
			
 
				 		/* This is a combined worker, the expected end is the end for the latest worker */
			
@@ -172,9 +150,9 @@ static double compute_expected_end(int workerid, double length)
 
				 		int i;
			
 
				 		for (i = 0; i < worker_size; i++)
			
 
				 		{
			
 
				-			struct starpu_fifo_taskq_s *fifo;
			
 
				-			fifo = queue_array[combined_workerid[i]];
			
 
				-			double local_exp_end = (fifo->exp_start + fifo->exp_len + length);
			
 
				+			double local_exp_start = worker_exp_start[combined_workerid[i]];
			
 
				+			double local_exp_len = worker_exp_len[combined_workerid[i]];
			
 
				+			double local_exp_end = local_exp_start + local_exp_len + length;
			
 
				 			exp_end = STARPU_MAX(exp_end, local_exp_end);
			
 
				 		}
			
 
				 
			
@@ -184,41 +162,34 @@ static double compute_expected_end(int workerid, double length)
 
				 
			
 
				 static double compute_ntasks_end(int workerid)
			
 
				 {
			
 
				-  enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(workerid);
			
 
				-  if (workerid < (int)nworkers)
			
 
				-    {
			
 
				-      /* This is a basic worker */
			
 
				-      struct starpu_fifo_taskq_s *fifo;
			
 
				-      fifo = queue_array[workerid];
			
 
				-      return fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
			
 
				-    }
			
 
				-  else {
			
 
				-    /* This is a combined worker, the expected end is the end for the latest worker */
			
 
				-    int worker_size;
			
 
				-    int *combined_workerid;
			
 
				-    starpu_combined_worker_get_description(workerid, &worker_size, &combined_workerid);
			
 
				-
			
 
				-    int ntasks_end;
			
 
				-
			
 
				-    int i;
			
 
				-    for (i = 0; i < worker_size; i++)
			
 
				-      {
			
 
				-	struct starpu_fifo_taskq_s *fifo;
			
 
				-	fifo = queue_array[combined_workerid[i]];
			
 
				-	/* XXX: this is actually bogus: not all pushed tasks are necessarily parallel... */
			
 
				-	ntasks_end = STARPU_MAX(ntasks_end, fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch));
			
 
				-      }
			
 
				-
			
 
				-    return ntasks_end;
			
 
				-  }
			
 
				+	enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(workerid);
			
 
				+	if (workerid < (int)nworkers)
			
 
				+	{
			
 
				+		/* This is a basic worker */
			
 
				+		return ntasks[workerid] / starpu_worker_get_relative_speedup(perf_arch);
			
 
				+	}
			
 
				+	else {
			
 
				+		/* This is a combined worker, the expected end is the end for the latest worker */
			
 
				+		int worker_size;
			
 
				+		int *combined_workerid;
			
 
				+		starpu_combined_worker_get_description(workerid, &worker_size, &combined_workerid);
			
 
				+
			
 
				+		int ntasks_end=0;
			
 
				+
			
 
				+		int i;
			
 
				+		for (i = 0; i < worker_size; i++)
			
 
				+		{
			
 
				+			/* XXX: this is actually bogus: not all pushed tasks are necessarily parallel... */
			
 
				+			ntasks_end = STARPU_MAX(ntasks_end, ntasks[combined_workerid[i]] / starpu_worker_get_relative_speedup(perf_arch));
			
 
				+		}
			
 
				+
			
 
				+		return ntasks_end;
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				-static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, struct starpu_sched_ctx *sched_ctx)
			
 
				+static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio)
			
 
				 {
			
 
				-	/* find the queue */
			
 
				-	struct starpu_fifo_taskq_s *fifo;
			
 
				-	unsigned worker, worker_in_ctx;
			
 
				-
			
 
				+	unsigned worker;
			
 
				 	int best = -1;
			
 
				 	
			
 
				 	/* this flag is set if the corresponding worker is selected because
			
@@ -228,7 +199,7 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, str
 
				 	double local_task_length[nworkers+ncombinedworkers];
			
 
				 	double local_data_penalty[nworkers+ncombinedworkers];
			
 
				 	double local_power[nworkers+ncombinedworkers];
			
 
				-	double exp_end[nworkers+ncombinedworkers];
			
 
				+	double local_exp_end[nworkers+ncombinedworkers];
			
 
				 	double fitness[nworkers+ncombinedworkers];
			
 
				 
			
 
				 	double max_exp_end = 0.0;
			
@@ -236,33 +207,26 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, str
 
				 	int skip_worker[nworkers+ncombinedworkers];
			
 
				 
			
 
				 	double best_exp_end = DBL_MAX;
			
 
				-	double model_best = 0.0;
			
 
				-	double penality_best = 0.0;
			
 
				+	//double penality_best = 0.0;
			
 
				 
			
 
				 	int ntasks_best = -1;
			
 
				 	double ntasks_best_end = 0.0;
			
 
				 	int calibrating = 0;
			
 
				 
			
 
				-        /* A priori, we know all estimations */
			
 
				+	/* A priori, we know all estimations */
			
 
				 	int unknown = 0;
			
 
				 
			
 
				-	for (worker_in_ctx = 0; worker_in_ctx < nworkers; worker_in_ctx++)
			
 
				+	for (worker = 0; worker < nworkers; worker++)
			
 
				 	{
			
 
				-                worker = sched_ctx->workerid[worker_in_ctx];
			
 
				-
			
 
				-		fifo = queue_array[worker];
			
 
				-
			
 
				 		/* Sometimes workers didn't take the tasks as early as we expected */
			
 
				-		fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
			
 
				-		fifo->exp_end = fifo->exp_start + fifo->exp_len;
			
 
				-		if (fifo->exp_end > max_exp_end)
			
 
				-			max_exp_end = fifo->exp_end;
			
 
				+		worker_exp_start[worker] = STARPU_MAX(worker_exp_start[worker], starpu_timing_now());
			
 
				+		worker_exp_end[worker] = worker_exp_start[worker] + worker_exp_len[worker];
			
 
				+		if (worker_exp_end[worker] > max_exp_end)
			
 
				+			max_exp_end = worker_exp_end[worker];
			
 
				 	}
			
 
				 
			
 
				-	for (worker_in_ctx = 0; worker_in_ctx < nworkers; worker_in_ctx++)
			
 
				+	for (worker = 0; worker < (nworkers+ncombinedworkers); worker++)
			
 
				 	{
			
 
				-                worker = sched_ctx->workerid[worker_in_ctx];
			
 
				-
			
 
				 		if (!starpu_combined_worker_may_execute_task(worker, task))
			
 
				 		{
			
 
				 			/* no one on that queue may execute this task */
			
@@ -304,12 +268,14 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, str
 
				 		if (unknown)
			
 
				 			continue;
			
 
				 
			
 
				-		exp_end[worker] = compute_expected_end(worker, local_task_length[worker]);
			
 
				+		local_exp_end[worker] = compute_expected_end(worker, local_task_length[worker]);
			
 
				+
			
 
				+		//fprintf(stderr, "WORKER %d -> length %e end %e\n", worker, local_task_length[worker], local_exp_end[worker]);
			
 
				 
			
 
				-		if (exp_end[worker] < best_exp_end)
			
 
				+		if (local_exp_end[worker] < best_exp_end)
			
 
				 		{
			
 
				 			/* a better solution was found */
			
 
				-			best_exp_end = exp_end[worker];
			
 
				+			best_exp_end = local_exp_end[worker];
			
 
				 		}
			
 
				 
			
 
				 		local_power[worker] = starpu_task_expected_power(task, perf_arch);
			
@@ -321,13 +287,12 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, str
 
				 		forced_best = ntasks_best;
			
 
				 
			
 
				 	double best_fitness = -1;
			
 
				-	
			
 
				+
			
 
				+
			
 
				 	if (forced_best == -1)
			
 
				 	{
			
 
				-
			
 
				-	        for (worker_in_ctx = 0; worker_in_ctx < nworkers; worker_in_ctx++)
			
 
				-	        {
			
 
				-		        worker = sched_ctx->workerid[worker_in_ctx];
			
 
				+		for (worker = 0; worker < nworkers+ncombinedworkers; worker++)
			
 
				+		{
			
 
				 
			
 
				 			if (skip_worker[worker])
			
 
				 			{
			
@@ -335,15 +300,15 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, str
 
				 				continue;
			
 
				 			}
			
 
				 	
			
 
				-			fitness[worker] = alpha*(exp_end[worker] - best_exp_end) 
			
 
				+			fitness[worker] = alpha*(local_exp_end[worker] - best_exp_end) 
			
 
				 					+ beta*(local_data_penalty[worker])
			
 
				 					+ _gamma*(local_power[worker]);
			
 
				 
			
 
				-			if (exp_end[worker] > max_exp_end)
			
 
				+			if (local_exp_end[worker] > max_exp_end)
			
 
				 				/* This placement will make the computation
			
 
				 				 * longer, take into account the idle
			
 
				 				 * consumption of other cpus */
			
 
				-				fitness[worker] += _gamma * idle_power * (exp_end[worker] - max_exp_end) / 1000000.0;
			
 
				+				fitness[worker] += _gamma * idle_power * (local_exp_end[worker] - max_exp_end) / 1000000.0;
			
 
				 
			
 
				 			if (best == -1 || fitness[worker] < best_fitness)
			
 
				 			{
			
@@ -351,53 +316,44 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, str
 
				 				best_fitness = fitness[worker];
			
 
				 				best = worker;
			
 
				 			}
			
 
				+
			
 
				+		//	fprintf(stderr, "FITNESS worker %d -> %e local_exp_end %e - local_data_penalty %e\n", worker, fitness[worker], local_exp_end[worker] - best_exp_end, local_data_penalty[worker]);
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				 	STARPU_ASSERT(forced_best != -1 || best != -1);
			
 
				-	
			
 
				+
			
 
				 	if (forced_best != -1)
			
 
				 	{
			
 
				 		/* there is no prediction available for that task
			
 
				 		 * with that arch we want to speed-up calibration time
			
 
				 		 * so we force this measurement */
			
 
				 		best = forced_best;
			
 
				-		model_best = 0.0;
			
 
				-		penality_best = 0.0;
			
 
				+		//penality_best = 0.0;
			
 
				+		best_exp_end = local_exp_end[best];
			
 
				 	}
			
 
				 	else 
			
 
				 	{
			
 
				-		model_best = local_task_length[best];
			
 
				-		penality_best = local_data_penalty[best];
			
 
				+                //penality_best = local_data_penalty[best];
			
 
				+		best_exp_end = local_exp_end[best];
			
 
				 	}
			
 
				 
			
 
				 	/* we should now have the best worker in variable "best" */
			
 
				-	return push_task_on_best_worker(task, best, model_best, prio);
			
 
				+	return push_task_on_best_worker(task, best, best_exp_end, prio);
			
 
				 }
			
 
				 
			
 
				-static int parallel_heft_push_prio_task(struct starpu_task *task, unsigned sched_ctx_id)
			
 
				+static int parallel_heft_push_task(struct starpu_task *task)
			
 
				 {
			
 
				-	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx(sched_ctx_id);
			
 
				-
			
 
				-	return _parallel_heft_push_task(task, 1, sched_ctx);
			
 
				-}
			
 
				-
			
 
				-static int parallel_heft_push_task(struct starpu_task *task, unsigned sched_ctx_id)
			
 
				-{ 
			
 
				-	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx(sched_ctx_id);
			
 
				 	if (task->priority == STARPU_MAX_PRIO)
			
 
				-	  return _parallel_heft_push_task(task, 1, sched_ctx);
			
 
				+		return _parallel_heft_push_task(task, 1);
			
 
				 
			
 
				-	return _parallel_heft_push_task(task, 0, sched_ctx);
			
 
				+	return _parallel_heft_push_task(task, 0);
			
 
				 }
			
 
				 
			
 
				-static void initialize_parallel_heft_policy(unsigned sched_ctx_id) 
			
 
				+static void initialize_parallel_heft_policy(struct starpu_machine_topology_s *topology, 
			
 
				+	 __attribute__ ((unused)) struct starpu_sched_policy_s *_policy) 
			
 
				 {
			
 
				-	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx(sched_ctx_id);
			
 
				-
			
 
				-	nworkers = sched_ctx->nworkers_in_ctx;
			
 
				-	struct starpu_machine_config_s *config = _starpu_get_machine_config();
			
 
				-	struct starpu_machine_topology_s *topology = &config->topology;
			
 
				+	nworkers = topology->nworkers;
			
 
				 
			
 
				 	const char *strval_alpha = getenv("STARPU_SCHED_ALPHA");
			
 
				 	if (strval_alpha)
			
@@ -419,11 +375,13 @@ static void initialize_parallel_heft_policy(unsigned sched_ctx_id)
 
				 
			
 
				 	ncombinedworkers = topology->ncombinedworkers;
			
 
				 
			
 
				-	unsigned workerid, workerid_ctx;
			
 
				-	for (workerid_ctx = 0; workerid_ctx < nworkers; workerid_ctx++)
			
 
				+	unsigned workerid;
			
 
				+	for (workerid = 0; workerid < nworkers; workerid++)
			
 
				 	{
			
 
				-                workerid = sched_ctx->workerid[workerid_ctx];
			
 
				-		queue_array[workerid] = _starpu_create_fifo();
			
 
				+		worker_exp_start[workerid] = starpu_timing_now();
			
 
				+		worker_exp_len[workerid] = 0.0;
			
 
				+		worker_exp_end[workerid] = worker_exp_start[workerid]; 
			
 
				+		ntasks[workerid] = 0;
			
 
				 	
			
 
				 		PTHREAD_MUTEX_INIT(&sched_mutex[workerid], NULL);
			
 
				 		PTHREAD_COND_INIT(&sched_cond[workerid], NULL);
			
@@ -435,52 +393,33 @@ static void initialize_parallel_heft_policy(unsigned sched_ctx_id)
 
				 
			
 
				 	/* We pre-compute an array of all the perfmodel archs that are applicable */
			
 
				 	unsigned total_worker_count = nworkers + ncombinedworkers;
			
 
				-	printf("ncombinedworkers = %d\n", ncombinedworkers);
			
 
				+
			
 
				 	unsigned used_perf_archtypes[STARPU_NARCH_VARIATIONS];
			
 
				 	memset(used_perf_archtypes, 0, sizeof(used_perf_archtypes));
			
 
				 
			
 
				-	int nworkers_machine = topology->nworkers;
			
 
				-
			
 
				-	for (workerid_ctx = 0; workerid_ctx < total_worker_count; workerid_ctx++)
			
 
				+	for (workerid = 0; workerid < total_worker_count; workerid++)
			
 
				 	{
			
 
				-	  workerid = (unsigned)workerid_ctx >= nworkers ? (nworkers_machine + (unsigned)workerid_ctx - nworkers) : sched_ctx->workerid[workerid_ctx];
			
 
				-	  printf("workerid = %d\n", workerid);
			
 
				 		enum starpu_perf_archtype perf_archtype = starpu_worker_get_perf_archtype(workerid);
			
 
				-		printf("perf_archtype = %d\n", perf_archtype);
			
 
				 		used_perf_archtypes[perf_archtype] = 1;
			
 
				 	}
			
 
				 
			
 
				-	napplicable_perf_archtypes = 0;
			
 
				-
			
 
				-	int arch;
			
 
				-	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++)
			
 
				-	{
			
 
				-		if (used_perf_archtypes[arch])
			
 
				-			applicable_perf_archtypes[napplicable_perf_archtypes++] = arch;
			
 
				-	}
			
 
				-}
			
 
				+//	napplicable_perf_archtypes = 0;
			
 
				 
			
 
				-static void deinitialize_parallel_heft_policy(unsigned sched_ctx_id) 
			
 
				-{
			
 
				-	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx(sched_ctx_id);
			
 
				-
			
 
				-	unsigned workerid;
			
 
				-	int workerid_in_ctx;
			
 
				-        int nworkers = sched_ctx->nworkers_in_ctx;
			
 
				-	for (workerid_in_ctx = 0; workerid_in_ctx < nworkers; workerid_in_ctx++){
			
 
				-                workerid = sched_ctx->workerid[workerid_in_ctx];
			
 
				-		_starpu_destroy_fifo(queue_array[workerid]);
			
 
				-	}
			
 
				+//	int arch;
			
 
				+//	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++)
			
 
				+//	{
			
 
				+//		if (used_perf_archtypes[arch])
			
 
				+//			applicable_perf_archtypes[napplicable_perf_archtypes++] = arch;
			
 
				+//	}
			
 
				 }
			
 
				 
			
 
				 /* TODO: use post_exec_hook to fix the expected start */
			
 
				 struct starpu_sched_policy_s _starpu_sched_parallel_heft_policy = {
			
 
				 	.init_sched = initialize_parallel_heft_policy,
			
 
				-	.deinit_sched = deinitialize_parallel_heft_policy,
			
 
				+	.deinit_sched = NULL,
			
 
				 	.push_task = parallel_heft_push_task, 
			
 
				-	.push_prio_task = parallel_heft_push_prio_task, 
			
 
				-	.pop_task = parallel_heft_pop_task,
			
 
				-	.post_exec_hook = NULL,
			
 
				+	.pop_task = NULL,
			
 
				+	.post_exec_hook = parallel_heft_post_exec_hook,
			
 
				 	.pop_every_task = NULL,
			
 
				 	.policy_name = "pheft",
			
 
				 	.policy_description = "parallel HEFT"
			
--- a/src/sched_policies/random_policy.c
+++ b/src/sched_policies/random_policy.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -65,18 +65,12 @@ static int _random_push_task(struct starpu_task *task, unsigned prio, struct sta
 
				 	return n;
			
 
				 }
			
 
				 
			
 
				-static int random_push_prio_task(struct starpu_task *task, unsigned sched_ctx_id)
			
 
				-{	
			
 
				-	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx(sched_ctx_id);
			
 
				-
			
 
				-        return _random_push_task(task, 1, sched_ctx);
			
 
				-}
			
 
				 
			
 
				 static int random_push_task(struct starpu_task *task, unsigned sched_ctx_id)
			
 
				 {
			
 
				 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx(sched_ctx_id);
			
 
				 
			
 
				-        return _random_push_task(task, 0, sched_ctx);
			
 
				+    return _random_push_task(task, 0, sched_ctx);
			
 
				 }
			
 
				 
			
 
				 static void initialize_random_policy_for_workers(unsigned sched_ctx_id, unsigned nnew_workers) 
			
@@ -129,8 +123,6 @@ struct starpu_sched_policy_s _starpu_sched_random_policy = {
 
				 	.init_sched_for_workers = initialize_random_policy_for_workers,
			
 
				 	.deinit_sched = NULL,
			
 
				 	.push_task = random_push_task,
			
 
				-	.push_prio_task = random_push_prio_task,
			
 
				-	.push_task_notify = NULL,
			
 
				 	.pop_task = NULL,
			
 
				 	.post_exec_hook = NULL,
			
 
				 	.pop_every_task = NULL,
			
--- a/src/sched_policies/stack_queues.c
+++ b/src/sched_policies/stack_queues.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -57,27 +57,16 @@ unsigned _starpu_get_stack_nprocessed(struct starpu_stack_jobq_s *stack_queue)
 
				 	return stack_queue->nprocessed;
			
 
				 }
			
 
				 
			
 
				-void _starpu_stack_push_prio_task(struct starpu_stack_jobq_s *stack_queue, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, starpu_job_t task)
			
 
				-{
			
 
				-	PTHREAD_MUTEX_LOCK(sched_mutex);
			
 
				-	total_number_of_jobs++;
			
 
				-
			
 
				-	STARPU_TRACE_JOB_PUSH(task, 0);
			
 
				-	starpu_job_list_push_back(stack_queue->jobq, task);
			
 
				-	stack_queue->njobs++;
			
 
				-	stack_queue->nprocessed++;
			
 
				-
			
 
				-	PTHREAD_COND_SIGNAL(sched_cond);
			
 
				-	PTHREAD_MUTEX_UNLOCK(sched_mutex);
			
 
				-}
			
 
				-
			
 
				 void _starpu_stack_push_task(struct starpu_stack_jobq_s *stack_queue, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, starpu_job_t task)
			
 
				 {
			
 
				 	PTHREAD_MUTEX_LOCK(sched_mutex);
			
 
				 	total_number_of_jobs++;
			
 
				 
			
 
				 	STARPU_TRACE_JOB_PUSH(task, 0);
			
 
				-	starpu_job_list_push_front(stack_queue->jobq, task);
			
 
				+	if (task->task->priority)
			
 
				+		starpu_job_list_push_back(stack_queue->jobq, task);
			
 
				+	else
			
 
				+		starpu_job_list_push_front(stack_queue->jobq, task);
			
 
				 	stack_queue->njobs++;
			
 
				 	stack_queue->nprocessed++;
			
 
				 
			
--- a/src/sched_policies/stack_queues.h
+++ b/src/sched_policies/stack_queues.h
@@ -42,7 +42,6 @@ struct starpu_stack_jobq_s {
 
				 struct starpu_stack_jobq_s *_starpu_create_stack(void);
			
 
				 
			
 
				 void _starpu_stack_push_task(struct starpu_stack_jobq_s *stack, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, starpu_job_t task);
			
 
				-void _starpu_stack_push_prio_task(struct starpu_stack_jobq_s *stack, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, starpu_job_t task);
			
 
				 
			
 
				 starpu_job_t _starpu_stack_pop_task(struct starpu_stack_jobq_s *stack, pthread_mutex_t *sched_mutex, int workerid);
			
 
				 
			
--- a/src/sched_policies/work_stealing_policy.c
+++ b/src/sched_policies/work_stealing_policy.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -129,7 +129,9 @@ static struct starpu_deque_jobq_s *select_workerq(work_stealing_data *ws, unsign
 
				 
			
 
				 #endif
			
 
				 
			
 
				+#ifdef STARPU_DEVEL
			
 
				 #warning TODO rewrite ... this will not scale at all now
			
 
				+#endif
			
 
				 static struct starpu_task *ws_pop_task(unsigned sched_ctx_id)
			
 
				 {
			
 
				 	struct starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx(sched_ctx_id);
			
@@ -252,7 +254,6 @@ struct starpu_sched_policy_s _starpu_sched_ws_policy = {
 
				 	.init_sched = initialize_ws_policy,
			
 
				 	.deinit_sched = NULL,
			
 
				 	.push_task = ws_push_task,
			
 
				-	.push_prio_task = ws_push_task,
			
 
				 	.pop_task = ws_pop_task,
			
 
				 	.post_exec_hook = NULL,
			
 
				 	.pop_every_task = NULL,
			
--- a/src/top/starpu_top.c
+++ b/src/top/starpu_top.c
@@ -0,0 +1,756 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2011 William Braik, Yann Courtois, Jean-Marie Couteyen, Anthony
			
 
				+ * Roy
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+
			
 
				+#include <starpu_top.h>
			
 
				+#include <top/starputop_message_queue.h>
			
 
				+#include <top/starputop_connection.h>
			
 
				+#include <profiling/profiling.h>
			
 
				+#include <stdio.h>
			
 
				+#include <string.h>
			
 
				+#include <math.h>
			
 
				+#include <pthread.h>
			
 
				+#include <common/timing.h>
			
 
				+
			
 
				+extern starputop_message_queue_t*  starputop_mt;
			
 
				+int starpu_top = 0;
			
 
				+int starputop_debug_on = 0;
			
 
				+unsigned int starputop_data_cpt = 0;
			
 
				+unsigned int starputop_param_cpt = 0;
			
 
				+starputop_data* starputop_first_data = NULL;
			
 
				+starputop_param* starputop_first_param = NULL;
			
 
				+starputop_data** starputop_datas;
			
 
				+starputop_param** starputop_params;
			
 
				+
			
 
				+sem_t starputop_wait_for_go;
			
 
				+pthread_mutex_t starputop_wait_for_continue_mutex;
			
 
				+pthread_cond_t starputop_wait_for_continue_cond = PTHREAD_COND_INITIALIZER;
			
 
				+
			
 
				+/* Report whether StarPU-Top is switched on: returns the current value of the
				+ * global starpu_top flag. */
				+int starpu_top_status_get()
				+{
				+	const int status = starpu_top;
				+
				+	return status;
				+}
			
 
				+
			
 
				+
			
 
				+unsigned long long int current_timestamp();
			
 
				+
			
 
				+/*********************************************
			
 
				+*****************INIT FUNC********************
			
 
				+**********************************************/
			
 
				+
			
 
				+char *message_for_topdata_init(starputop_data* data);
			
 
				+char *message_for_topparam_init(starputop_param* param);
			
 
				+
			
 
				+/*
				+ * Snapshot the linked lists of registered data and parameters into the flat
				+ * arrays starputop_datas and starputop_params, so that an entry can be
				+ * reached by index in O(1) while the program is running.
				+ * Both lists are walked for exactly starputop_data_cpt / starputop_param_cpt
				+ * nodes.
				+ */
				+void copy_data_and_param()
				+{
				+	unsigned int idx;
				+	starputop_data* data_node;
				+	starputop_param* param_node;
				+
				+	printf("%s:%d trace\n", __FILE__, __LINE__);
				+
				+	//one slot per registered data, filled in list order
				+	starputop_datas = malloc(starputop_data_cpt*sizeof(starputop_data*));
				+	data_node = starputop_first_data;
				+	idx = 0;
				+	while(idx < starputop_data_cpt)
				+	{
				+		starputop_datas[idx] = data_node;
				+		data_node = data_node->next;
				+		idx++;
				+	}
				+
				+	//one slot per registered parameter, filled in list order
				+	starputop_params = malloc(starputop_param_cpt*sizeof(starputop_param*));
				+	param_node = starputop_first_param;
				+	idx = 0;
				+	while(idx < starputop_param_cpt)
				+	{
				+		starputop_params[idx] = param_node;
				+		param_node = param_node->next;
				+		idx++;
				+	}
				+}
			
 
				+
			
 
				+/*
				+ * Write the textual name of the architecture of worker `id` into `type`.
				+ * Exactly 9 bytes are written (the name followed by zero padding), so `type`
				+ * must be at least 9 bytes long. Worker types that are not listed below are
				+ * reported as "UNKNOWN" instead of leaving `type` uninitialized.
				+ */
				+static void starputop_get_device_type(int id, char* type){
				+	const char* label;
				+
				+	switch (starpu_worker_get_type(id))
				+	{
				+	case STARPU_CPU_WORKER:
				+		label = "CPU";
				+		break;
				+	case STARPU_CUDA_WORKER:
				+		label = "CUDA";
				+		break;
				+	case STARPU_OPENCL_WORKER:
				+		label = "OPENCL";
				+		break;
				+	case STARPU_GORDON_WORKER:
				+		label = "GORDON";
				+		break;
				+	default:
				+		label = "UNKNOWN";
				+		break;
				+	}
				+	//every label is shorter than 9 characters, so strncpy pads with NULs and
				+	//the result is always terminated
				+	strncpy(type, label, 9);
				+}
			
 
				+
			
 
				+/*
				+ * Queue the device list for the UI: a "DEV" line, one "<id>;<type>;<name>"
				+ * line per worker, then a "/DEV" line.
				+ * Each message is heap-allocated and handed to starputop_message_add();
				+ * it is not freed here (presumably the queue takes ownership -- confirm).
				+ */
				+static void starputop_send_devices_info()
				+{
				+	//the worker count is read once instead of on every loop iteration
				+	const unsigned int nworkers = starpu_worker_get_count();
				+	unsigned int i;
				+	char* message;
				+
				+	message=malloc(5*sizeof(char));
				+	snprintf(message,5,"DEV\n");
				+	starputop_message_add(starputop_mt,message);
				+
				+	for(i=0;i<nworkers;i++)
				+	{
				+		char dev_type[10];
				+		char dev_name[64];
				+
				+		starputop_get_device_type(i,dev_type);
				+		starpu_worker_get_name(i, dev_name,64);
				+
				+		message=malloc(sizeof(char)*128);
				+		//i is unsigned, so it is printed with %u
				+		snprintf(message, 128, "%u;%s;%s\n", i, dev_type, dev_name);
				+		starputop_message_add(starputop_mt,message);
				+	}
				+
				+	message=malloc(6*sizeof(char));
				+	snprintf(message,6,"/DEV\n");
				+	starputop_message_add(starputop_mt,message);
				+}
			
 
				+
			
 
				+
			
 
				+/* Queue a freshly allocated copy of `text` on the outgoing message queue. */
				+static void starputop_send_text(const char* text)
				+{
				+	char* copy = malloc(strlen(text)+1);
				+	strcpy(copy, text);
				+	starputop_message_add(starputop_mt,copy);
				+}
				+
				+/*
				+ * Switch StarPU-Top on, start the network threads, queue the whole
				+ * initialization dialogue (SERVERINFO, DATA, PARAMS, DEV, READY) and then
				+ * block the calling thread on the starputop_wait_for_go semaphore.
				+ *
				+ * server_name is sent on its own line inside the SERVERINFO section; it must
				+ * be a valid NUL-terminated string.
				+ */
				+void starputop_init_and_wait(const char* server_name){
				+	char* message;
				+	starputop_data * cur_data;
				+	starputop_param * cur_param;
				+
				+	starpu_top=1;
				+	sem_init(&starputop_wait_for_go,0,0);
				+
				+	pthread_mutex_init(&starputop_wait_for_continue_mutex, NULL);
				+
				+	//profiling activation
				+	starpu_profiling_status_set(STARPU_PROFILING_ENABLE);
				+
				+	//init locked queue before adding the first message
				+	starputop_mt = starputop_message_queue_new();
				+
				+	//waiting for UI to connect
				+	printf("%s:%d launching network threads\n", __FILE__, __LINE__);
				+	starputop_communications_threads_launcher();
				+
				+	//sending server information (report to protocol)
				+	starputop_send_text("SERVERINFO\n");
				+	message = malloc(strlen(server_name)+2);
				+	sprintf(message, "%s\n", server_name);
				+	starputop_message_add(starputop_mt,message);
				+	//current_timestamp() returns unsigned long long, hence %llu;
				+	//at most 20 digits + "\n" + NUL fit in the 25 bytes
				+	message = malloc(25);
				+	snprintf(message, 25, "%llu\n", current_timestamp());
				+	starputop_message_add(starputop_mt,message);
				+	starputop_send_text("/SERVERINFO\n");
				+
				+	//sending data list
				+	starputop_send_text("DATA\n");
				+	for(cur_data = starputop_first_data; cur_data != NULL; cur_data = cur_data->next)
				+	{
				+		starputop_message_add(starputop_mt,message_for_topdata_init(cur_data));
				+	}
				+	starputop_send_text("/DATA\n");
				+
				+	//sending parameter list
				+	starputop_send_text("PARAMS\n");
				+	printf("%s:%d sending parameters\n", __FILE__, __LINE__);
				+	for(cur_param = starputop_first_param; cur_param != NULL; cur_param = cur_param->next)
				+	{
				+		starputop_message_add(starputop_mt,message_for_topparam_init(cur_param));
				+	}
				+	printf("%s:%d parameters sended\n", __FILE__, __LINE__);
				+	starputop_send_text("/PARAMS\n");
				+
				+	//sending DEVICE list
				+	printf("%s:%d sending devices info\n", __FILE__, __LINE__);
				+	starputop_send_devices_info();
				+	printf("%s:%d devices_info sended\n", __FILE__, __LINE__);
				+
				+	//copying data and params
				+	copy_data_and_param();
				+
				+	//sending READY message
				+	starputop_send_text("READY\n");
				+
				+	//This thread stays blocked until the semaphore is posted
				+	//(starputop_unlock_starpu does so when a GO message arrives)
				+	printf("%s:%d waiting for GO message\n", __FILE__, __LINE__);
				+	sem_wait(&starputop_wait_for_go);
				+}
			
 
				+
			
 
				+/* Append `data` at the tail of the list rooted at starputop_first_data.
				+ * The `next` field of `data` is left untouched. */
				+void starputop_enqueue_data(starputop_data * data)
				+{
				+	//walk the chain of links until the one that is still empty
				+	starputop_data ** link = &starputop_first_data;
				+
				+	while(*link != NULL)
				+		link = &(*link)->next;
				+	*link = data;
				+}
			
 
				+
			
 
				+/*
				+ * Create a boolean data descriptor, give it the next data id
				+ * (starputop_data_cpt) and append it to the list of registered data.
				+ * data_name is stored by pointer, not copied. Returns the new descriptor.
				+ */
				+starputop_data * starputop_add_data_boolean(
				+			const char* data_name,
				+			int active)
				+{
				+	starputop_data * entry = malloc(sizeof(starputop_data));
				+
				+	entry->next = NULL;
				+	entry->type = STARPUTOP_DATA_BOOLEAN;
				+	entry->name = data_name;
				+	entry->active = active;
				+	entry->id = starputop_data_cpt++;
				+
				+	starputop_enqueue_data(entry);
				+	return entry;
				+}
			
 
				+
			
 
				+/*
				+ * Create an integer data descriptor with the given display range, give it
				+ * the next data id (starputop_data_cpt) and append it to the list of
				+ * registered data.
				+ * data_name is stored by pointer, not copied. Returns the new descriptor.
				+ */
				+starputop_data * starputop_add_data_integer(
				+			const char* data_name,
				+			int minimum_value,
				+			int maximum_value,
				+			int active)
				+{
				+	starputop_data * entry = malloc(sizeof(starputop_data));
				+
				+	entry->next = NULL;
				+	entry->type = STARPUTOP_DATA_INTEGER;
				+	entry->name = data_name;
				+	entry->int_min_value = minimum_value;
				+	entry->int_max_value = maximum_value;
				+	entry->active = active;
				+	entry->id = starputop_data_cpt++;
				+
				+	starputop_enqueue_data(entry);
				+	return entry;
				+}
			
 
				+
			
 
				+/*
				+ * Create a floating-point data descriptor with the given display range,
				+ * give it the next data id (starputop_data_cpt) and append it to the list of
				+ * registered data.
				+ * data_name is stored by pointer, not copied. Returns the new descriptor.
				+ */
				+starputop_data* starputop_add_data_float(
				+			const char* data_name,
				+			double minimum_value,
				+			double maximum_value,
				+			int active)
				+{
				+	starputop_data * entry = malloc(sizeof(starputop_data));
				+
				+	entry->next = NULL;
				+	entry->type = STARPUTOP_DATA_FLOAT;
				+	entry->name = data_name;
				+	entry->double_min_value = minimum_value;
				+	entry->double_max_value = maximum_value;
				+	entry->active = active;
				+	entry->id = starputop_data_cpt++;
				+
				+	starputop_enqueue_data(entry);
				+	return entry;
				+}
			
 
				+
			
 
				+/*
				+ * Build the heap-allocated line that declares `data` to the UI:
				+ *   BOOL;<id>;<name>;<active>
				+ *   INT;<id>;<name>;<min>;<max>;<active>
				+ *   FLOAT;<id>;<name>;<min>;<max>;<active>
				+ * each terminated by "\n". The caller receives the buffer.
				+ *
				+ * The buffer leaves room for two doubles printed with %f (up to 317
				+ * characters each) and all writes are bounded with snprintf. For a type that
				+ * is not handled below the result is an empty string rather than
				+ * uninitialized memory.
				+ */
				+char *message_for_topdata_init(starputop_data* data)
				+{
				+	const size_t size = 768+strlen(data->name);
				+	char*message = malloc(size);
				+
				+	message[0] = '\0';
				+	switch(data->type)
				+	{
				+		case STARPUTOP_DATA_BOOLEAN:
				+			snprintf(message, size,
				+					"BOOL;%d;%s;%d\n",
				+					data->id,
				+					data->name,
				+					data->active ? 1 : 0);
				+			break;
				+		case STARPUTOP_DATA_INTEGER:
				+			snprintf(message, size,
				+					"INT;%d;%s;%d;%d;%d\n",
				+					data->id,
				+					data->name,
				+					data->int_min_value,
				+					data->int_max_value,
				+					data->active ? 1 : 0);
				+			break;
				+		case STARPUTOP_DATA_FLOAT:
				+			snprintf(message, size,
				+					"FLOAT;%d;%s;%f;%f;%d\n",
				+					data->id,
				+					data->name,
				+					data->double_min_value,
				+					data->double_max_value,
				+					data->active ? 1 : 0);
				+			break;
				+		default:
				+			break;
				+	}
				+	return message;
				+}
			
 
				+
			
 
				+/*
				+ * Build the heap-allocated line that declares `param` to the UI:
				+ *   BOOL;<id>;<name>;<value>
				+ *   INT;<id>;<name>;<min>;<max>;<value>
				+ *   FLOAT;<id>;<name>;<min>;<max>;<value>
				+ *   ENUM;<id>;<name>;<value 0>;...;<value n-1>;<current index>
				+ * each terminated by "\n". Returns NULL for a type not handled below.
				+ *
				+ * The buffer size accounts for the parameter name, for every enum label and
				+ * for three doubles printed with %f (up to 317 characters each); formatted
				+ * writes are bounded with snprintf.
				+ */
				+char *message_for_topparam_init(starputop_param* param)
				+{
				+	char*message = NULL;
				+	char* cur;
				+	size_t size = 1024+strlen(param->name);
				+	int i;
				+
				+	switch(param->type)
				+	{
				+	case STARPUTOP_PARAM_BOOLEAN:
				+		message = malloc(size);
				+		snprintf(message, size,
				+				"BOOL;%d;%s;%d\n",
				+				param->id,
				+				param->name,
				+				(*(int*)(param->value)) ? 1 : 0);
				+		break;
				+	case STARPUTOP_PARAM_INTEGER:
				+		message = malloc(size);
				+		snprintf(message, size,
				+				"INT;%d;%s;%d;%d;%d\n",param->id,
				+				param->name,
				+				param->int_min_value,
				+				param->int_max_value,
				+				*(int*)(param->value));
				+		break;
				+	case STARPUTOP_PARAM_FLOAT:
				+		message = malloc(size);
				+		snprintf(message, size,
				+				"FLOAT;%d;%s;%f;%f;%f\n",
				+				param->id,
				+				param->name,
				+				param->double_min_value,
				+				param->double_max_value,
				+				*(double*)(param->value));
				+		break;
				+	case STARPUTOP_PARAM_ENUM:
				+		//each label is followed by one ';'
				+		for(i = 0; i < param->nb_values; i++)
				+		{
				+			size += strlen(param->enum_values[i])+1;
				+		}
				+		message = malloc(size);
				+		snprintf(message, size,
				+				"ENUM;%d;%s;",
				+				param->id,
				+				param->name);
				+
				+		//append the labels right after the header
				+		cur = message+strlen(message);
				+		for(i = 0; i < param->nb_values; i++)
				+		{
				+			const size_t label_length = strlen(param->enum_values[i]);
				+
				+			memcpy(cur, param->enum_values[i], label_length);
				+			cur += label_length;
				+			*cur=';';
				+			cur++;
				+		}
				+		//current index, bounded by what is left of the buffer
				+		snprintf(cur, size-(size_t)(cur-message),
				+				"%d\n",
				+				*((int*)(param->value)));
				+		break;
				+	default:
				+		break;
				+	}
				+	return message;
				+}
			
 
				+
			
 
				+/* Append `param` at the tail of the list rooted at starputop_first_param.
				+ * The `next` field of `param` is left untouched. */
				+void starputop_enqueue_param(starputop_param* param)
				+{
				+	//walk the chain of links until the one that is still empty
				+	starputop_param ** link = &starputop_first_param;
				+
				+	while(*link != NULL)
				+		link = &(*link)->next;
				+	*link = param;
				+}
			
 
				+
			
 
				+
			
 
				+/*
				+ * Register a boolean parameter backed by *parameter_field.
				+ * Asserts that StarPU-Top is not switched on yet, then creates the
				+ * descriptor, gives it the next parameter id (starputop_param_cpt) and
				+ * appends it to the parameter list. param_name and parameter_field are
				+ * stored by pointer. Returns the new descriptor.
				+ */
				+starputop_param* starputop_register_parameter_boolean(
				+			const char* param_name,
				+			int* parameter_field,
				+			void (*callback)(struct starputop_param_t*))
				+{
				+	starputop_param * entry;
				+
				+	STARPU_ASSERT(!starpu_top_status_get());
				+
				+	entry = malloc(sizeof(starputop_param));
				+	entry->next = NULL;
				+	entry->type = STARPUTOP_PARAM_BOOLEAN;
				+	entry->name = param_name;
				+	entry->value = (void*)parameter_field;
				+	entry->callback = callback;
				+	entry->id = starputop_param_cpt++;
				+
				+	starputop_enqueue_param(entry);
				+	return entry;
				+}
			
 
				+
			
 
				+
			
 
				+/*
				+ * Register an integer parameter backed by *parameter_field, with the given
				+ * minimum and maximum.
				+ * Asserts that StarPU-Top is not switched on yet, then creates the
				+ * descriptor, gives it the next parameter id (starputop_param_cpt) and
				+ * appends it to the parameter list. param_name and parameter_field are
				+ * stored by pointer. Returns the new descriptor.
				+ */
				+starputop_param* starputop_register_parameter_integer(const char* param_name,
				+			int* parameter_field,
				+			int minimum_value,
				+			int maximum_value,
				+			void (*callback)(struct starputop_param_t*))
				+{
				+	starputop_param * entry;
				+
				+	STARPU_ASSERT(!starpu_top_status_get());
				+
				+	entry = malloc(sizeof(starputop_param));
				+	entry->next = NULL;
				+	entry->type = STARPUTOP_PARAM_INTEGER;
				+	entry->name = param_name;
				+	entry->value = (void*)parameter_field;
				+	entry->int_min_value = minimum_value;
				+	entry->int_max_value = maximum_value;
				+	entry->callback = callback;
				+	entry->id = starputop_param_cpt++;
				+
				+	starputop_enqueue_param(entry);
				+	return entry;
				+}
			
 
				+/*
				+ * Register a floating-point parameter backed by *parameter_field, with the
				+ * given minimum and maximum.
				+ * Asserts that StarPU-Top is not switched on yet, then creates the
				+ * descriptor, gives it the next parameter id (starputop_param_cpt) and
				+ * appends it to the parameter list. param_name and parameter_field are
				+ * stored by pointer. Returns the new descriptor.
				+ */
				+starputop_param* starputop_register_parameter_float(
				+			const char* param_name,
				+			double* parameter_field,
				+			double minimum_value,
				+			double maximum_value,
				+			void (*callback)(struct starputop_param_t*))
				+{
				+	starputop_param * entry;
				+
				+	STARPU_ASSERT(!starpu_top_status_get());
				+
				+	entry = malloc(sizeof(starputop_param));
				+	entry->next = NULL;
				+	entry->type = STARPUTOP_PARAM_FLOAT;
				+	entry->name = param_name;
				+	entry->value = (void*)parameter_field;
				+	entry->double_min_value = minimum_value;
				+	entry->double_max_value = maximum_value;
				+	entry->callback = callback;
				+	entry->id = starputop_param_cpt++;
				+
				+	starputop_enqueue_param(entry);
				+	return entry;
				+}
			
 
				+
			
 
				+/*
				+ * Register an enumerated parameter: *parameter_field holds an int and
				+ * `values` holds nb_values labels.
				+ * Asserts that StarPU-Top is not switched on yet, then creates the
				+ * descriptor, gives it the next parameter id (starputop_param_cpt) and
				+ * appends it to the parameter list. param_name, parameter_field and values
				+ * are stored by pointer. Returns the new descriptor.
				+ */
				+starputop_param* starputop_register_parameter_enum(
				+			const char* param_name,
				+			int* parameter_field,
				+			char** values,
				+			int nb_values,
				+			void (*callback)(struct starputop_param_t*))
				+{
				+	starputop_param * entry;
				+
				+	STARPU_ASSERT(!starpu_top_status_get());
				+
				+	entry = malloc(sizeof(starputop_param));
				+	entry->next = NULL;
				+	entry->type = STARPUTOP_PARAM_ENUM;
				+	entry->name = param_name;
				+	entry->value = (void*)parameter_field;
				+	entry->enum_values = values;
				+	entry->nb_values = nb_values;
				+	entry->callback = callback;
				+	entry->id = starputop_param_cpt++;
				+
				+	starputop_enqueue_param(entry);
				+	return entry;
				+}
			
 
				+/*********************************************
			
 
				+*****************UPDATE FUNC******************
			
 
				+**********************************************/
			
 
				+
			
 
				+/*
				+ * Queue a "U;<id>;<0|1>;<timestamp>\n" update for a boolean data.
				+ * Nothing is sent when StarPU-Top is off or when the data is not active.
				+ */
				+void starputop_update_data_boolean(const starputop_data* data, int value){
				+	size_t size;
				+	char*message;
				+
				+	if (!starpu_top_status_get())
				+		return;
				+	if(!data->active)
				+		return;
				+
				+	size = 256+strlen(data->name);
				+	message = malloc(size);
				+	//current_timestamp() returns unsigned long long, hence %llu
				+	snprintf(message, size,
				+			"U;%d;%d;%llu\n",
				+			data->id,
				+			(value?1:0),
				+			current_timestamp());
				+	starputop_message_add(starputop_mt,message);
				+}
			
 
				+/*
				+ * Queue a "U;<id>;<value>;<timestamp>\n" update for an integer data.
				+ * Nothing is sent when StarPU-Top is off or when the data is not active.
				+ */
				+void starputop_update_data_integer(const starputop_data* data,int value){
				+	size_t size;
				+	char*message;
				+
				+	if (!starpu_top_status_get())
				+		return;
				+	if(!data->active)
				+		return;
				+
				+	size = 256+strlen(data->name);
				+	message = malloc(size);
				+	//current_timestamp() returns unsigned long long, hence %llu
				+	snprintf(message, size,
				+			"U;%d;%d;%llu\n",
				+			data->id,
				+			value,
				+			current_timestamp());
				+	starputop_message_add(starputop_mt,message);
				+}
			
 
				+/*
				+ * Queue a "U;<id>;<value>;<timestamp>\n" update for a floating-point data.
				+ * Nothing is sent when StarPU-Top is off or when the data is not active.
				+ *
				+ * A double printed with %f can take up to 317 characters, so the buffer is
				+ * sized for that and the write is bounded with snprintf.
				+ */
				+void starputop_update_data_float(const starputop_data* data, double value){
				+	size_t size;
				+	char*message;
				+
				+	if (!starpu_top_status_get())
				+		return;
				+	if(!data->active)
				+		return;
				+
				+	size = 512+strlen(data->name);
				+	message = malloc(size);
				+	//current_timestamp() returns unsigned long long, hence %llu
				+	snprintf(message, size,
				+			"U;%d;%f;%llu\n",
				+			data->id, value,
				+			current_timestamp());
				+	starputop_message_add(starputop_mt,message);
				+}
			
 
				+/*
				+ * Queue a "SET;<id>;<value>;<timestamp>\n" message carrying the current
				+ * value of `param` (read through param->value as an int, or as a double for
				+ * float parameters). Nothing is sent when StarPU-Top is off.
				+ *
				+ * The buffer is sized for a double printed with %f (up to 317 characters)
				+ * and the write is bounded with snprintf. For a parameter type that is not
				+ * handled below nothing is queued, instead of queuing uninitialized memory.
				+ */
				+void starputop_update_parameter(const starputop_param* param){
				+	enum { message_size = 384 };
				+	char*message;
				+
				+	if (!starpu_top_status_get())
				+		return;
				+
				+	switch(param->type)
				+	{
				+		case STARPUTOP_PARAM_BOOLEAN:
				+		case STARPUTOP_PARAM_INTEGER:
				+		case STARPUTOP_PARAM_ENUM:
				+			message = malloc(message_size);
				+			//current_timestamp() returns unsigned long long, hence %llu
				+			snprintf(message, message_size,
				+					"SET;%d;%d;%llu\n",
				+					param->id,
				+					*((int*)param->value),
				+					current_timestamp());
				+			break;
				+
				+		case STARPUTOP_PARAM_FLOAT:
				+			message = malloc(message_size);
				+			snprintf(message, message_size,
				+					"SET;%d;%f;%llu\n",
				+					param->id,
				+					*((double*)param->value),
				+					current_timestamp());
				+			break;
				+
				+		default:
				+			return;
				+	}
				+
				+	starputop_message_add(starputop_mt,message);
				+}
			
 
				+
			
 
				+/*********************************************
			
 
				+*****************DEBUG FUNC******************
			
 
				+**********************************************/
			
 
				+
			
 
				+/*
				+ * Queue a "MESSAGE;<text>\n" debug line. Nothing is sent unless
				+ * starputop_debug_on is set. Every ';' and '\n' of debug_message is
				+ * prefixed with a backslash.
				+ */
				+void starputop_debug_log(const char* debug_message)
				+{
				+	const char* src;
				+	char* message;
				+	char* dst;
				+
				+	if(!starputop_debug_on)
				+		return;
				+
				+	//worst case every character is escaped (2*strlen); the extra 16 bytes
				+	//hold the prefix, the final newline and the terminator
				+	message = malloc(strlen(debug_message)*2+16);
				+	strcpy(message, "MESSAGE;");
				+	dst = message+strlen("MESSAGE;");
				+
				+	for(src = debug_message; *src != '\0'; src++)
				+	{
				+		if(*src == ';' || *src == '\n')
				+			*dst++ = '\\';
				+		*dst++ = *src;
				+	}
				+	*dst++ = '\n';
				+	*dst = '\0';
				+
				+	starputop_message_add(starputop_mt,message);
				+}
			
 
				+/*
				+ * Queue a "LOCK;<text>\n" debug line, then block the calling thread on
				+ * starputop_wait_for_continue_cond. Does nothing unless starputop_debug_on
				+ * is set. Every ';' and '\n' of debug_message is prefixed with a backslash.
				+ *
				+ * NOTE(review): the condition variable is waited on once, with no predicate
				+ * re-checked around pthread_cond_wait -- confirm that a spurious wakeup or a
				+ * signal sent before the wait starts is acceptable here.
				+ */
				+void starputop_debug_lock(const char* debug_message)
				+{
				+	const char* src;
				+	char* message;
				+	char* dst;
				+
				+	if(!starputop_debug_on)
				+		return;
				+
				+	//worst case every character is escaped (2*strlen); the extra 16 bytes
				+	//hold the prefix, the final newline and the terminator
				+	message = malloc(strlen(debug_message)*2+16);
				+	strcpy(message, "LOCK;");
				+	dst = message+strlen("LOCK;");
				+
				+	for(src = debug_message; *src != '\0'; src++)
				+	{
				+		if(*src == ';' || *src == '\n')
				+			*dst++ = '\\';
				+		*dst++ = *src;
				+	}
				+	dst[0] = '\n';
				+	dst[1] = '\0';
				+
				+	starputop_message_add(starputop_mt,message);
				+
				+	//stay here until starputop_wait_for_continue_cond is signalled
				+	pthread_mutex_lock(&starputop_wait_for_continue_mutex);
				+	pthread_cond_wait(&starputop_wait_for_continue_cond,&starputop_wait_for_continue_mutex);
				+	pthread_mutex_unlock(&starputop_wait_for_continue_mutex);
				+}
			
 
				+
			
 
				+ 
			
 
				+ 
			
 
				+/********************************************
			
 
				+ **************TIME FUNCTION****************
			
 
				+ *******************************************/
			
 
				+
			
 
				+/* Read the clock through starpu_clock_gettime() and return that instant
				+ * converted by starpu_timing_timespec_to_ms(). */
				+unsigned long long int current_timestamp()
				+{
				+	struct timespec instant;
				+	unsigned long long int stamp;
				+
				+	starpu_clock_gettime(&instant);
				+	stamp = starpu_timing_timespec_to_ms(&instant);
				+	return stamp;
				+}
			
 
				+
			
 
				+/*
				+ * Convert a timespec to whole milliseconds (the sub-millisecond part is
				+ * dropped).
				+ *
				+ * Integer arithmetic is used on purpose: with double arithmetic, a large
				+ * tv_sec leaves too few mantissa bits for the fraction, and a value just
				+ * below a millisecond boundary could be rounded up to the next millisecond.
				+ */
				+unsigned long long starpu_timing_timespec_to_ms(const struct timespec *ts)
				+{
				+	const unsigned long long from_seconds = (unsigned long long)ts->tv_sec * 1000ULL;
				+	const unsigned long long from_nanoseconds = (unsigned long long)ts->tv_nsec / 1000000ULL;
				+
				+	return from_seconds + from_nanoseconds;
				+}
			
 
				+
			
 
				+/********************************************
			
 
				+ **************INPUT PROCESSING**************
			
 
				+ *******************************************/
			
 
				+
			
 
				+/*
				+ * Classify an incoming UI message by its leading keyword.
				+ * Each keyword is compared over exactly its own length, so every command is
				+ * matched as a prefix of `message`. Returns TOP_TYPE_UNKNOW when no keyword
				+ * matches.
				+ */
				+starputop_message_type starputop_get_message_type(const char* message)
				+{
				+	if(!strncmp("GO\n", message,3))
				+		return TOP_TYPE_GO;
				+	if(!strncmp("SET;", message,4))
				+		return TOP_TYPE_SET;
				+	//"STEP\n" is 5 characters long; a longer count made this the only
				+	//keyword that had to match the whole message
				+	if(!strncmp("STEP\n", message,5))
				+		return TOP_TYPE_CONTINUE;
				+	if(!strncmp("ENABLE;", message,7))
				+		return TOP_TYPE_ENABLE;
				+	if(!strncmp("DISABLE;", message,8))
				+		return TOP_TYPE_DISABLE;
				+	if(!strncmp("DEBUG;", message,6))
				+		return TOP_TYPE_DEBUG;
				+	return TOP_TYPE_UNKNOW;
				+}
			
 
				+
			
 
				+
			
 
				+/* Post the starputop_wait_for_go semaphore (the one
				+ * starputop_init_and_wait() waits on), then log that StarPU started. */
				+void starputop_unlock_starpu()
				+{
				+	sem_t* const go = &starputop_wait_for_go;
				+
				+	sem_post(go);
				+	printf("%s:%d starpu started\n", __FILE__, __LINE__);
				+}
			
 
				+
			
 
				+/*
				+ * Handle an "ENABLE;<id>\n" / "DISABLE;<id>\n" message: set the `active`
				+ * flag of the data whose id follows the first ';'.
				+ * `message` is modified in place (the newline after the id is overwritten).
				+ *
				+ * The message comes from the network, so it is validated: a missing ';', an
				+ * id that is not a number, an id outside [0, starputop_data_cpt) or a data
				+ * table that has not been built yet are reported and ignored instead of
				+ * dereferencing a bad pointer.
				+ */
				+void starputop_change_data_active(char* message, int active)
				+{
				+	char* debut;
				+	char* fin;
				+	char* end_of_id;
				+	long data_id;
				+
				+	debut = strchr(message, ';');
				+	if(debut == NULL)
				+	{
				+		printf("%s:%d malformed data message\n", __FILE__, __LINE__);
				+		return;
				+	}
				+	debut++;
				+
				+	//cut the message at the end of the line when there is one
				+	fin = strchr(debut, '\n');
				+	if(fin != NULL)
				+		*fin = '\0';
				+
				+	data_id = strtol(debut, &end_of_id, 10);
				+	if(end_of_id == debut
				+		|| data_id < 0
				+		|| (unsigned long)data_id >= starputop_data_cpt
				+		|| starputop_datas == NULL)
				+	{
				+		printf("%s:%d invalid data id\n", __FILE__, __LINE__);
				+		return;
				+	}
				+
				+	printf("%s:%d data %d %s\n", __FILE__, __LINE__, (int)data_id, active ? "ENABLED" : "DISABLE");
				+	starputop_datas[data_id]->active = active;
				+}
			
 
				+
			
 
				+/*
				+ * Handle a "SET;<id>;<value>..." message: store <value> through the `value`
				+ * pointer of parameter <id> (as an int for boolean, integer and enum
				+ * parameters, as a double for float parameters), then call the parameter's
				+ * callback when it has one.
				+ *
				+ * The message comes from the network, so it is validated: a missing ';', an
				+ * id that is not a number, an id outside [0, starputop_param_cpt) or a
				+ * parameter table that has not been built yet are reported and ignored
				+ * instead of dereferencing a bad pointer.
				+ */
				+void starputop_change_parameter_value(const char* message){
				+	const char* id_text;
				+	const char* value_text;
				+	char* end_of_id;
				+	long param_id;
				+	starputop_param* param;
				+
				+	id_text = strchr(message, ';');
				+	if(id_text == NULL)
				+	{
				+		printf("%s:%d malformed parameter message\n", __FILE__, __LINE__);
				+		return;
				+	}
				+	id_text++;
				+
				+	param_id = strtol(id_text, &end_of_id, 10);
				+	if(end_of_id == id_text
				+		|| param_id < 0
				+		|| (unsigned long)param_id >= starputop_param_cpt
				+		|| starputop_params == NULL)
				+	{
				+		printf("%s:%d invalid parameter id\n", __FILE__, __LINE__);
				+		return;
				+	}
				+
				+	//the value starts after the ';' that follows the id
				+	value_text = strchr(end_of_id, ';');
				+	if(value_text == NULL)
				+	{
				+		printf("%s:%d malformed parameter message\n", __FILE__, __LINE__);
				+		return;
				+	}
				+	value_text++;
				+
				+	param = starputop_params[param_id];
				+	switch(param->type)
				+	{
				+		case STARPUTOP_PARAM_BOOLEAN:
				+		case STARPUTOP_PARAM_INTEGER:
				+		case STARPUTOP_PARAM_ENUM:
				+			*(int*)param->value = atoi(value_text);
				+		break;
				+
				+		case STARPUTOP_PARAM_FLOAT:
				+			*(double*)param->value = atof(value_text);
				+		break;
				+	}
				+	if(param->callback != NULL)
				+		param->callback(param);
				+}
			
 
				+
			
 
				+/*
				+ * Handle a "DEBUG;<state>" message: debug mode is switched on when the text
				+ * after the first ';' starts with "ON" and off otherwise. A copy of the
				+ * received message is then queued on the outgoing message queue.
				+ */
				+void starputop_change_debug_mode(const char*message)
				+{
				+	const char* state = strstr(message, ";")+1;
				+	const int enable = (strncmp("ON",state, 2) == 0);
				+	size_t length;
				+	char * echo;
				+
				+	if(enable)
				+	{
				+		starputop_debug_on = 1;
				+		printf("%s:%d debug is now ON\n", __FILE__, __LINE__);
				+	}
				+	else
				+	{
				+		starputop_debug_on = 0;
				+		printf("%s:%d debug is now OFF\n", __FILE__, __LINE__);
				+	}
				+
				+	//duplicate the message, terminator included
				+	length = strlen(message);
				+	echo = malloc(length+1);
				+	memcpy(echo, message, length+1);
				+	starputop_message_add(starputop_mt,echo);
				+}
			
 
				+
			
 
				+/*
				+ * Signal starputop_wait_for_continue_cond, the condition variable that
				+ * starputop_debug_lock() waits on, so that a thread stopped in debug state
				+ * can go on.
				+ */
				+void starputop_debug_next_step()
				+{
				+	pthread_cond_t* const resume = &starputop_wait_for_continue_cond;
				+
				+	pthread_cond_signal(resume);
				+}
			
 
				+
			
 
				+
			
 
				+/*
				+ * Dispatch one message received from the UI to its handler, according to
				+ * the type returned by starputop_get_message_type(). A message of any other
				+ * type is only logged.
				+ */
				+void starputop_process_input_message(char *buffer)
				+{
				+	const starputop_message_type kind = starputop_get_message_type(buffer);
				+
				+	if(kind == TOP_TYPE_GO)
				+		starputop_unlock_starpu();
				+	else if(kind == TOP_TYPE_ENABLE)
				+		starputop_change_data_active(buffer, 1);
				+	else if(kind == TOP_TYPE_DISABLE)
				+		starputop_change_data_active(buffer, 0);
				+	else if(kind == TOP_TYPE_SET)
				+		starputop_change_parameter_value(buffer);
				+	else if(kind == TOP_TYPE_DEBUG)
				+		starputop_change_debug_mode(buffer);
				+	else if(kind == TOP_TYPE_CONTINUE)
				+		starputop_debug_next_step();
				+	else
				+		printf("%s:%d unknow message : '%s'\n", __FILE__, __LINE__, buffer);
				+}
			
 
				+
			
 
				+
			
--- a/src/top/starputop_connection.c
+++ b/src/top/starputop_connection.c
@@ -0,0 +1,168 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2011 William Braik, Yann Courtois, Jean-Marie Couteyen, Anthony
			
 
				+ * Roy
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu_config.h>
			
 
				+
			
 
				+#ifdef STARPU_HAVE_WINDOWS
			
 
				+#  include <w32api.h>
			
 
				+#  define WINVER WindowsXP
			
 
				+#  include <ws2tcpip.h>
			
 
				+#else
			
 
				+#  include <sys/socket.h>
			
 
				+#  include <netinet/in.h>
			
 
				+#  include <netdb.h>
			
 
				+#endif
			
 
				+
			
 
				+#include <top/starputop_connection.h>
			
 
				+#include <top/starputop_message_queue.h>
			
 
				+#include <starpu_top.h>
			
 
				+#include <pthread.h>
			
 
				+#include <stdio.h>
			
 
				+#include <string.h>
			
 
				+#include <sys/types.h>
			
 
				+#include <stdlib.h>
			
 
				+#include <unistd.h>
			
 
				+
			
 
				+const char *STARPUTOP_PORT = "2011";
			
 
				+const int STARPUTOP_BUFFER_SIZE=1024;
			
 
				+
			
 
				+extern starputop_message_queue_t*  starputop_mt;
			
 
				+
			
 
				+//client socket streams (read / write) obtained with fdopen
			
 
				+FILE* starputop_socket_fd_read;
			
 
				+FILE* starputop_socket_fd_write;
			
 
				+//client socket (file descriptor)
			
 
				+int starputop_socket_fd;
			
 
				+
			
 
				+
			
 
				+void * message_from_ui(void * p)
			
 
				+{
			
 
				+	(void) p;
			
 
				+	char str[STARPUTOP_BUFFER_SIZE];
			
 
				+	while(1)
			
 
				+	{
			
 
				+		char * check=fgets (str, STARPUTOP_BUFFER_SIZE, starputop_socket_fd_read);
			
 
				+
			
 
				+		printf("Message from UI : %s",str);
			
 
				+		if (check)
			
 
				+		{
			
 
				+			starputop_process_input_message(str);
			
 
				+		}
			
 
				+		else
			
 
				+		{
			
 
				+			fprintf(stderr,"Connection dropped\n");
			
 
				+			//unlocking StarPU.
			
 
				+			starputop_process_input_message("GO\n");
			
 
				+			starputop_process_input_message("DEBUG;OFF\n");
			
 
				+			starputop_process_input_message("STEP\n");
			
 
				+			return NULL;
			
 
				+		}
			
 
				+	}
			
 
				+	return NULL;
			
 
				+}
			
 
				+
			
 
				+void * message_to_ui(void * p)
			
 
				+{
			
 
				+	(void) p;
			
 
				+	while(1)
			
 
				+	{
			
 
				+		char* message = starputop_message_remove(starputop_mt);
			
 
				+		int len=strlen(message);
			
 
				+		int check=fwrite(message, sizeof(char), len, starputop_socket_fd_write);
			
 
				+		int check2=fflush(starputop_socket_fd_write);
			
 
				+		free(message);
			
 
				+		if (check!=len || check2==EOF )
			
 
				+		{
			
 
				+			fprintf(stderr,"Connection dropped : message no longer send\n");
			
 
				+			while(1)
			
 
				+			{
			
 
				+				message=starputop_message_remove(starputop_mt);
			
 
				+				free(message);
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+	return NULL;
			
 
				+}
			
 
				+
			
 
				+void starputop_communications_threads_launcher()
			
 
				+{
			
 
				+	pthread_t from_ui;
			
 
				+	pthread_t to_ui;
			
 
				+	pthread_attr_t threads_attr;
			
 
				+
			
 
				+  
			
 
				+	//Connection to UI & Socket Initilization
			
 
				+	printf("%s:%d Connection to UI initilization\n",__FILE__, __LINE__);
			
 
				+	struct sockaddr_storage from;
			
 
				+	struct addrinfo req, *ans;
			
 
				+	int code;
			
 
				+	req.ai_flags = AI_PASSIVE;
			
 
				+	req.ai_family = PF_UNSPEC;            
			
 
				+	req.ai_socktype = SOCK_STREAM;
			
 
				+	req.ai_protocol = 0;  
			
 
				+  
			
 
				+	if ((code = getaddrinfo(NULL, STARPUTOP_PORT, &req, &ans)) != 0)
			
 
				+	{
			
 
				+		fprintf(stderr, " getaddrinfo failed %d\n", code);
			
 
				+		exit(EXIT_FAILURE);
			
 
				+   	}
			
 
				+  	int sock=socket(ans->ai_family, ans->ai_socktype, ans->ai_protocol);
			
 
				+	int optval = 1;
			
 
				+	setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &optval, sizeof(optval));
			
 
				+
			
 
				+	if (bind(sock, ans->ai_addr, ans->ai_addrlen) < 0)
			
 
				+	{
			
 
				+		perror("bind");
			
 
				+		exit(EXIT_FAILURE);
			
 
				+	}
			
 
				+
			
 
				+	listen(sock, 2);
			
 
				+
			
 
				+	socklen_t len = sizeof(from);
			
 
				+
			
 
				+   	if ((starputop_socket_fd=accept(sock, (struct sockaddr *) &from, &len)) ==-1)
			
 
				+	{
			
 
				+		fprintf(stderr, "accept error\n");
			
 
				+		perror("accept");
			
 
				+		exit(EXIT_FAILURE);
			
 
				+	}
			
 
				+	
			
 
				+	if ( (starputop_socket_fd_read=fdopen(starputop_socket_fd, "r")) == NULL)
			
 
				+	{
			
 
				+		perror("fdopen");
			
 
				+		exit(EXIT_FAILURE);
			
 
				+	}
			
 
				+
			
 
				+	starputop_socket_fd=dup(starputop_socket_fd);
			
 
				+	
			
 
				+	if ((starputop_socket_fd_write=fdopen(starputop_socket_fd, "w")) == NULL)
			
 
				+	{
			
 
				+		perror("fdopen");
			
 
				+		exit(EXIT_FAILURE);
			
 
				+	}
			
 
				+	
			
 
				+	close(sock);
			
 
				+	
			
 
				+	//Threads creation
			
 
				+	fprintf(stderr,"Threads Creation\n");
			
 
				+	pthread_attr_init(&threads_attr);
			
 
				+	pthread_attr_setdetachstate(&threads_attr, PTHREAD_CREATE_DETACHED);
			
 
				+	
			
 
				+	pthread_create(&from_ui, &threads_attr, message_from_ui, NULL);
			
 
				+	pthread_create(&to_ui, &threads_attr, message_to_ui, NULL);
			
 
				+}
			
 
				+
			
--- a/src/top/starputop_connection.h
+++ b/src/top/starputop_connection.h
@@ -0,0 +1,44 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2011 William Braik, Yann Courtois, Jean-Marie Couteyen, Anthony
			
 
				+ * Roy
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __STARPUTOP_CONNECTION_H__
			
 
				+#define __STARPUTOP_CONNECTION_H__
			
 
				+
			
 
				+#include <stdlib.h>
			
 
				+#include <top/starputop_message_queue.h>
			
 
				+#include <starpu_top.h>
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+extern "C" {
			
 
				+#endif
			
 
				+  extern starputop_message_queue_t*  starputop_mt;
			
 
				+
			
 
				+/*
			
 
				+ * This function initializes the two communication threads.
			
 
				+ * It initializes the connection and then launches the threads.
			
 
				+ * The function waits for the UI to connect before launching the threads.
			
 
				+ * About mt : mt MUST be allocated before call. 
			
 
				+ * All messages in the queue are freed after use.
			
 
				+ */
			
 
				+  void starputop_communications_threads_launcher();
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+#endif // __STARPUTOP_CONNECTION_H__
			
 
				+
			
--- a/src/top/starputop_message_queue.c
+++ b/src/top/starputop_message_queue.c
@@ -0,0 +1,109 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2011 William Braik, Yann Courtois, Jean-Marie Couteyen, Anthony
			
 
				+ * Roy
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include  "starputop_message_queue.h"
			
 
				+#include  <string.h>
			
 
				+#include  <stdio.h>
			
 
				+#include  <stdlib.h>
			
 
				+
			
 
				+//this global queue is used both by API and by network threads
			
 
				+starputop_message_queue_t*  starputop_mt = NULL;
			
 
				+
			
 
				+
			
 
				+/* Will always return the pointer to starputop_message_queue */
			
 
				+starputop_message_queue_t* starputop_message_add(
			
 
				+			starputop_message_queue_t* s,
			
 
				+			char* msg)
			
 
				+{
			
 
				+	starputop_message_queue_item_t* p = malloc( 1 * sizeof(*p) );
			
 
				+	pthread_mutex_lock(&(s->mutex));
			
 
				+	if( NULL == p )
			
 
				+	{
			
 
				+		fprintf(stderr, "IN %s, %s: malloc() failed\n", __FILE__, "list_add");
			
 
				+		pthread_mutex_unlock(&(s->mutex));
			
 
				+		return s;
			
 
				+	}
			
 
				+
			
 
				+	p->message = msg;
			
 
				+	p->next = NULL;
			
 
				+
			
 
				+	if( NULL == s )
			
 
				+	{
			
 
				+		printf("Queue not initialized\n");
			
 
				+		pthread_mutex_unlock(&(s->mutex));
			
 
				+		return s;
			
 
				+	}
			
 
				+	else if( NULL == s->head && NULL == s->tail )
			
 
				+	{
			
 
				+		/* printf("Empty list, adding p->num: %d\n\n", p->num);  */
			
 
				+		sem_post(&(s->semaphore));
			
 
				+		s->head = s->tail = p;
			
 
				+		pthread_mutex_unlock(&(s->mutex));
			
 
				+		return s;
			
 
				+	}
			
 
				+	else
			
 
				+	{
			
 
				+		/* printf("List not empty, adding element to tail\n"); */
			
 
				+		sem_post(&(s->semaphore));
			
 
				+		s->tail->next = p;
			
 
				+		s->tail = p;
			
 
				+	}
			
 
				+	pthread_mutex_unlock(&(s->mutex));
			
 
				+	return s;
			
 
				+}
			
 
				+
			
 
				+//this is a queue and it is FIFO, so we will always remove the first element
			
 
				+char* starputop_message_remove(starputop_message_queue_t* s)
			
 
				+{
			
 
				+	sem_wait(&(s->semaphore));
			
 
				+	starputop_message_queue_item_t* h = NULL;
			
 
				+	starputop_message_queue_item_t* p = NULL;
			
 
				+
			
 
				+	if( NULL == s )
			
 
				+	{
			
 
				+		printf("List is null\n");
			
 
				+		return NULL;
			
 
				+	}
			
 
				+	pthread_mutex_lock(&(s->mutex));
			
 
				+	h = s->head;
			
 
				+	p = h->next;
			
 
				+	char* value = h->message;
			
 
				+	free(h);
			
 
				+	s->head = p;
			
 
				+
			
 
				+	
			
 
				+	if( NULL == s->head )
			
 
				+		//the element tail was pointing to is free(), so we need an update
			
 
				+		s->tail = s->head;
			
 
				+	pthread_mutex_unlock(&(s->mutex));
			
 
				+	return value;
			
 
				+}
			
 
				+
			
 
				+
			
 
				+starputop_message_queue_t* starputop_message_queue_new(void)
			
 
				+{
			
 
				+	starputop_message_queue_t* p = malloc( 1 * sizeof(*p));
			
 
				+	if( NULL == p )
			
 
				+	{
			
 
				+		fprintf(stderr, "LINE: %d, malloc() failed\n", __LINE__);
			
 
				+	}
			
 
				+
			
 
				+	p->head = p->tail = NULL;
			
 
				+	sem_init(&(p->semaphore),0,0);
			
 
				+	pthread_mutex_init(&(p->mutex), NULL);
			
 
				+	return p;
			
 
				+}
			
--- a/src/top/starputop_message_queue.h
+++ b/src/top/starputop_message_queue.h
@@ -0,0 +1,50 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2011 William Braik, Yann Courtois, Jean-Marie Couteyen, Anthony
			
 
				+ * Roy
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <sys/types.h>
			
 
				+#include <semaphore.h> 
			
 
				+#include <pthread.h>
			
 
				+
			
 
				+#ifndef __STARPUTOP_MESSAGE_QUEUE_H__
			
 
				+#define __STARPUTOP_MESSAGE_QUEUE_H__
			
 
				+
			
 
				+typedef struct starputop_message_queue_item
			
 
				+{
			
 
				+	char *message;
			
 
				+	struct starputop_message_queue_item* next;
			
 
				+} starputop_message_queue_item_t;
			
 
				+
			
 
				+typedef struct starputop_message_queue
			
 
				+{
			
 
				+	struct starputop_message_queue_item* head;
			
 
				+	struct starputop_message_queue_item* tail;
			
 
				+	sem_t semaphore;
			
 
				+	pthread_mutex_t mutex;
			
 
				+} starputop_message_queue_t;
			
 
				+
			
 
				+
			
 
				+starputop_message_queue_t *starputop_message_add(
			
 
				+			starputop_message_queue_t*,
			
 
				+			char*);
			
 
				+
			
 
				+char* starputop_message_remove(starputop_message_queue_t*);
			
 
				+
			
 
				+starputop_message_queue_t* starputop_message_queue_new();
			
 
				+starputop_message_queue_t* starputop_message_queue_free(
			
 
				+			starputop_message_queue_t*);
			
 
				+
			
 
				+#endif
			
--- a/src/top/starputop_task.c
+++ b/src/top/starputop_task.c
@@ -0,0 +1,97 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2011 William Braik, Yann Courtois, Jean-Marie Couteyen, Anthony
			
 
				+ * Roy
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu_top.h>
			
 
				+#include <top/starputop_message_queue.h>
			
 
				+#include <top/starputop_connection.h>
			
 
				+#include <core/task.h>
			
 
				+#include <stdio.h>
			
 
				+#include <string.h>
			
 
				+#include <sys/time.h>
			
 
				+#include <common/timing.h>
			
 
				+
			
 
				+/********************************************
			
 
				+ **************TASK RELATED FUNCTIONS********
			
 
				+ *******************************************/
			
 
				+
			
 
				+void starputop_task_started(
			
 
				+			struct starpu_task *task, 
			
 
				+			int devid, 
			
 
				+			const struct timespec *ts)
			
 
				+{
			
 
				+	unsigned long long taskid = _starpu_get_job_associated_to_task(task)->job_id;
			
 
				+	STARPU_ASSERT(starpu_top_status_get());
			
 
				+	char *str = malloc(sizeof(char)*64);
			
 
				+	snprintf(str, 64,
			
 
				+				"START;%llu;%d;%llu\n",
			
 
				+				taskid, 
			
 
				+				devid, 
			
 
				+				starpu_timing_timespec_to_ms(ts));
			
 
				+
			
 
				+	starputop_message_add(starputop_mt, str);
			
 
				+}
			
 
				+
			
 
				+void starputop_task_ended(
			
 
				+			struct starpu_task *task, 
			
 
				+			int devid, 
			
 
				+			const struct timespec *ts)
			
 
				+{
			
 
				+	unsigned long long taskid = _starpu_get_job_associated_to_task(task)->job_id;
			
 
				+	(void) devid; //unused
			
 
				+	STARPU_ASSERT(starpu_top_status_get());
			
 
				+	char *str = malloc(sizeof(char)*64);
			
 
				+	snprintf(str, 64,
			
 
				+				"END;%llu;%llu\n", 
			
 
				+				taskid, 
			
 
				+				starpu_timing_timespec_to_ms(ts));
			
 
				+
			
 
				+	starputop_message_add(starputop_mt, str);
			
 
				+}
			
 
				+
			
 
				+void starputop_task_prevision_timespec(
			
 
				+			struct starpu_task *task,
			
 
				+			int devid, 
			
 
				+			const struct timespec* start, 
			
 
				+			const struct timespec* end)
			
 
				+{
			
 
				+	starputop_task_prevision(task, 
			
 
				+							devid, 
			
 
				+							starpu_timing_timespec_to_ms(start),
			
 
				+							starpu_timing_timespec_to_ms(end));
			
 
				+}
			
 
				+
			
 
				+void starputop_task_prevision(
			
 
				+			struct starpu_task *task, 
			
 
				+			int devid, 
			
 
				+			unsigned long long start, 
			
 
				+			unsigned long long end)
			
 
				+{
			
 
				+	unsigned long long taskid = _starpu_get_job_associated_to_task(task)->job_id;
			
 
				+	STARPU_ASSERT(starpu_top_status_get());
			
 
				+	struct timespec now;
			
 
				+	starpu_clock_gettime(&now);
			
 
				+	char * str=malloc(sizeof(char)*200);
			
 
				+	snprintf(str, 128, 
			
 
				+				"PREV;%llu;%d;%llu;%llu;%llu\n",
			
 
				+				taskid,
			
 
				+				devid,
			
 
				+				starpu_timing_timespec_to_ms(&now),
			
 
				+				start,
			
 
				+				end);
			
 
				+
			
 
				+	starputop_message_add(starputop_mt, str);
			
 
				+}
			
--- a/src/util/malloc.c
+++ b/src/util/malloc.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -31,7 +31,7 @@ struct malloc_pinned_codelet_struct {
 
				 #endif
			
 
				 
			
 
				 //#ifdef STARPU_USE_OPENCL
			
 
				-//static void malloc_pinned_opencl_codelet(void *buffers[] __attribute__((unused)), void *arg)
			
 
				+//static void malloc_pinned_opencl_codelet(void *buffers[] STARPU_ATTRIBUTE_UNUSED, void *arg)
			
 
				 //{
			
 
				 //	struct malloc_pinned_codelet_struct *s = arg;
			
 
				 //        //        *(s->ptr) = malloc(s->dim);
			
@@ -40,7 +40,7 @@ struct malloc_pinned_codelet_struct {
 
				 //#endif
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-static void malloc_pinned_cuda_codelet(void *buffers[] __attribute__((unused)), void *arg)
			
 
				+static void malloc_pinned_cuda_codelet(void *buffers[] STARPU_ATTRIBUTE_UNUSED, void *arg)
			
 
				 {
			
 
				 	struct malloc_pinned_codelet_struct *s = arg;
			
 
				 
			
@@ -67,7 +67,7 @@ static starpu_codelet malloc_pinned_cl = {
 
				 };
			
 
				 #endif
			
 
				 
			
 
				-int starpu_data_malloc_pinned_if_possible(void **A, size_t dim)
			
 
				+int starpu_malloc(void **A, size_t dim)
			
 
				 {
			
 
				 	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
			
 
				 		return -EDEADLK;
			
@@ -132,7 +132,7 @@ int starpu_data_malloc_pinned_if_possible(void **A, size_t dim)
 
				 }
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-static void free_pinned_cuda_codelet(void *buffers[] __attribute__((unused)), void *arg)
			
 
				+static void free_pinned_cuda_codelet(void *buffers[] STARPU_ATTRIBUTE_UNUSED, void *arg)
			
 
				 {
			
 
				 	cudaError_t cures;
			
 
				 	cures = cudaFreeHost(arg);
			
@@ -142,7 +142,7 @@ static void free_pinned_cuda_codelet(void *buffers[] __attribute__((unused)), vo
 
				 #endif
			
 
				 
			
 
				 //#ifdef STARPU_USE_OPENCL
			
 
				-//static void free_pinned_opencl_codelet(void *buffers[] __attribute__((unused)), void *arg)
			
 
				+//static void free_pinned_opencl_codelet(void *buffers[] STARPU_ATTRIBUTE_UNUSED, void *arg)
			
 
				 //{
			
 
				 //        //        free(arg);
			
 
				 //        int err = clReleaseMemObject(arg);
			
@@ -166,7 +166,7 @@ static starpu_codelet free_pinned_cl = {
 
				 };
			
 
				 #endif
			
 
				 
			
 
				-int starpu_data_free_pinned_if_possible(void *A)
			
 
				+int starpu_free(void *A)
			
 
				 {
			
 
				 	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
			
 
				 		return -EDEADLK;
			
--- a/src/util/starpu_cublas.c
+++ b/src/util/starpu_cublas.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2009-2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -20,16 +20,18 @@
 
				 #include <common/config.h>
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-static void init_cublas_func(void *args __attribute__((unused)))
			
 
				+static void init_cublas_func(void *args STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	cublasStatus cublasst = cublasInit();
			
 
				 	if (STARPU_UNLIKELY(cublasst))
			
 
				 		STARPU_CUBLAS_REPORT_ERROR(cublasst);
			
 
				 
			
 
				+#if CUDA_VERSION >= 3010
			
 
				 	cublasSetKernelStream(starpu_cuda_get_local_stream());
			
 
				+#endif
			
 
				 }
			
 
				 
			
 
				-static void shutdown_cublas_func(void *args __attribute__((unused)))
			
 
				+static void shutdown_cublas_func(void *args STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	cublasShutdown();
			
 
				 }
			
--- a/src/util/starpu_insert_task.c
+++ b/src/util/starpu_insert_task.c
@@ -62,7 +62,7 @@ void starpu_unpack_cl_args(void *_cl_arg, ...)
 
				 	va_end(varg_list);
			
 
				 }
			
 
				 
			
 
				-void starpu_insert_task(starpu_codelet *cl, ...)
			
 
				+int starpu_insert_task(starpu_codelet *cl, ...)
			
 
				 {
			
 
				 	va_list varg_list;
			
 
				 
			
@@ -77,5 +77,5 @@ void starpu_insert_task(starpu_codelet *cl, ...)
 
				 
			
 
				 	va_start(varg_list, cl);
			
 
				         struct starpu_task *task = starpu_task_create();
			
 
				-        _starpu_insert_task_create_and_submit(arg_buffer, cl, &task, varg_list);
			
 
				+        return _starpu_insert_task_create_and_submit(arg_buffer, cl, &task, varg_list);
			
 
				 }
			
--- a/src/util/starpu_insert_task_utils.c
+++ b/src/util/starpu_insert_task_utils.c
@@ -27,6 +27,7 @@ struct insert_task_cb_wrapper {
 
				 	void *arg_stack;
			
 
				 };
			
 
				 
			
 
				+static
			
 
				 void starpu_task_insert_callback_wrapper(void *_cl_arg_wrapper)
			
 
				 {
			
 
				 	struct insert_task_cb_wrapper *cl_arg_wrapper = _cl_arg_wrapper;
			
@@ -68,9 +69,12 @@ size_t _starpu_insert_task_get_arg_size(va_list varg_list)
 
				 		else if (arg_type==STARPU_PRIORITY) {
			
 
				 			va_arg(varg_list, int);
			
 
				 		}
			
 
				-		else if (arg_type==STARPU_EXECUTE) {
			
 
				+		else if (arg_type==STARPU_EXECUTE_ON_NODE) {
			
 
				 			va_arg(varg_list, int);
			
 
				 		}
			
 
				+		else if (arg_type==STARPU_EXECUTE_ON_DATA) {
			
 
				+			va_arg(varg_list, starpu_data_handle);
			
 
				+		}
			
 
				 	}
			
 
				 
			
 
				 	va_end(varg_list);
			
@@ -122,18 +126,23 @@ int _starpu_pack_cl_args(size_t arg_buffer_size, char **arg_buffer, va_list varg
 
				 		{
			
 
				 			va_arg(varg_list, int);
			
 
				 		}
			
 
				-		else if (arg_type==STARPU_EXECUTE) {
			
 
				+		else if (arg_type==STARPU_EXECUTE_ON_NODE) {
			
 
				 			va_arg(varg_list, int);
			
 
				 		}
			
 
				+		else if (arg_type==STARPU_EXECUTE_ON_DATA) {
			
 
				+			va_arg(varg_list, starpu_data_handle);
			
 
				+		}
			
 
				 	}
			
 
				 
			
 
				 	(*arg_buffer)[0] = nargs;
			
 
				 	va_end(varg_list);
			
 
				 	return 0;
			
 
				 }
			
 
				-static void _starpu_prepare_task(char *arg_buffer, starpu_codelet *cl, struct starpu_task **task, va_list varg_list, unsigned *ctx) {
			
 
				-        int arg_type;
			
 
				+
			
 
				+int _starpu_insert_task_create_and_submit(char *arg_buffer, starpu_codelet *cl, struct starpu_task **task, va_list varg_list) {
			
 
				+    int arg_type;
			
 
				 	unsigned current_buffer = 0;
			
 
				+	unsigned ctx = 0;
			
 
				 
			
 
				 	struct insert_task_cb_wrapper *cl_arg_wrapper = malloc(sizeof(struct insert_task_cb_wrapper));
			
 
				 	STARPU_ASSERT(cl_arg_wrapper);
			
@@ -176,9 +185,13 @@ static void _starpu_prepare_task(char *arg_buffer, starpu_codelet *cl, struct st
 
				 			int prio = va_arg(varg_list, int); 
			
 
				 			(*task)->priority = prio;
			
 
				 		}
			
 
				-		else if (arg_type==STARPU_EXECUTE) {
			
 
				+		else if (arg_type==STARPU_EXECUTE_ON_NODE) {
			
 
				 			va_arg(varg_list, int);
			
 
				 		}
			
 
				+		else if (arg_type==STARPU_EXECUTE_ON_DATA) {
			
 
				+			va_arg(varg_list, starpu_data_handle);
			
 
				+		}
			
 
				+
			
 
				 		else if (arg_type==STARPU_CTX) {
			
 
				 			*ctx = va_arg(varg_list, unsigned);
			
 
				 		}
			
@@ -196,16 +209,11 @@ static void _starpu_prepare_task(char *arg_buffer, starpu_codelet *cl, struct st
 
				 	 * application's callback, if any. */
			
 
				 	(*task)->callback_func = starpu_task_insert_callback_wrapper;
			
 
				 	(*task)->callback_arg = cl_arg_wrapper;
			
 
				-}
			
 
				 
			
 
				-int _starpu_insert_task_create_and_submit(char *arg_buffer, starpu_codelet *cl, struct starpu_task **task, va_list varg_list) {
			
 
				-	unsigned ctx = 0;
			
 
				-	_starpu_prepare_task(arg_buffer, cl, task, varg_list, &ctx);
			
 
				 	 int ret = ctx == 0 ? starpu_task_submit(*task) : starpu_task_submit_to_ctx(*task, ctx);
			
 
				 
			
 
				 	if (STARPU_UNLIKELY(ret == -ENODEV))
			
 
				-          fprintf(stderr, "No one can execute task %p wih cl %p (symbol %s)\n", *task, (*task)->cl, ((*task)->cl->model && (*task)->cl->model->symbol)?(*task)->cl->model->symbol:"none");
			
 
				+          fprintf(stderr, "submission of task %p wih codelet %p failed (symbol `%s')\n", *task, (*task)->cl, ((*task)->cl->model && (*task)->cl->model->symbol)?(*task)->cl->model->symbol:"none");
			
 
				 
			
 
				-	STARPU_ASSERT(!ret);
			
 
				         return ret;
			
 
				 }